From 71261970002bd3b3c896f33fb95cb2c21a96c971 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 24 May 2023 12:36:07 -0400 Subject: [PATCH 01/59] pagectl: refactor ctl and support dump kv in delta (#4268) This PR refactors the original page_binutils with a single tool pagectl, use clap derive for better command line parsing, and adds the dump kv tool to extract information from delta file. This helps me better understand what's inside the page server. We can add support for other types of file and more functionalities in the future. --------- Signed-off-by: Alex Chi --- Cargo.lock | 15 ++ Cargo.toml | 1 + Dockerfile | 6 +- pageserver/ctl/Cargo.toml | 18 ++ .../{src/bin => ctl/src}/draw_timeline_dir.rs | 4 +- .../bin => ctl/src}/layer_map_analyzer.rs | 33 ++-- pageserver/ctl/src/layers.rs | 169 +++++++++++++++++ pageserver/ctl/src/main.rs | 179 ++++++++++++++++++ pageserver/src/bin/pageserver_binutils.rs | 174 ----------------- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/storage_layer.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 2 +- 13 files changed, 404 insertions(+), 205 deletions(-) create mode 100644 pageserver/ctl/Cargo.toml rename pageserver/{src/bin => ctl/src}/draw_timeline_dir.rs (97%) rename pageserver/{src/bin => ctl/src}/layer_map_analyzer.rs (92%) create mode 100644 pageserver/ctl/src/layers.rs create mode 100644 pageserver/ctl/src/main.rs delete mode 100644 pageserver/src/bin/pageserver_binutils.rs diff --git a/Cargo.lock b/Cargo.lock index 2223453a0866..6501d9557d1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2587,6 +2587,21 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" +[[package]] +name = "pagectl" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "clap 4.2.2", + "git-version", + "pageserver", + "postgres_ffi", + "svg_fmt", + "utils", + "workspace_hack", +] + [[package]] name = "pageserver" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index 789545984172..19d178385153 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,7 @@ members = [ "compute_tools", "control_plane", "pageserver", + "pageserver/ctl", "proxy", "safekeeper", "storage_broker", diff --git a/Dockerfile b/Dockerfile index 736465464178..9467e41ae40f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -47,8 +47,7 @@ RUN set -e \ && mold -run cargo build \ --bin pg_sni_router \ --bin pageserver \ - --bin pageserver_binutils \ - --bin draw_timeline_dir \ + --bin pagectl \ --bin safekeeper \ --bin storage_broker \ --bin proxy \ @@ -73,8 +72,7 @@ RUN set -e \ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin -COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml new file mode 100644 index 000000000000..89e0d0486e85 --- 
/dev/null +++ b/pageserver/ctl/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "pagectl" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow.workspace = true +bytes.workspace = true +clap = { workspace = true, features = ["string"] } +git-version.workspace = true +pageserver = { path = ".." } +postgres_ffi.workspace = true +utils.workspace = true +svg_fmt.workspace = true +workspace_hack.workspace = true diff --git a/pageserver/src/bin/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs similarity index 97% rename from pageserver/src/bin/draw_timeline_dir.rs rename to pageserver/ctl/src/draw_timeline_dir.rs index da13ee452c2c..bfde5ba054ce 100644 --- a/pageserver/src/bin/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -12,7 +12,7 @@ //! Example use: //! ``` //! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \ -//! $ grep "__" | cargo run --release --bin draw_timeline_dir > out.svg +//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg //! $ firefox out.svg //! ``` //! @@ -62,7 +62,7 @@ fn parse_filename(name: &str) -> (Range, Range) { (keys, lsns) } -fn main() -> Result<()> { +pub fn main() -> Result<()> { // Parse layer filenames from stdin let mut ranges: Vec<(Range, Range)> = vec![]; let stdin = io::stdin(); diff --git a/pageserver/src/bin/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs similarity index 92% rename from pageserver/src/bin/layer_map_analyzer.rs rename to pageserver/ctl/src/layer_map_analyzer.rs index e7408794584c..f2ced6154f5e 100644 --- a/pageserver/src/bin/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -6,7 +6,7 @@ use anyhow::Result; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; -use std::{env, fs, path::Path, path::PathBuf, str, str::FromStr}; +use std::{fs, path::Path, str}; use pageserver::page_cache::PAGE_SZ; use pageserver::repository::{Key, KEY_SIZE}; @@ -18,12 +18,14 @@ use pageserver::virtual_file::VirtualFile; use utils::{bin_ser::BeSer, lsn::Lsn}; +use crate::AnalyzeLayerMapCmd; + const MIN_HOLE_LENGTH: i128 = (128 * 1024 * 1024 / PAGE_SZ) as i128; const DEFAULT_MAX_HOLES: usize = 10; /// Wrapper for key range to provide reverse ordering by range length for BinaryHeap #[derive(PartialEq, Eq)] -struct Hole(Range); +pub struct Hole(Range); impl Ord for Hole { fn cmp(&self, other: &Self) -> Ordering { @@ -39,11 +41,11 @@ impl PartialOrd for Hole { } } -struct LayerFile { - key_range: Range, - lsn_range: Range, - is_delta: bool, - holes: Vec, +pub(crate) struct LayerFile { + pub key_range: Range, + pub lsn_range: Range, + pub is_delta: bool, + pub holes: Vec, } impl LayerFile { @@ -67,7 +69,7 @@ impl LayerFile { } } -fn parse_filename(name: &str) -> Option { +pub(crate) fn parse_filename(name: &str) -> Option { let split: Vec<&str> = name.split("__").collect(); if split.len() != 2 { return None; @@ -127,18 +129,9 @@ fn get_holes(path: &Path, max_holes: usize) -> Result> { Ok(holes) } -fn main() -> Result<()> { - let args: Vec = env::args().collect(); - if args.len() < 2 { - println!("Usage: layer_map_analyzer PAGESERVER_DATA_DIR [MAX_HOLES]"); - return Ok(()); - } - let storage_path = PathBuf::from_str(&args[1])?; - let max_holes = if args.len() > 2 { - args[2].parse::().unwrap() - } else { - DEFAULT_MAX_HOLES - }; +pub(crate) fn main(cmd: 
&AnalyzeLayerMapCmd) -> Result<()> { + let storage_path = &cmd.path; + let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. pageserver::virtual_file::init(10); diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs new file mode 100644 index 000000000000..d77cf0908cff --- /dev/null +++ b/pageserver/ctl/src/layers.rs @@ -0,0 +1,169 @@ +use std::path::{Path, PathBuf}; + +use anyhow::Result; +use clap::Subcommand; +use pageserver::tenant::block_io::BlockCursor; +use pageserver::tenant::disk_btree::DiskBtreeReader; +use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary}; +use pageserver::{page_cache, virtual_file}; +use pageserver::{ + repository::{Key, KEY_SIZE}, + tenant::{ + block_io::FileBlockReader, disk_btree::VisitDirection, + storage_layer::delta_layer::DELTA_KEY_SIZE, + }, + virtual_file::VirtualFile, +}; +use std::fs; +use utils::bin_ser::BeSer; + +use crate::layer_map_analyzer::parse_filename; + +#[derive(Subcommand)] +pub(crate) enum LayerCmd { + /// List all tenants and timelines under the pageserver path + /// + /// Example: `cargo run --bin pagectl layer list .neon/` + List { path: PathBuf }, + /// List all layers of a given tenant and timeline + /// + /// Example: `cargo run --bin pagectl layer list .neon/` + ListLayer { + path: PathBuf, + tenant: String, + timeline: String, + }, + /// Dump all information of a layer file + DumpLayer { + path: PathBuf, + tenant: String, + timeline: String, + /// The id from list-layer command + id: usize, + }, +} + +fn read_delta_file(path: impl AsRef) -> Result<()> { + use pageserver::tenant::blob_io::BlobCursor; + use pageserver::tenant::block_io::BlockReader; + + let path = path.as_ref(); + virtual_file::init(10); + page_cache::init(100); + let file = FileBlockReader::new(VirtualFile::open(path)?); + let summary_blk = file.read_blk(0)?; + let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; + let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + actual_summary.index_start_blk, + actual_summary.index_root_blk, + &file, + ); + // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API. + let mut all = vec![]; + tree_reader.visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, value_offset| { + let curr = Key::from_slice(&key[..KEY_SIZE]); + all.push((curr, BlobRef(value_offset))); + true + }, + )?; + let mut cursor = BlockCursor::new(&file); + for (k, v) in all { + let value = cursor.read_blob(v.pos())?; + println!("key:{} value_len:{}", k, value.len()); + } + // TODO(chi): special handling for last key? + Ok(()) +} + +pub(crate) fn main(cmd: &LayerCmd) -> Result<()> { + match cmd { + LayerCmd::List { path } => { + for tenant in fs::read_dir(path.join("tenants"))? { + let tenant = tenant?; + if !tenant.file_type()?.is_dir() { + continue; + } + println!("tenant {}", tenant.file_name().to_string_lossy()); + for timeline in fs::read_dir(tenant.path().join("timelines"))? { + let timeline = timeline?; + if !timeline.file_type()?.is_dir() { + continue; + } + println!("- timeline {}", timeline.file_name().to_string_lossy()); + } + } + } + LayerCmd::ListLayer { + path, + tenant, + timeline, + } => { + let timeline_path = path + .join("tenants") + .join(tenant) + .join("timelines") + .join(timeline); + let mut idx = 0; + for layer in fs::read_dir(timeline_path)? 
{ + let layer = layer?; + if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) + { + println!( + "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", + idx, + layer_file.key_range.start, + layer_file.key_range.end, + layer_file.lsn_range.start, + layer_file.lsn_range.end, + layer_file.is_delta, + ); + idx += 1; + } + } + } + LayerCmd::DumpLayer { + path, + tenant, + timeline, + id, + } => { + let timeline_path = path + .join("tenants") + .join(tenant) + .join("timelines") + .join(timeline); + let mut idx = 0; + for layer in fs::read_dir(timeline_path)? { + let layer = layer?; + if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) + { + if *id == idx { + // TODO(chi): dedup code + println!( + "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}", + idx, + layer_file.key_range.start, + layer_file.key_range.end, + layer_file.lsn_range.start, + layer_file.lsn_range.end, + layer_file.is_delta, + ); + + if layer_file.is_delta { + read_delta_file(layer.path())?; + } else { + anyhow::bail!("not supported yet :("); + } + + break; + } + idx += 1; + } + } + } + } + Ok(()) +} diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs new file mode 100644 index 000000000000..55db9eb7e755 --- /dev/null +++ b/pageserver/ctl/src/main.rs @@ -0,0 +1,179 @@ +//! A helper tool to manage pageserver binary files. +//! Accepts a file as an argument, attempts to parse it with all ways possible +//! and prints its interpreted context. +//! +//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file. + +mod draw_timeline_dir; +mod layer_map_analyzer; +mod layers; + +use clap::{Parser, Subcommand}; +use layers::LayerCmd; +use pageserver::{ + context::{DownloadBehavior, RequestContext}, + page_cache, + task_mgr::TaskKind, + tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, + virtual_file, +}; +use postgres_ffi::ControlFileData; +use std::path::{Path, PathBuf}; +use utils::{lsn::Lsn, project_git_version}; + +project_git_version!(GIT_VERSION); + +#[derive(Parser)] +#[command( + version = GIT_VERSION, + about = "Neon Pageserver binutils", + long_about = "Reads pageserver (and related) binary files management utility" +)] +#[command(propagate_version = true)] +struct CliOpts { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + Metadata(MetadataCmd), + PrintLayerFile(PrintLayerFileCmd), + DrawTimeline {}, + AnalyzeLayerMap(AnalyzeLayerMapCmd), + #[command(subcommand)] + Layer(LayerCmd), +} + +/// Read and update pageserver metadata file +#[derive(Parser)] +struct MetadataCmd { + /// Input metadata file path + metadata_path: PathBuf, + /// Replace disk consistent Lsn + disk_consistent_lsn: Option, + /// Replace previous record Lsn + prev_record_lsn: Option, + /// Replace latest gc cuttoff + latest_gc_cuttoff: Option, +} + +#[derive(Parser)] +struct PrintLayerFileCmd { + /// Pageserver data path + path: PathBuf, +} + +#[derive(Parser)] +struct AnalyzeLayerMapCmd { + /// Pageserver data path + path: PathBuf, + /// Max holes + max_holes: Option, +} + +fn main() -> anyhow::Result<()> { + let cli = CliOpts::parse(); + + match cli.command { + Commands::Layer(cmd) => { + layers::main(&cmd)?; + } + Commands::Metadata(cmd) => { + handle_metadata(&cmd)?; + } + Commands::DrawTimeline {} => { + draw_timeline_dir::main()?; + } + Commands::AnalyzeLayerMap(cmd) => { + layer_map_analyzer::main(&cmd)?; + } + Commands::PrintLayerFile(cmd) => { + if let Err(e) = read_pg_control_file(&cmd.path) { + println!( 
+ "Failed to read input file as a pg control one: {e:#}\n\ + Attempting to read it as layer file" + ); + print_layerfile(&cmd.path)?; + } + } + }; + Ok(()) +} + +fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> { + let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?; + println!("{control_file:?}"); + let control_file_initdb = Lsn(control_file.checkPoint); + println!( + "pg_initdb_lsn: {}, aligned: {}", + control_file_initdb, + control_file_initdb.align() + ); + Ok(()) +} + +fn print_layerfile(path: &Path) -> anyhow::Result<()> { + // Basic initialization of things that don't change after startup + virtual_file::init(10); + page_cache::init(100); + let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); + dump_layerfile_from_path(path, true, &ctx) +} + +fn handle_metadata( + MetadataCmd { + metadata_path: path, + disk_consistent_lsn, + prev_record_lsn, + latest_gc_cuttoff, + }: &MetadataCmd, +) -> Result<(), anyhow::Error> { + let metadata_bytes = std::fs::read(path)?; + let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; + println!("Current metadata:\n{meta:?}"); + let mut update_meta = false; + if let Some(disk_consistent_lsn) = disk_consistent_lsn { + meta = TimelineMetadata::new( + *disk_consistent_lsn, + meta.prev_record_lsn(), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + meta.latest_gc_cutoff_lsn(), + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + if let Some(prev_record_lsn) = prev_record_lsn { + meta = TimelineMetadata::new( + meta.disk_consistent_lsn(), + Some(*prev_record_lsn), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + meta.latest_gc_cutoff_lsn(), + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + if let Some(latest_gc_cuttoff) = latest_gc_cuttoff { + meta = TimelineMetadata::new( + meta.disk_consistent_lsn(), + meta.prev_record_lsn(), + meta.ancestor_timeline(), + meta.ancestor_lsn(), + *latest_gc_cuttoff, + meta.initdb_lsn(), + meta.pg_version(), + ); + update_meta = true; + } + + if update_meta { + let metadata_bytes = meta.to_bytes()?; + std::fs::write(path, metadata_bytes)?; + } + + Ok(()) +} diff --git a/pageserver/src/bin/pageserver_binutils.rs b/pageserver/src/bin/pageserver_binutils.rs deleted file mode 100644 index 5e2d39d685f0..000000000000 --- a/pageserver/src/bin/pageserver_binutils.rs +++ /dev/null @@ -1,174 +0,0 @@ -//! A helper tool to manage pageserver binary files. -//! Accepts a file as an argument, attempts to parse it with all ways possible -//! and prints its interpreted context. -//! -//! Separate, `metadata` subcommand allows to print and update pageserver's metadata file. -use std::{ - path::{Path, PathBuf}, - str::FromStr, -}; - -use anyhow::Context; -use clap::{value_parser, Arg, Command}; - -use pageserver::{ - context::{DownloadBehavior, RequestContext}, - page_cache, - task_mgr::TaskKind, - tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, - virtual_file, -}; -use postgres_ffi::ControlFileData; -use utils::{lsn::Lsn, project_git_version}; - -project_git_version!(GIT_VERSION); - -const METADATA_SUBCOMMAND: &str = "metadata"; - -fn main() -> anyhow::Result<()> { - let arg_matches = cli().get_matches(); - - match arg_matches.subcommand() { - Some((subcommand_name, subcommand_matches)) => { - let path = subcommand_matches - .get_one::("metadata_path") - .context("'metadata_path' argument is missing")? 
- .to_path_buf(); - anyhow::ensure!( - subcommand_name == METADATA_SUBCOMMAND, - "Unknown subcommand {subcommand_name}" - ); - handle_metadata(&path, subcommand_matches)?; - } - None => { - let path = arg_matches - .get_one::("path") - .context("'path' argument is missing")? - .to_path_buf(); - println!( - "No subcommand specified, attempting to guess the format for file {}", - path.display() - ); - if let Err(e) = read_pg_control_file(&path) { - println!( - "Failed to read input file as a pg control one: {e:#}\n\ - Attempting to read it as layer file" - ); - print_layerfile(&path)?; - } - } - }; - Ok(()) -} - -fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> { - let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?; - println!("{control_file:?}"); - let control_file_initdb = Lsn(control_file.checkPoint); - println!( - "pg_initdb_lsn: {}, aligned: {}", - control_file_initdb, - control_file_initdb.align() - ); - Ok(()) -} - -fn print_layerfile(path: &Path) -> anyhow::Result<()> { - // Basic initialization of things that don't change after startup - virtual_file::init(10); - page_cache::init(100); - let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); - dump_layerfile_from_path(path, true, &ctx) -} - -fn handle_metadata(path: &Path, arg_matches: &clap::ArgMatches) -> Result<(), anyhow::Error> { - let metadata_bytes = std::fs::read(path)?; - let mut meta = TimelineMetadata::from_bytes(&metadata_bytes)?; - println!("Current metadata:\n{meta:?}"); - let mut update_meta = false; - if let Some(disk_consistent_lsn) = arg_matches.get_one::("disk_consistent_lsn") { - meta = TimelineMetadata::new( - Lsn::from_str(disk_consistent_lsn)?, - meta.prev_record_lsn(), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - meta.latest_gc_cutoff_lsn(), - meta.initdb_lsn(), - meta.pg_version(), - ); - update_meta = true; - } - if let Some(prev_record_lsn) = arg_matches.get_one::("prev_record_lsn") { - meta = TimelineMetadata::new( - meta.disk_consistent_lsn(), - Some(Lsn::from_str(prev_record_lsn)?), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - meta.latest_gc_cutoff_lsn(), - meta.initdb_lsn(), - meta.pg_version(), - ); - update_meta = true; - } - if let Some(latest_gc_cuttoff) = arg_matches.get_one::("latest_gc_cuttoff") { - meta = TimelineMetadata::new( - meta.disk_consistent_lsn(), - meta.prev_record_lsn(), - meta.ancestor_timeline(), - meta.ancestor_lsn(), - Lsn::from_str(latest_gc_cuttoff)?, - meta.initdb_lsn(), - meta.pg_version(), - ); - update_meta = true; - } - - if update_meta { - let metadata_bytes = meta.to_bytes()?; - std::fs::write(path, metadata_bytes)?; - } - - Ok(()) -} - -fn cli() -> Command { - Command::new("Neon Pageserver binutils") - .about("Reads pageserver (and related) binary files management utility") - .version(GIT_VERSION) - .arg( - Arg::new("path") - .help("Input file path") - .value_parser(value_parser!(PathBuf)) - .required(false), - ) - .subcommand( - Command::new(METADATA_SUBCOMMAND) - .about("Read and update pageserver metadata file") - .arg( - Arg::new("metadata_path") - .help("Input metadata file path") - .value_parser(value_parser!(PathBuf)) - .required(false), - ) - .arg( - Arg::new("disk_consistent_lsn") - .long("disk_consistent_lsn") - .help("Replace disk consistent Lsn"), - ) - .arg( - Arg::new("prev_record_lsn") - .long("prev_record_lsn") - .help("Replace previous record Lsn"), - ) - .arg( - Arg::new("latest_gc_cuttoff") - .long("latest_gc_cuttoff") - .help("Replace latest gc cuttoff"), - ), 
- ) -} - -#[test] -fn verify_cli() { - cli().debug_assert(); -} diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ce14f14aa9da..dd8e91bd5176 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -77,7 +77,7 @@ use utils::{ lsn::{Lsn, RecordLsn}, }; -mod blob_io; +pub mod blob_io; pub mod block_io; pub mod disk_btree; pub(crate) mod ephemeral_file; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index d30d6c5c6efb..3ca8e28c1662 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -542,7 +542,7 @@ impl From for LayerDescriptor { /// /// This is used by DeltaLayer and ImageLayer. Normally, this holds a reference to the /// global config, and paths to layer files are constructed using the tenant/timeline -/// path from the config. But in the 'pageserver_binutils' binary, we need to construct a Layer +/// path from the config. But in the 'pagectl' binary, we need to construct a Layer /// struct for a file on disk, without having a page server running, so that we have no /// config. In that case, we use the Path variant to hold the full path to the file on /// disk. diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ba3ab6dd4c27..63b8e57bb04d 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -110,7 +110,7 @@ const WILL_INIT: u64 = 1; /// reading/deserializing records themselves. /// #[derive(Debug, Serialize, Deserialize, Copy, Clone)] -struct BlobRef(u64); +pub struct BlobRef(pub u64); impl BlobRef { pub fn will_init(&self) -> bool { @@ -619,7 +619,7 @@ impl DeltaLayer { /// Create a DeltaLayer struct representing an existing file on disk. /// - /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. + /// This variant is only used for debugging purposes, by the 'pagectl' binary. pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index d298b3e852ef..a5dd16fae225 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -422,7 +422,7 @@ impl ImageLayer { /// Create an ImageLayer struct representing an existing file on disk. /// - /// This variant is only used for debugging purposes, by the 'pageserver_binutils' binary. + /// This variant is only used for debugging purposes, by the 'pagectl' binary. pub fn new_for_path(path: &Path, file: File) -> Result { let mut summary_buf = Vec::new(); summary_buf.resize(PAGE_SZ, 0); From f276f216369725c3550f978a2bf9387447de755d Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Wed, 24 May 2023 17:00:21 -0400 Subject: [PATCH 02/59] ci: use eu-central-1 bucket (#4315) Probably increase CI success rate. 
--------- Signed-off-by: Alex Chi --- .github/actions/run-python-test-set/action.yml | 10 ---------- .github/workflows/build_and_test.yml | 6 ++---- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 449398558761..dec1f47e4713 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -36,14 +36,6 @@ inputs: description: 'Region name for real s3 tests' required: false default: '' - real_s3_access_key_id: - description: 'Access key id' - required: false - default: '' - real_s3_secret_access_key: - description: 'Secret access key' - required: false - default: '' rerun_flaky: description: 'Whether to rerun flaky tests' required: false @@ -104,8 +96,6 @@ runs: COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install TEST_OUTPUT: /tmp/test_output BUILD_TYPE: ${{ inputs.build_type }} - AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} - AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 845a21ad0e91..6dcf988191f1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -346,10 +346,8 @@ jobs: test_selection: regress needs_postgres_source: true run_with_real_s3: true - real_s3_bucket: ci-tests-s3 - real_s3_region: us-west-2 - real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" - real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" + real_s3_bucket: neon-github-ci-tests + real_s3_region: eu-central-1 rerun_flaky: true pg_version: ${{ matrix.pg_version }} env: From e11ba24ec55cf26bd9b39e77f8d05b5d5f58f454 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 25 May 2023 10:49:09 +0200 Subject: [PATCH 03/59] tenant loops: operate on the Arc directly (#4298) (Instead of going through mgr every iteration.) The `wait_for_active_tenant` function's `wait` argument could be removed because it was only used for the loop that waits for the tenant to show up in the tenants map. Since we're passing the tenant in, we now longer need to get it from the tenants map. NB that there's no guarantee that the tenant object is in the tenants map at the time the background loop function starts running. But the tenant mgr guarantees that it will be quite soon. See `tenant_map_insert` way upwards in the call hierarchy for details. This is prep work to eliminate `subscribe_for_state_updates` (PR #4299 ) Fixes: #3501 --- pageserver/src/tenant.rs | 4 +- pageserver/src/tenant/tasks.rs | 77 +++++++++++++-------------- test_runner/fixtures/neon_fixtures.py | 2 - 3 files changed, 38 insertions(+), 45 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index dd8e91bd5176..e75d9f0d269a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1605,7 +1605,7 @@ impl Tenant { } /// Changes tenant status to active, unless shutdown was already requested. 
- fn activate(&self, ctx: &RequestContext) -> anyhow::Result<()> { + fn activate(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_id(); let mut result = Ok(()); @@ -1638,7 +1638,7 @@ impl Tenant { // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. - tasks::start_background_loops(self.tenant_id); + tasks::start_background_loops(self); let mut activated_timelines = 0; let mut timelines_broken_during_activation = 0; diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 6bf26f1da1b4..b3c8a4a3bbd3 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -9,13 +9,12 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::mgr; use crate::tenant::{Tenant, TenantState}; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::TenantId; -pub fn start_background_loops(tenant_id: TenantId) { +pub fn start_background_loops(tenant: &Arc) { + let tenant_id = tenant.tenant_id; task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, @@ -23,11 +22,14 @@ pub fn start_background_loops(tenant_id: TenantId) { None, &format!("compactor for tenant {tenant_id}"), false, - async move { - compaction_loop(tenant_id) - .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) - .await; - Ok(()) + { + let tenant = Arc::clone(tenant); + async move { + compaction_loop(tenant) + .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) + .await; + Ok(()) + } }, ); task_mgr::spawn( @@ -37,11 +39,14 @@ pub fn start_background_loops(tenant_id: TenantId) { None, &format!("garbage collector for tenant {tenant_id}"), false, - async move { - gc_loop(tenant_id) - .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) - .await; - Ok(()) + { + let tenant = Arc::clone(tenant); + async move { + gc_loop(tenant) + .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) + .await; + Ok(()) + } }, ); } @@ -49,7 +54,7 @@ pub fn start_background_loops(tenant_id: TenantId) { /// /// Compaction task's main loop /// -async fn compaction_loop(tenant_id: TenantId) { +async fn compaction_loop(tenant: Arc) { let wait_duration = Duration::from_secs(2); info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); @@ -60,16 +65,16 @@ async fn compaction_loop(tenant_id: TenantId) { loop { trace!("waking up"); - let tenant = tokio::select! { + tokio::select! { _ = cancel.cancelled() => { info!("received cancellation request"); return; }, - tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { ControlFlow::Break(()) => return, - ControlFlow::Continue(tenant) => tenant, + ControlFlow::Continue(()) => (), }, - }; + } let period = tenant.get_compaction_period(); @@ -119,7 +124,7 @@ async fn compaction_loop(tenant_id: TenantId) { /// /// GC task's main loop /// -async fn gc_loop(tenant_id: TenantId) { +async fn gc_loop(tenant: Arc) { let wait_duration = Duration::from_secs(2); info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); @@ -127,21 +132,22 @@ async fn gc_loop(tenant_id: TenantId) { let cancel = task_mgr::shutdown_token(); // GC might require downloading, to find the cutoff LSN that corresponds to the // cutoff specified as time. 
- let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); + let ctx = + RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let mut first = true; loop { trace!("waking up"); - let tenant = tokio::select! { + tokio::select! { _ = cancel.cancelled() => { info!("received cancellation request"); return; }, - tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { + tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { ControlFlow::Break(()) => return, - ControlFlow::Continue(tenant) => tenant, + ControlFlow::Continue(()) => (), }, - }; + } let period = tenant.get_gc_period(); @@ -161,7 +167,9 @@ async fn gc_loop(tenant_id: TenantId) { Duration::from_secs(10) } else { // Run gc - let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await; + let res = tenant + .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx) + .await; if let Err(e) = res { error!("Gc failed, retrying in {:?}: {e:?}", wait_duration); wait_duration @@ -187,23 +195,10 @@ async fn gc_loop(tenant_id: TenantId) { trace!("GC loop stopped."); } -async fn wait_for_active_tenant( - tenant_id: TenantId, - wait: Duration, -) -> ControlFlow<(), Arc> { - let tenant = loop { - match mgr::get_tenant(tenant_id, false).await { - Ok(tenant) => break tenant, - Err(e) => { - error!("Failed to get a tenant {tenant_id}: {e:#}"); - tokio::time::sleep(wait).await; - } - } - }; - +async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { // if the tenant has a proper status already, no need to wait for anything if tenant.current_state() == TenantState::Active { - ControlFlow::Continue(tenant) + ControlFlow::Continue(()) } else { let mut tenant_state_updates = tenant.subscribe_for_state_updates(); loop { @@ -213,7 +208,7 @@ async fn wait_for_active_tenant( match new_state { TenantState::Active => { debug!("Tenant state changed to active, continuing the task loop"); - return ControlFlow::Continue(tenant); + return ControlFlow::Continue(()); } state => { debug!("Not running the task loop, tenant is not active: {state:?}"); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 59afc104e67c..3ff5429616b0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1603,8 +1603,6 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional # https://github.com/neondatabase/neon/issues/2442 ".*could not remove ephemeral file.*No such file or directory.*", # FIXME: These need investigation - ".*gc_loop.*Failed to get a tenant .* Tenant .* not found.*", - ".*compaction_loop.*Failed to get a tenant .* Tenant .* not found.*", ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", ".*Removing intermediate uninit mark file.*", From 6052ecee0701a6b2ab9e603de62f2f6c443ee504 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Thu, 25 May 2023 01:36:57 -0800 Subject: [PATCH 04/59] Add connector extension to send Role/Database updates to console (#3891) ## Describe your changes ## Issue ticket number and link ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [x] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? 
if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --- .github/workflows/build_and_test.yml | 3 + Dockerfile.compute-node | 1 + compute_tools/src/compute.rs | 4 + compute_tools/src/pg_helpers.rs | 5 +- compute_tools/src/spec.rs | 2 +- compute_tools/tests/pg_helpers_tests.rs | 2 +- pgxn/neon/Makefile | 4 +- pgxn/neon/control_plane_connector.c | 830 +++++++++++++++++++++ pgxn/neon/control_plane_connector.h | 6 + pgxn/neon/neon.c | 5 + test_runner/fixtures/pg_version.py | 8 +- test_runner/regress/test_ddl_forwarding.py | 219 ++++++ 12 files changed, 1081 insertions(+), 8 deletions(-) create mode 100644 pgxn/neon/control_plane_connector.c create mode 100644 pgxn/neon/control_plane_connector.h create mode 100644 test_runner/regress/test_ddl_forwarding.py diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6dcf988191f1..bcc02398a196 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -661,6 +661,9 @@ jobs: project: nrdv0s4kcs push: true tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}} + build-args: | + GIT_VERSION=${{ github.sha }} + REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com compute-tools-image: runs-on: [ self-hosted, gen3, large ] diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 3a3dee8a8a33..de8a904c02ea 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -632,6 +632,7 @@ RUN apt update && \ libxml2 \ libxslt1.1 \ libzstd1 \ + libcurl4-openssl-dev \ procps && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index da5ad00da672..a7746629a858 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -362,6 +362,8 @@ impl ComputeNode { }; // Proceed with post-startup configuration. Note, that order of operations is important. + // Disable DDL forwarding because control plane already knows about these roles/databases. + client.simple_query("SET neon.forward_ddl = false")?; let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec; handle_roles(spec, &mut client)?; handle_databases(spec, &mut client)?; @@ -403,7 +405,9 @@ impl ComputeNode { self.pg_reload_conf(&mut client)?; // Proceed with post-startup configuration. Note, that order of operations is important. + // Disable DDL forwarding because control plane already knows about these roles/databases. if spec.mode == ComputeMode::Primary { + client.simple_query("SET neon.forward_ddl = false")?; handle_roles(&spec, &mut client)?; handle_databases(&spec, &mut client)?; handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 40dbea6907d1..ed00485d5ac3 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -121,9 +121,8 @@ impl RoleExt for Role { /// string of arguments. fn to_pg_options(&self) -> String { // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane. - // For now, we do not use generic `options` for roles. Once used, add - // `self.options.as_pg_options()` somewhere here. 
- let mut params: String = "LOGIN".to_string(); + let mut params: String = self.options.as_pg_options(); + params.push_str(" LOGIN"); if let Some(pass) = &self.encrypted_password { // Some time ago we supported only md5 and treated all encrypted_password as md5. diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index bf3c40720238..a2a19ae0daf7 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -62,7 +62,7 @@ fn do_control_plane_request( } } -/// Request spec from the control-plane by compute_id. If `NEON_CONSOLE_JWT` +/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN` /// env variable is set, it will be used for authorization. pub fn get_spec_from_control_plane( base_uri: &str, diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs index a63ee038c7f8..265556d3b9b9 100644 --- a/compute_tools/tests/pg_helpers_tests.rs +++ b/compute_tools/tests/pg_helpers_tests.rs @@ -16,7 +16,7 @@ mod pg_helpers_tests { ); assert_eq!( spec.cluster.roles.first().unwrap().to_pg_options(), - "LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'" + " LOGIN PASSWORD 'md56b1d16b78004bbd51fa06af9eda75972'" ); } diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index ec377dbb1ecf..194802347233 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -11,10 +11,12 @@ OBJS = \ pagestore_smgr.o \ relsize_cache.o \ walproposer.o \ - walproposer_utils.o + walproposer_utils.o \ + control_plane_connector.o PG_CPPFLAGS = -I$(libpq_srcdir) SHLIB_LINK_INTERNAL = $(libpq) +SHLIB_LINK = -lcurl EXTENSION = neon DATA = neon--1.0.sql diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c new file mode 100644 index 000000000000..82e4af4b4a79 --- /dev/null +++ b/pgxn/neon/control_plane_connector.c @@ -0,0 +1,830 @@ +/*------------------------------------------------------------------------- + * + * control_plane_connector.c + * Captures updates to roles/databases using ProcessUtility_hook and + * sends them to the control ProcessUtility_hook. The changes are sent + * via HTTP to the URL specified by the GUC neon.console_url when the + * transaction commits. Forwarding may be disabled temporarily by + * setting neon.forward_ddl to false. + * + * Currently, the transaction may abort AFTER + * changes have already been forwarded, and that case is not handled. + * Subtransactions are handled using a stack of hash tables, which + * accumulate changes. On subtransaction commit, the top of the stack + * is merged with the table below it. 
+ * + * IDENTIFICATION + * contrib/neon/control_plane_connector.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "tcop/pquery.h" +#include "tcop/utility.h" +#include "access/xact.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "commands/defrem.h" +#include "miscadmin.h" +#include "utils/acl.h" +#include "fmgr.h" +#include "utils/guc.h" +#include "port.h" +#include +#include "utils/jsonb.h" + +static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; + +/* GUCs */ +static char *ConsoleURL = NULL; +static bool ForwardDDL = true; + +/* Curl structures for sending the HTTP requests */ +static CURL * CurlHandle; +static struct curl_slist *ContentHeader = NULL; + +/* + * CURL docs say that this buffer must exist until we call curl_easy_cleanup + * (which we never do), so we make this a static + */ +static char CurlErrorBuf[CURL_ERROR_SIZE]; + +typedef enum +{ + Op_Set, /* An upsert: Either a creation or an alter */ + Op_Delete, +} OpType; + +typedef struct +{ + char name[NAMEDATALEN]; + Oid owner; + char old_name[NAMEDATALEN]; + OpType type; +} DbEntry; + +typedef struct +{ + char name[NAMEDATALEN]; + char old_name[NAMEDATALEN]; + const char *password; + OpType type; +} RoleEntry; + +/* + * We keep one of these for each subtransaction in a stack. When a subtransaction + * commits, we merge the top of the stack into the table below it. It is allocated in the + * subtransaction's context. + */ +typedef struct DdlHashTable +{ + struct DdlHashTable *prev_table; + HTAB *db_table; + HTAB *role_table; +} DdlHashTable; + +static DdlHashTable RootTable; +static DdlHashTable * CurrentDdlTable = &RootTable; + +static void +PushKeyValue(JsonbParseState **state, char *key, char *value) +{ + JsonbValue k, + v; + + k.type = jbvString; + k.val.string.len = strlen(key); + k.val.string.val = key; + v.type = jbvString; + v.val.string.len = strlen(value); + v.val.string.val = value; + pushJsonbValue(state, WJB_KEY, &k); + pushJsonbValue(state, WJB_VALUE, &v); +} + +static char * +ConstructDeltaMessage() +{ + JsonbParseState *state = NULL; + + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + if (RootTable.db_table) + { + JsonbValue dbs; + + dbs.type = jbvString; + dbs.val.string.val = "dbs"; + dbs.val.string.len = strlen(dbs.val.string.val); + pushJsonbValue(&state, WJB_KEY, &dbs); + pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); + + HASH_SEQ_STATUS status; + DbEntry *entry; + + hash_seq_init(&status, RootTable.db_table); + while ((entry = hash_seq_search(&status)) != NULL) + { + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + PushKeyValue(&state, "op", entry->type == Op_Set ? 
"set" : "del"); + PushKeyValue(&state, "name", entry->name); + if (entry->owner != InvalidOid) + { + PushKeyValue(&state, "owner", GetUserNameFromId(entry->owner, false)); + } + if (entry->old_name[0] != '\0') + { + PushKeyValue(&state, "old_name", entry->old_name); + } + pushJsonbValue(&state, WJB_END_OBJECT, NULL); + } + pushJsonbValue(&state, WJB_END_ARRAY, NULL); + } + + if (RootTable.role_table) + { + JsonbValue roles; + + roles.type = jbvString; + roles.val.string.val = "roles"; + roles.val.string.len = strlen(roles.val.string.val); + pushJsonbValue(&state, WJB_KEY, &roles); + pushJsonbValue(&state, WJB_BEGIN_ARRAY, NULL); + + HASH_SEQ_STATUS status; + RoleEntry *entry; + + hash_seq_init(&status, RootTable.role_table); + while ((entry = hash_seq_search(&status)) != NULL) + { + pushJsonbValue(&state, WJB_BEGIN_OBJECT, NULL); + PushKeyValue(&state, "op", entry->type == Op_Set ? "set" : "del"); + PushKeyValue(&state, "name", entry->name); + if (entry->password) + { + PushKeyValue(&state, "password", (char *) entry->password); + } + if (entry->old_name[0] != '\0') + { + PushKeyValue(&state, "old_name", entry->old_name); + } + pushJsonbValue(&state, WJB_END_OBJECT, NULL); + } + pushJsonbValue(&state, WJB_END_ARRAY, NULL); + } + JsonbValue *result = pushJsonbValue(&state, WJB_END_OBJECT, NULL); + Jsonb *jsonb = JsonbValueToJsonb(result); + + return JsonbToCString(NULL, &jsonb->root, 0 /* estimated_len */ ); +} + +#define ERROR_SIZE 1024 + +typedef struct +{ + char str[ERROR_SIZE]; + size_t size; +} ErrorString; + +static size_t +ErrorWriteCallback(char *ptr, size_t size, size_t nmemb, void *userdata) +{ + /* Docs say size is always 1 */ + ErrorString *str = userdata; + + size_t to_write = nmemb; + + /* +1 for null terminator */ + if (str->size + nmemb + 1 >= ERROR_SIZE) + to_write = ERROR_SIZE - str->size - 1; + + /* Ignore everyrthing past the first ERROR_SIZE bytes */ + if (to_write == 0) + return nmemb; + memcpy(str->str + str->size, ptr, to_write); + str->size += to_write; + str->str[str->size] = '\0'; + return nmemb; +} + +static void +SendDeltasToControlPlane() +{ + if (!RootTable.db_table && !RootTable.role_table) + return; + if (!ConsoleURL) + { + elog(LOG, "ConsoleURL not set, skipping forwarding"); + return; + } + if (!ForwardDDL) + return; + + char *message = ConstructDeltaMessage(); + ErrorString str = {}; + + curl_easy_setopt(CurlHandle, CURLOPT_CUSTOMREQUEST, "PATCH"); + curl_easy_setopt(CurlHandle, CURLOPT_HTTPHEADER, ContentHeader); + curl_easy_setopt(CurlHandle, CURLOPT_POSTFIELDS, message); + curl_easy_setopt(CurlHandle, CURLOPT_URL, ConsoleURL); + curl_easy_setopt(CurlHandle, CURLOPT_ERRORBUFFER, CurlErrorBuf); + curl_easy_setopt(CurlHandle, CURLOPT_TIMEOUT, 3L /* seconds */ ); + curl_easy_setopt(CurlHandle, CURLOPT_WRITEDATA, &str); + curl_easy_setopt(CurlHandle, CURLOPT_WRITEFUNCTION, ErrorWriteCallback); + + const int num_retries = 5; + int curl_status; + + for (int i = 0; i < num_retries; i++) + { + if ((curl_status = curl_easy_perform(CurlHandle)) == 0) + break; + elog(LOG, "Curl request failed on attempt %d: %s", i, CurlErrorBuf); + pg_usleep(1000 * 1000); + } + if (curl_status != 0) + { + elog(ERROR, "Failed to perform curl request: %s", CurlErrorBuf); + } + else + { + long response_code; + + if (curl_easy_getinfo(CurlHandle, CURLINFO_RESPONSE_CODE, &response_code) != CURLE_UNKNOWN_OPTION) + { + bool error_exists = str.size != 0; + + if (response_code != 200) + { + if (error_exists) + { + elog(ERROR, + "Received HTTP code %ld from control plane: %s", + 
response_code, + str.str); + } + else + { + elog(ERROR, + "Received HTTP code %ld from control plane", + response_code); + } + } + } + } +} + +static void +InitDbTableIfNeeded() +{ + if (!CurrentDdlTable->db_table) + { + HASHCTL db_ctl = {}; + + db_ctl.keysize = NAMEDATALEN; + db_ctl.entrysize = sizeof(DbEntry); + db_ctl.hcxt = CurTransactionContext; + CurrentDdlTable->db_table = hash_create( + "Dbs Created", + 4, + &db_ctl, + HASH_ELEM | HASH_STRINGS | HASH_CONTEXT); + } +} + +static void +InitRoleTableIfNeeded() +{ + if (!CurrentDdlTable->role_table) + { + HASHCTL role_ctl = {}; + + role_ctl.keysize = NAMEDATALEN; + role_ctl.entrysize = sizeof(RoleEntry); + role_ctl.hcxt = CurTransactionContext; + CurrentDdlTable->role_table = hash_create( + "Roles Created", + 4, + &role_ctl, + HASH_ELEM | HASH_STRINGS | HASH_CONTEXT); + } +} + +static void +PushTable() +{ + DdlHashTable *new_table = MemoryContextAlloc(CurTransactionContext, sizeof(DdlHashTable)); + + new_table->prev_table = CurrentDdlTable; + new_table->role_table = NULL; + new_table->db_table = NULL; + CurrentDdlTable = new_table; +} + +static void +MergeTable() +{ + DdlHashTable *old_table = CurrentDdlTable; + + CurrentDdlTable = old_table->prev_table; + + if (old_table->db_table) + { + InitDbTableIfNeeded(); + DbEntry *entry; + HASH_SEQ_STATUS status; + + hash_seq_init(&status, old_table->db_table); + while ((entry = hash_seq_search(&status)) != NULL) + { + DbEntry *to_write = hash_search( + CurrentDdlTable->db_table, + entry->name, + HASH_ENTER, + NULL); + + to_write->type = entry->type; + if (entry->owner != InvalidOid) + to_write->owner = entry->owner; + strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); + if (entry->old_name[0] != '\0') + { + bool found_old = false; + DbEntry *old = hash_search( + CurrentDdlTable->db_table, + entry->old_name, + HASH_FIND, + &found_old); + + if (found_old) + { + if (old->old_name[0] != '\0') + strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); + else + strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); + hash_search( + CurrentDdlTable->db_table, + entry->old_name, + HASH_REMOVE, + NULL); + } + } + } + hash_destroy(old_table->db_table); + } + + if (old_table->role_table) + { + InitRoleTableIfNeeded(); + RoleEntry *entry; + HASH_SEQ_STATUS status; + + hash_seq_init(&status, old_table->role_table); + while ((entry = hash_seq_search(&status)) != NULL) + { + RoleEntry *to_write = hash_search( + CurrentDdlTable->role_table, + entry->name, + HASH_ENTER, + NULL); + + to_write->type = entry->type; + if (entry->password) + to_write->password = entry->password; + strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); + if (entry->old_name[0] != '\0') + { + bool found_old = false; + RoleEntry *old = hash_search( + CurrentDdlTable->role_table, + entry->old_name, + HASH_FIND, + &found_old); + + if (found_old) + { + if (old->old_name[0] != '\0') + strlcpy(to_write->old_name, old->old_name, NAMEDATALEN); + else + strlcpy(to_write->old_name, entry->old_name, NAMEDATALEN); + hash_search(CurrentDdlTable->role_table, + entry->old_name, + HASH_REMOVE, + NULL); + } + } + } + hash_destroy(old_table->role_table); + } +} + +static void +PopTable() +{ + /* + * Current table gets freed because it is allocated in aborted + * subtransaction's memory context. 
+ */ + CurrentDdlTable = CurrentDdlTable->prev_table; +} + +static void +NeonSubXactCallback( + SubXactEvent event, + SubTransactionId mySubid, + SubTransactionId parentSubid, + void *arg) +{ + switch (event) + { + case SUBXACT_EVENT_START_SUB: + return PushTable(); + case SUBXACT_EVENT_COMMIT_SUB: + return MergeTable(); + case SUBXACT_EVENT_ABORT_SUB: + return PopTable(); + default: + return; + } +} + +static void +NeonXactCallback(XactEvent event, void *arg) +{ + if (event == XACT_EVENT_PRE_COMMIT || event == XACT_EVENT_PARALLEL_PRE_COMMIT) + { + SendDeltasToControlPlane(); + } + RootTable.role_table = NULL; + RootTable.db_table = NULL; + Assert(CurrentDdlTable == &RootTable); +} + +static void +HandleCreateDb(CreatedbStmt *stmt) +{ + InitDbTableIfNeeded(); + DefElem *downer = NULL; + ListCell *option; + + foreach(option, stmt->options) + { + DefElem *defel = lfirst(option); + + if (strcmp(defel->defname, "owner") == 0) + downer = defel; + } + bool found = false; + DbEntry *entry = hash_search( + CurrentDdlTable->db_table, + stmt->dbname, + HASH_ENTER, + &found); + + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); + + entry->type = Op_Set; + if (downer && downer->arg) + entry->owner = get_role_oid(defGetString(downer), false); + else + entry->owner = GetUserId(); +} + +static void +HandleAlterOwner(AlterOwnerStmt *stmt) +{ + if (stmt->objectType != OBJECT_DATABASE) + return; + InitDbTableIfNeeded(); + const char *name = strVal(stmt->object); + bool found = false; + DbEntry *entry = hash_search( + CurrentDdlTable->db_table, + name, + HASH_ENTER, + &found); + + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); + + entry->owner = get_role_oid(get_rolespec_name(stmt->newowner), false); + entry->type = Op_Set; +} + +static void +HandleDbRename(RenameStmt *stmt) +{ + Assert(stmt->renameType == OBJECT_DATABASE); + InitDbTableIfNeeded(); + bool found = false; + DbEntry *entry = hash_search( + CurrentDdlTable->db_table, + stmt->subname, + HASH_FIND, + &found); + DbEntry *entry_for_new_name = hash_search( + CurrentDdlTable->db_table, + stmt->newname, + HASH_ENTER, + NULL); + + entry_for_new_name->type = Op_Set; + if (found) + { + if (entry->old_name[0] != '\0') + strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN); + else + strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN); + entry_for_new_name->owner = entry->owner; + hash_search( + CurrentDdlTable->db_table, + stmt->subname, + HASH_REMOVE, + NULL); + } + else + { + strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN); + entry_for_new_name->owner = InvalidOid; + } +} + +static void +HandleDropDb(DropdbStmt *stmt) +{ + InitDbTableIfNeeded(); + bool found = false; + DbEntry *entry = hash_search( + CurrentDdlTable->db_table, + stmt->dbname, + HASH_ENTER, + &found); + + entry->type = Op_Delete; + entry->owner = InvalidOid; + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); +} + +static void +HandleCreateRole(CreateRoleStmt *stmt) +{ + InitRoleTableIfNeeded(); + bool found = false; + RoleEntry *entry = hash_search( + CurrentDdlTable->role_table, + stmt->role, + HASH_ENTER, + &found); + DefElem *dpass = NULL; + ListCell *option; + + foreach(option, stmt->options) + { + DefElem *defel = lfirst(option); + + if (strcmp(defel->defname, "password") == 0) + dpass = defel; + } + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); + if (dpass && dpass->arg) + entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg)); + else + 
entry->password = NULL; + entry->type = Op_Set; +} + +static void +HandleAlterRole(AlterRoleStmt *stmt) +{ + InitRoleTableIfNeeded(); + DefElem *dpass = NULL; + ListCell *option; + + foreach(option, stmt->options) + { + DefElem *defel = lfirst(option); + + if (strcmp(defel->defname, "password") == 0) + dpass = defel; + } + /* We only care about updates to the password */ + if (!dpass) + return; + bool found = false; + RoleEntry *entry = hash_search( + CurrentDdlTable->role_table, + stmt->role->rolename, + HASH_ENTER, + &found); + + if (!found) + memset(entry->old_name, 0, sizeof(entry->old_name)); + if (dpass->arg) + entry->password = MemoryContextStrdup(CurTransactionContext, strVal(dpass->arg)); + else + entry->password = NULL; + entry->type = Op_Set; +} + +static void +HandleRoleRename(RenameStmt *stmt) +{ + InitRoleTableIfNeeded(); + Assert(stmt->renameType == OBJECT_ROLE); + bool found = false; + RoleEntry *entry = hash_search( + CurrentDdlTable->role_table, + stmt->subname, + HASH_FIND, + &found); + + RoleEntry *entry_for_new_name = hash_search( + CurrentDdlTable->role_table, + stmt->newname, + HASH_ENTER, + NULL); + + entry_for_new_name->type = Op_Set; + if (found) + { + if (entry->old_name[0] != '\0') + strlcpy(entry_for_new_name->old_name, entry->old_name, NAMEDATALEN); + else + strlcpy(entry_for_new_name->old_name, entry->name, NAMEDATALEN); + entry_for_new_name->password = entry->password; + hash_search( + CurrentDdlTable->role_table, + entry->name, + HASH_REMOVE, + NULL); + } + else + { + strlcpy(entry_for_new_name->old_name, stmt->subname, NAMEDATALEN); + entry_for_new_name->password = NULL; + } +} + +static void +HandleDropRole(DropRoleStmt *stmt) +{ + InitRoleTableIfNeeded(); + ListCell *item; + + foreach(item, stmt->roles) + { + RoleSpec *spec = lfirst(item); + bool found = false; + RoleEntry *entry = hash_search( + CurrentDdlTable->role_table, + spec->rolename, + HASH_ENTER, + &found); + + entry->type = Op_Delete; + entry->password = NULL; + if (!found) + memset(entry->old_name, 0, sizeof(entry)); + } +} + +static void +HandleRename(RenameStmt *stmt) +{ + if (stmt->renameType == OBJECT_DATABASE) + return HandleDbRename(stmt); + else if (stmt->renameType == OBJECT_ROLE) + return HandleRoleRename(stmt); +} + +static void +NeonProcessUtility( + PlannedStmt *pstmt, + const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + QueryCompletion *qc) +{ + Node *parseTree = pstmt->utilityStmt; + + switch (nodeTag(parseTree)) + { + case T_CreatedbStmt: + HandleCreateDb(castNode(CreatedbStmt, parseTree)); + break; + case T_AlterOwnerStmt: + HandleAlterOwner(castNode(AlterOwnerStmt, parseTree)); + break; + case T_RenameStmt: + HandleRename(castNode(RenameStmt, parseTree)); + break; + case T_DropdbStmt: + HandleDropDb(castNode(DropdbStmt, parseTree)); + break; + case T_CreateRoleStmt: + HandleCreateRole(castNode(CreateRoleStmt, parseTree)); + break; + case T_AlterRoleStmt: + HandleAlterRole(castNode(AlterRoleStmt, parseTree)); + break; + case T_DropRoleStmt: + HandleDropRole(castNode(DropRoleStmt, parseTree)); + break; + default: + break; + } + + if (PreviousProcessUtilityHook) + { + PreviousProcessUtilityHook( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + } + else + { + standard_ProcessUtility( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + } +} + +extern void +InitControlPlaneConnector() +{ + 
PreviousProcessUtilityHook = ProcessUtility_hook; + ProcessUtility_hook = NeonProcessUtility; + RegisterXactCallback(NeonXactCallback, NULL); + RegisterSubXactCallback(NeonSubXactCallback, NULL); + + DefineCustomStringVariable( + "neon.console_url", + "URL of the Neon Console, which will be forwarded changes to dbs and roles", + NULL, + &ConsoleURL, + NULL, + PGC_POSTMASTER, + 0, + NULL, + NULL, + NULL); + + DefineCustomBoolVariable( + "neon.forward_ddl", + "Controls whether to forward DDL to the control plane", + NULL, + &ForwardDDL, + true, + PGC_SUSET, + 0, + NULL, + NULL, + NULL); + + const char *jwt_token = getenv("NEON_CONTROL_PLANE_TOKEN"); + + if (!jwt_token) + { + elog(LOG, "Missing NEON_CONTROL_PLANE_TOKEN environment variable, forwarding will not be authenticated"); + } + + if (curl_global_init(CURL_GLOBAL_DEFAULT)) + { + elog(ERROR, "Failed to initialize curl"); + } + if ((CurlHandle = curl_easy_init()) == NULL) + { + elog(ERROR, "Failed to initialize curl handle"); + } + if ((ContentHeader = curl_slist_append(ContentHeader, "Content-Type: application/json")) == NULL) + { + elog(ERROR, "Failed to initialize content header"); + } + + if (jwt_token) + { + char auth_header[8192]; + + snprintf(auth_header, sizeof(auth_header), "Authorization: Bearer %s", jwt_token); + if ((ContentHeader = curl_slist_append(ContentHeader, auth_header)) == NULL) + { + elog(ERROR, "Failed to initialize authorization header"); + } + } +} diff --git a/pgxn/neon/control_plane_connector.h b/pgxn/neon/control_plane_connector.h new file mode 100644 index 000000000000..12d6a97562f8 --- /dev/null +++ b/pgxn/neon/control_plane_connector.h @@ -0,0 +1,6 @@ +#ifndef CONTROL_PLANE_CONNECTOR_H +#define CONTROL_PLANE_CONNECTOR_H + +void InitControlPlaneConnector(); + +#endif diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 217c1974a063..b45d7cfc32f4 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -25,6 +25,7 @@ #include "neon.h" #include "walproposer.h" #include "pagestore_client.h" +#include "control_plane_connector.h" PG_MODULE_MAGIC; void _PG_init(void); @@ -34,7 +35,11 @@ _PG_init(void) { pg_init_libpagestore(); pg_init_walproposer(); + InitControlPlaneConnector(); + // Important: This must happen after other parts of the extension + // are loaded, otherwise any settings to GUCs that were set before + // the extension was loaded will be removed. EmitWarningsOnPlaceholders("neon"); } diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index d67f0883655b..14ae88cc2cd1 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -27,6 +27,10 @@ class PgVersion(str, enum.Enum): def __repr__(self) -> str: return f"'{self.value}'" + # Make this explicit for Python 3.11 compatibility, which changes the behavior of enums + def __str__(self) -> str: + return self.value + # In GitHub workflows we use Postgres version with v-prefix (e.g. v14 instead of just 14), # sometime we need to do so in tests. 
@property @@ -78,11 +82,11 @@ def pytest_addoption(parser: Parser): @pytest.fixture(scope="session") def pg_version(request: FixtureRequest) -> Iterator[PgVersion]: if v := request.config.getoption("--pg-version"): - version, source = v, "from --pg-version commad-line argument" + version, source = v, "from --pg-version command-line argument" elif v := os.environ.get("DEFAULT_PG_VERSION"): version, source = PgVersion(v), "from DEFAULT_PG_VERSION environment variable" else: - version, source = DEFAULT_VERSION, "default verson" + version, source = DEFAULT_VERSION, "default version" log.info(f"pg_version is {version} ({source})") yield version diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py new file mode 100644 index 000000000000..27ebd3c18185 --- /dev/null +++ b/test_runner/regress/test_ddl_forwarding.py @@ -0,0 +1,219 @@ +from types import TracebackType +from typing import Any, Dict, List, Optional, Tuple, Type + +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + PortDistributor, + VanillaPostgres, +) +from pytest_httpserver import HTTPServer +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response + + +@pytest.fixture(scope="session") +def httpserver_listen_address(port_distributor: PortDistributor): + port = port_distributor.get_port() + return ("localhost", port) + + +def handle_db(dbs, roles, operation): + if operation["op"] == "set": + if "old_name" in operation and operation["old_name"] in dbs: + dbs[operation["name"]] = dbs[operation["old_name"]] + dbs.pop(operation["old_name"]) + if "owner" in operation: + dbs[operation["name"]] = operation["owner"] + elif operation["op"] == "del": + dbs.pop(operation["name"]) + else: + raise ValueError("Invalid op") + + +def handle_role(dbs, roles, operation): + if operation["op"] == "set": + if "old_name" in operation and operation["old_name"] in roles: + roles[operation["name"]] = roles[operation["old_name"]] + roles.pop(operation["old_name"]) + for db, owner in dbs.items(): + if owner == operation["old_name"]: + dbs[db] = operation["name"] + if "password" in operation: + roles[operation["name"]] = operation["password"] + elif operation["op"] == "del": + if "old_name" in operation: + roles.pop(operation["old_name"]) + roles.pop(operation["name"]) + else: + raise ValueError("Invalid op") + + +fail = False + + +def ddl_forward_handler(request: Request, dbs: Dict[str, str], roles: Dict[str, str]) -> Response: + log.info(f"Received request with data {request.get_data(as_text=True)}") + if fail: + log.info("FAILING") + return Response(status=500, response="Failed just cuz") + if request.json is None: + log.info("Received invalid JSON") + return Response(status=400) + json = request.json + # Handle roles first + if "roles" in json: + for operation in json["roles"]: + handle_role(dbs, roles, operation) + if "dbs" in json: + for operation in json["dbs"]: + handle_db(dbs, roles, operation) + return Response(status=200) + + +class DdlForwardingContext: + def __init__(self, httpserver: HTTPServer, vanilla_pg: VanillaPostgres, host: str, port: int): + self.server = httpserver + self.pg = vanilla_pg + self.host = host + self.port = port + self.dbs: Dict[str, str] = {} + self.roles: Dict[str, str] = {} + endpoint = "/management/api/v2/roles_and_databases" + ddl_url = f"http://{host}:{port}{endpoint}" + self.pg.configure( + [ + f"neon.console_url={ddl_url}", + "shared_preload_libraries = 'neon'", + ] + ) + 
log.info(f"Listening on {ddl_url}") + self.server.expect_request(endpoint, method="PATCH").respond_with_handler( + lambda request: ddl_forward_handler(request, self.dbs, self.roles) + ) + + def __enter__(self): + self.pg.start() + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): + self.pg.stop() + + def send(self, query: str) -> List[Tuple[Any, ...]]: + return self.pg.safe_psql(query) + + def wait(self, timeout=3): + self.server.wait(timeout=timeout) + + def send_and_wait(self, query: str, timeout=3) -> List[Tuple[Any, ...]]: + res = self.send(query) + self.wait(timeout=timeout) + return res + + +@pytest.fixture(scope="function") +def ddl( + httpserver: HTTPServer, vanilla_pg: VanillaPostgres, httpserver_listen_address: tuple[str, int] +): + (host, port) = httpserver_listen_address + with DdlForwardingContext(httpserver, vanilla_pg, host, port) as ddl: + yield ddl + + +def test_ddl_forwarding(ddl: DdlForwardingContext): + curr_user = ddl.send("SELECT current_user")[0][0] + log.info(f"Current user is {curr_user}") + ddl.send_and_wait("CREATE DATABASE bork") + assert ddl.dbs == {"bork": curr_user} + ddl.send_and_wait("CREATE ROLE volk WITH PASSWORD 'nu_zayats'") + ddl.send_and_wait("ALTER DATABASE bork RENAME TO nu_pogodi") + assert ddl.dbs == {"nu_pogodi": curr_user} + ddl.send_and_wait("ALTER DATABASE nu_pogodi OWNER TO volk") + assert ddl.dbs == {"nu_pogodi": "volk"} + ddl.send_and_wait("DROP DATABASE nu_pogodi") + assert ddl.dbs == {} + ddl.send_and_wait("DROP ROLE volk") + assert ddl.roles == {} + + ddl.send_and_wait("CREATE ROLE tarzan WITH PASSWORD 'of_the_apes'") + assert ddl.roles == {"tarzan": "of_the_apes"} + ddl.send_and_wait("DROP ROLE tarzan") + assert ddl.roles == {} + ddl.send_and_wait("CREATE ROLE tarzan WITH PASSWORD 'of_the_apes'") + assert ddl.roles == {"tarzan": "of_the_apes"} + ddl.send_and_wait("ALTER ROLE tarzan WITH PASSWORD 'jungle_man'") + assert ddl.roles == {"tarzan": "jungle_man"} + ddl.send_and_wait("ALTER ROLE tarzan RENAME TO mowgli") + assert ddl.roles == {"mowgli": "jungle_man"} + ddl.send_and_wait("DROP ROLE mowgli") + assert ddl.roles == {} + + conn = ddl.pg.connect() + cur = conn.cursor() + + cur.execute("BEGIN") + cur.execute("CREATE ROLE bork WITH PASSWORD 'cork'") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {"bork": "cork"} + cur.execute("BEGIN") + cur.execute("CREATE ROLE stork WITH PASSWORD 'pork'") + cur.execute("ABORT") + ddl.wait() + assert ("stork", "pork") not in ddl.roles.items() + cur.execute("BEGIN") + cur.execute("ALTER ROLE bork WITH PASSWORD 'pork'") + cur.execute("ALTER ROLE bork RENAME TO stork") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {"stork": "pork"} + cur.execute("BEGIN") + cur.execute("CREATE ROLE dork WITH PASSWORD 'york'") + cur.execute("SAVEPOINT point") + cur.execute("ALTER ROLE dork WITH PASSWORD 'zork'") + cur.execute("ALTER ROLE dork RENAME TO fork") + cur.execute("ROLLBACK TO SAVEPOINT point") + cur.execute("ALTER ROLE dork WITH PASSWORD 'fork'") + cur.execute("ALTER ROLE dork RENAME TO zork") + cur.execute("RELEASE SAVEPOINT point") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {"stork": "pork", "zork": "fork"} + + cur.execute("DROP ROLE stork") + cur.execute("DROP ROLE zork") + ddl.wait() + assert ddl.roles == {} + + cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'") + cur.execute("CREATE ROLE stork WITH PASSWORD 'cork'") + cur.execute("BEGIN") + cur.execute("DROP ROLE bork") 
+ cur.execute("ALTER ROLE stork RENAME TO bork") + cur.execute("COMMIT") + ddl.wait() + assert ddl.roles == {"bork": "cork"} + + cur.execute("DROP ROLE bork") + ddl.wait() + assert ddl.roles == {} + + cur.execute("CREATE ROLE bork WITH PASSWORD 'dork'") + cur.execute("CREATE DATABASE stork WITH OWNER=bork") + cur.execute("ALTER ROLE bork RENAME TO cork") + ddl.wait() + assert ddl.dbs == {"stork": "cork"} + + with pytest.raises(psycopg2.InternalError): + global fail + fail = True + cur.execute("CREATE DATABASE failure WITH OWNER=cork") + ddl.wait() + + conn.close() From 37ecebe45bfc0572c66b4e9f1fa27b2699f28812 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 25 May 2023 11:37:12 +0200 Subject: [PATCH 05/59] mgr::get_tenant: distinguished error type (#4300) Before this patch, it would use error type `TenantStateError` which has many more error variants than can actually happen with `mgr::get_tenant`. Along the way, I also introduced `SetNewTenantConfigError` because it uses `mgr::get_tenant` and also can only fail in much fewer ways than `TenantStateError` suggests. The new `page_service.rs`'s `GetActiveTimelineError` and `GetActiveTenantError` types were necessary to avoid an `Other` variant on the `GetTenantError`. This patch is a by-product of reading code that subscribes to `Tenant::state` changes. Can't really connect it to any given project. --- pageserver/src/http/routes.rs | 36 ++++++++++++++++++++++++-- pageserver/src/page_service.rs | 43 ++++++++++++++++++++++++++----- pageserver/src/tenant.rs | 47 +++++++++++++++++++++++++++++----- pageserver/src/tenant/mgr.rs | 29 ++++++++++++++++----- 4 files changed, 133 insertions(+), 22 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 83d478ac3d52..c530952aaf0f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -24,7 +24,9 @@ use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; -use crate::tenant::mgr::{TenantMapInsertError, TenantStateError}; +use crate::tenant::mgr::{ + GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError, +}; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline}; @@ -140,6 +142,36 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(tse: GetTenantError) -> ApiError { + match tse { + GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)), + e @ GetTenantError::NotActive(_) => { + // Why is this not `ApiError::NotFound`? + // Because we must be careful to never return 404 for a tenant if it does + // in fact exist locally. If we did, the caller could draw the conclusion + // that it can attach the tenant to another PS and we'd be in split-brain. + // + // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls). 
+ ApiError::InternalServerError(anyhow::Error::new(e)) + } + } + } +} + +impl From for ApiError { + fn from(e: SetNewTenantConfigError) -> ApiError { + match e { + SetNewTenantConfigError::GetTenant(tid) => { + ApiError::NotFound(anyhow!("tenant {}", tid)) + } + e @ SetNewTenantConfigError::Persist(_) => { + ApiError::InternalServerError(anyhow::Error::new(e)) + } + } + } +} + impl From for ApiError { fn from(value: crate::tenant::DeleteTimelineError) -> Self { use crate::tenant::DeleteTimelineError::*; @@ -159,7 +191,7 @@ impl From for ApiError { match value { // Report Precondition failed so client can distinguish between // "tenant is missing" case from "timeline is missing" - Tenant(TenantStateError::NotFound(..)) => { + Tenant(GetTenantError::NotFound(..)) => { ApiError::PreconditionFailed("Requested tenant is missing") } Tenant(t) => ApiError::from(t), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bd3ece2dfc9b..fd442783f905 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -50,7 +50,9 @@ use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant; use crate::tenant::mgr; +use crate::tenant::mgr::GetTenantError; use crate::tenant::{Tenant, Timeline}; use crate::trace::Tracer; @@ -1131,7 +1133,9 @@ enum GetActiveTenantError { wait_time: Duration, }, #[error(transparent)] - Other(#[from] anyhow::Error), + NotFound(GetTenantError), + #[error(transparent)] + WaitTenantActive(tenant::WaitToBecomeActiveError), } impl From for QueryError { @@ -1140,7 +1144,8 @@ impl From for QueryError { GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), ), - GetActiveTenantError::Other(e) => QueryError::Other(e), + GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)), + GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)), } } } @@ -1156,13 +1161,16 @@ async fn get_active_tenant_with_timeout( ) -> Result, GetActiveTenantError> { let tenant = match mgr::get_tenant(tenant_id, false).await { Ok(tenant) => tenant, - Err(e) => return Err(GetActiveTenantError::Other(e.into())), + Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)), + Err(GetTenantError::NotActive(_)) => { + unreachable!("we're calling get_tenant with active=false") + } }; let wait_time = Duration::from_secs(30); match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await { Ok(Ok(())) => Ok(tenant), // no .context(), the error message is good enough and some tests depend on it - Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)), + Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)), Err(_) => { let latest_state = tenant.current_state(); if latest_state == TenantState::Active { @@ -1177,13 +1185,34 @@ async fn get_active_tenant_with_timeout( } } +#[derive(Debug, thiserror::Error)] +enum GetActiveTimelineError { + #[error(transparent)] + Tenant(GetActiveTenantError), + #[error(transparent)] + Timeline(anyhow::Error), +} + +impl From for QueryError { + fn from(e: GetActiveTimelineError) -> Self { + match e { + GetActiveTimelineError::Tenant(e) => e.into(), + GetActiveTimelineError::Timeline(e) => QueryError::Other(e), + } + } +} + /// Shorthand for getting a reference to a Timeline of an Active tenant. 
async fn get_active_tenant_timeline( tenant_id: TenantId, timeline_id: TimelineId, ctx: &RequestContext, -) -> Result, GetActiveTenantError> { - let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?; - let timeline = tenant.get_timeline(timeline_id, true)?; +) -> Result, GetActiveTimelineError> { + let tenant = get_active_tenant_with_timeout(tenant_id, ctx) + .await + .map_err(GetActiveTimelineError::Tenant)?; + let timeline = tenant + .get_timeline(timeline_id, true) + .map_err(GetActiveTimelineError::Timeline)?; Ok(timeline) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e75d9f0d269a..6806b2c99da6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -450,6 +450,34 @@ struct RemoteStartupData { remote_metadata: TimelineMetadata, } +#[derive(Debug, thiserror::Error)] +pub(crate) enum WaitToBecomeActiveError { + WillNotBecomeActive { + tenant_id: TenantId, + state: TenantState, + }, + TenantDropped { + tenant_id: TenantId, + }, +} + +impl std::fmt::Display for WaitToBecomeActiveError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => { + write!( + f, + "Tenant {} will not become active. Current state: {:?}", + tenant_id, state + ) + } + WaitToBecomeActiveError::TenantDropped { tenant_id } => { + write!(f, "Tenant {tenant_id} will not become active (dropped)") + } + } + } +} + impl Tenant { /// Yet another helper for timeline initialization. /// Contains the common part of `load_local_timeline` and `load_remote_timeline`. @@ -1753,25 +1781,30 @@ impl Tenant { self.state.subscribe() } - pub async fn wait_to_become_active(&self) -> anyhow::Result<()> { + pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> { let mut receiver = self.state.subscribe(); loop { let current_state = receiver.borrow_and_update().clone(); match current_state { TenantState::Loading | TenantState::Attaching => { // in these states, there's a chance that we can reach ::Active - receiver.changed().await?; + receiver.changed().await.map_err( + |_e: tokio::sync::watch::error::RecvError| { + WaitToBecomeActiveError::TenantDropped { + tenant_id: self.tenant_id, + } + }, + )?; } TenantState::Active { .. } => { return Ok(()); } TenantState::Broken { .. } | TenantState::Stopping => { // There's no chance the tenant can transition back into ::Active - anyhow::bail!( - "Tenant {} will not become active. 
Current state: {:?}", - self.tenant_id, - ¤t_state, - ); + return Err(WaitToBecomeActiveError::WillNotBecomeActive { + tenant_id: self.tenant_id, + state: current_state, + }); } } } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 53d69a15dcdf..fa9769b0f89a 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -300,11 +300,19 @@ pub async fn create_tenant( }).await } +#[derive(Debug, thiserror::Error)] +pub enum SetNewTenantConfigError { + #[error(transparent)] + GetTenant(#[from] GetTenantError), + #[error(transparent)] + Persist(anyhow::Error), +} + pub async fn set_new_tenant_config( conf: &'static PageServerConf, new_tenant_conf: TenantConfOpt, tenant_id: TenantId, -) -> Result<(), TenantStateError> { +) -> Result<(), SetNewTenantConfigError> { info!("configuring tenant {tenant_id}"); let tenant = get_tenant(tenant_id, true).await?; @@ -314,23 +322,32 @@ pub async fn set_new_tenant_config( &tenant_config_path, new_tenant_conf, false, - )?; + ) + .map_err(SetNewTenantConfigError::Persist)?; tenant.set_new_tenant_config(new_tenant_conf); Ok(()) } +#[derive(Debug, thiserror::Error)] +pub enum GetTenantError { + #[error("Tenant {0} not found")] + NotFound(TenantId), + #[error("Tenant {0} is not active")] + NotActive(TenantId), +} + /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. pub async fn get_tenant( tenant_id: TenantId, active_only: bool, -) -> Result, TenantStateError> { +) -> Result, GetTenantError> { let m = TENANTS.read().await; let tenant = m .get(&tenant_id) - .ok_or(TenantStateError::NotFound(tenant_id))?; + .ok_or(GetTenantError::NotFound(tenant_id))?; if active_only && !tenant.is_active() { - Err(TenantStateError::NotActive(tenant_id)) + Err(GetTenantError::NotActive(tenant_id)) } else { Ok(Arc::clone(tenant)) } @@ -339,7 +356,7 @@ pub async fn get_tenant( #[derive(Debug, thiserror::Error)] pub enum DeleteTimelineError { #[error("Tenant {0}")] - Tenant(#[from] TenantStateError), + Tenant(#[from] GetTenantError), #[error("Timeline {0}")] Timeline(#[from] crate::tenant::DeleteTimelineError), From 83ba02b4312cda0cabca67b9dbb4aeda50aff4a8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 25 May 2023 11:38:04 +0200 Subject: [PATCH 06/59] tenant_status: don't InternalServerError if tenant not found (#4337) Note this also changes the status code to the (correct) 404. Not sure if that's relevant to Console. Context: https://neondb.slack.com/archives/C04PSBP2SAF/p1684746238831449?thread_ts=1684742106.169859&cid=C04PSBP2SAF Atop #4300 because it cleans up the mgr::get_tenant() error type and I want eyes on that PR. 
--- pageserver/src/http/routes.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index c530952aaf0f..1ca3fdb54a48 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -543,7 +543,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro } let state = tenant.current_state(); - Ok(TenantInfo { + Result::<_, ApiError>::Ok(TenantInfo { id: tenant_id, state: state.clone(), current_physical_size: Some(current_physical_size), @@ -551,8 +551,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro }) } .instrument(info_span!("tenant_status_handler", tenant = %tenant_id)) - .await - .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, tenant_info) } From e5617021a7b08000733c122fa487c22fde85c026 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 25 May 2023 15:47:42 +0200 Subject: [PATCH 07/59] refactor: eliminate global storage_broker client state (#4318) (This is prep work to make `Timeline::activate` infallible.) This patch removes the global storage_broker client instance from the pageserver codebase. Instead, pageserver startup instantiates it and passes it down to the `Timeline::activate` function, which in turn passes it to the WalReceiver, which is the entity that actually uses it. Patch series: - #4316 - #4317 - #4318 - #4319 --- pageserver/src/bin/pageserver.rs | 26 +++++++++++++---- pageserver/src/broker_client.rs | 48 ------------------------------- pageserver/src/http/routes.rs | 32 +++++++++++++++++---- pageserver/src/lib.rs | 1 - pageserver/src/page_service.rs | 23 +++++++++++++-- pageserver/src/tenant.rs | 21 ++++++++++---- pageserver/src/tenant/mgr.rs | 18 ++++++++---- pageserver/src/tenant/timeline.rs | 16 ++++------- storage_broker/src/lib.rs | 3 ++ 9 files changed, 104 insertions(+), 84 deletions(-) delete mode 100644 pageserver/src/broker_client.rs diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d843b01ed736..d9d3d9d66244 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Command}; use fail::FailScenario; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; +use pageserver::task_mgr::WALRECEIVER_RUNTIME; use remote_storage::GenericRemoteStorage; use tracing::*; @@ -18,9 +19,7 @@ use pageserver::{ context::{DownloadBehavior, RequestContext}, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, - task_mgr::{ - BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME, - }, + task_mgr::{BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME}, tenant::mgr, virtual_file, }; @@ -276,7 +275,18 @@ fn start_pageserver( let pageserver_listener = tcp_listener::bind(pg_addr)?; // Launch broker client - WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; + // The storage_broker::connect call needs to happen inside a tokio runtime thread. + let broker_client = WALRECEIVER_RUNTIME + .block_on(async { + // Note: we do not attempt connecting here (but validate endpoints sanity). 
+ storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval) + }) + .with_context(|| { + format!( + "create broker client for uri={:?} keepalive_interval={:?}", + &conf.broker_endpoint, conf.broker_keepalive_interval, + ) + })?; // Initialize authentication for incoming connections let http_auth; @@ -326,7 +336,11 @@ fn start_pageserver( let remote_storage = create_remote_storage_client(conf)?; // Scan the local 'tenants/' directory and start loading the tenants - BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?; + BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( + conf, + broker_client.clone(), + remote_storage.clone(), + ))?; // shared state between the disk-usage backed eviction background task and the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint @@ -351,6 +365,7 @@ fn start_pageserver( conf, launch_ts, http_auth, + broker_client.clone(), remote_storage, disk_usage_eviction_state, )? @@ -427,6 +442,7 @@ fn start_pageserver( async move { page_service::libpq_listener_main( conf, + broker_client, pg_auth, pageserver_listener, conf.pg_auth_type, diff --git a/pageserver/src/broker_client.rs b/pageserver/src/broker_client.rs deleted file mode 100644 index 6c92967ca34b..000000000000 --- a/pageserver/src/broker_client.rs +++ /dev/null @@ -1,48 +0,0 @@ -//! The broker client instance of the pageserver, created during pageserver startup. -//! Used by each timelines' [`walreceiver`]. - -use crate::config::PageServerConf; - -use anyhow::Context; -use once_cell::sync::OnceCell; -use storage_broker::BrokerClientChannel; -use tracing::*; - -static BROKER_CLIENT: OnceCell = OnceCell::new(); - -/// -/// Initialize the broker client. This must be called once at page server startup. -/// -pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { - let broker_endpoint = conf.broker_endpoint.clone(); - - // Note: we do not attempt connecting here (but validate endpoints sanity). 
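What the removal of broker_client.rs amounts to: the broker client stops being a process-global that is initialized once and fetched from deep inside the code, and becomes a plain value that startup creates and hands down the call chain (start_pageserver -> init_tenant_mgr / libpq_listener_main -> Tenant -> Timeline::activate -> WalReceiver). A minimal sketch of the before/after shape, using hypothetical types and std's OnceLock (Rust 1.70+) in place of the once_cell crate; this illustrates the pattern only, it is not the actual pageserver code:

    use std::sync::OnceLock;

    #[derive(Clone, Debug)]
    struct BrokerClient {
        endpoint: String,
    }

    // Before: a process-global, set once at startup and read wherever it is needed.
    static BROKER_CLIENT: OnceLock<BrokerClient> = OnceLock::new();

    fn activate_timeline_via_global() {
        let client = BROKER_CLIENT.get().expect("broker client not initialized");
        println!("walreceiver uses {}", client.endpoint);
    }

    // After: the client is ordinary data; every caller that needs it receives a clone.
    fn activate_timeline(broker_client: BrokerClient) {
        println!("walreceiver uses {}", broker_client.endpoint);
    }

    fn main() {
        let client = BrokerClient { endpoint: "http://broker:50051".to_string() };

        // Old style: a hidden dependency on global initialization order.
        BROKER_CLIENT.set(client.clone()).unwrap();
        activate_timeline_via_global();

        // New style: the dependency is visible in the signature.
        activate_timeline(client);
    }

The cost is extra plumbing (the mgr.rs and tenant.rs hunks below add a broker_client parameter to every constructor in the chain); the benefit is that there is no initialization-order footgun and no special casing for tests that never set the global.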
- let broker_client = - storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( - format!( - "Failed to create broker client to {}", - &conf.broker_endpoint - ), - )?; - - if BROKER_CLIENT.set(broker_client).is_err() { - panic!("broker already initialized"); - } - - info!( - "Initialized broker client with endpoints: {}", - broker_endpoint - ); - Ok(()) -} - -/// -/// Get a handle to the broker client -/// -pub fn get_broker_client() -> &'static BrokerClientChannel { - BROKER_CLIENT.get().expect("broker client not initialized") -} - -pub fn is_broker_client_initialized() -> bool { - BROKER_CLIENT.get().is_some() -} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1ca3fdb54a48..25e0d88e7062 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -7,6 +7,7 @@ use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::{DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest}; use remote_storage::GenericRemoteStorage; +use storage_broker::BrokerClientChannel; use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -53,6 +54,7 @@ struct State { auth: Option>, allowlist_routes: Vec, remote_storage: Option, + broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, } @@ -61,6 +63,7 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_storage: Option, + broker_client: storage_broker::BrokerClientChannel, disk_usage_eviction_state: Arc, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] @@ -72,6 +75,7 @@ impl State { auth, allowlist_routes, remote_storage, + broker_client, disk_usage_eviction_state, }) } @@ -303,6 +307,8 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result, A let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); let state = get_state(&request); - mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx) - .instrument(info_span!("load", tenant = %tenant_id)) - .await?; + mgr::load_tenant( + state.conf, + tenant_id, + state.broker_client.clone(), + state.remote_storage.clone(), + &ctx, + ) + .instrument(info_span!("load", tenant = %tenant_id)) + .await?; json_response(StatusCode::ACCEPTED, ()) } @@ -775,6 +789,7 @@ async fn tenant_create_handler(mut request: Request) -> Result>, + broker_client: BrokerClientChannel, remote_storage: Option, disk_usage_eviction_state: Arc, ) -> anyhow::Result> { @@ -1176,8 +1192,14 @@ pub fn make_router( Ok(router .data(Arc::new( - State::new(conf, auth, remote_storage, disk_usage_eviction_state) - .context("Failed to initialize router state")?, + State::new( + conf, + auth, + remote_storage, + broker_client, + disk_usage_eviction_state, + ) + .context("Failed to initialize router state")?, )) .get("/v1/status", |r| RequestSpan(status_handler).handle(r)) .put( diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 04863886cba7..4349f0e2ea8b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,6 +1,5 @@ mod auth; pub mod basebackup; -pub mod broker_client; pub mod config; pub mod consumption_metrics; pub mod context; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index fd442783f905..9e9285a0092d 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -174,6 +174,7 @@ async fn read_tar_eof(mut reader: 
(impl AsyncRead + Unpin)) -> anyhow::Result<() /// pub async fn libpq_listener_main( conf: &'static PageServerConf, + broker_client: storage_broker::BrokerClientChannel, auth: Option>, listener: TcpListener, auth_type: AuthType, @@ -215,7 +216,14 @@ pub async fn libpq_listener_main( None, "serving compute connection task", false, - page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx), + page_service_conn_main( + conf, + broker_client.clone(), + local_auth, + socket, + auth_type, + connection_ctx, + ), ); } Err(err) => { @@ -232,6 +240,7 @@ pub async fn libpq_listener_main( async fn page_service_conn_main( conf: &'static PageServerConf, + broker_client: storage_broker::BrokerClientChannel, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, @@ -268,7 +277,7 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx); + let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend @@ -326,6 +335,7 @@ impl PageRequestMetrics { struct PageServerHandler { _conf: &'static PageServerConf, + broker_client: storage_broker::BrokerClientChannel, auth: Option>, claims: Option, @@ -339,11 +349,13 @@ struct PageServerHandler { impl PageServerHandler { pub fn new( conf: &'static PageServerConf, + broker_client: storage_broker::BrokerClientChannel, auth: Option>, connection_ctx: RequestContext, ) -> Self { PageServerHandler { _conf: conf, + broker_client, auth, claims: None, connection_ctx, @@ -496,7 +508,12 @@ impl PageServerHandler { let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb))); timeline - .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx) + .import_basebackup_from_tar( + &mut copyin_reader, + base_lsn, + self.broker_client.clone(), + &ctx, + ) .await?; // Read the end of the tar archive. diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6806b2c99da6..e247fbf42308 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -16,6 +16,7 @@ use futures::FutureExt; use pageserver_api::models::TimelineState; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; +use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio::task::JoinSet; use tracing::*; @@ -238,6 +239,7 @@ impl UninitializedTimeline<'_> { self, copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, + broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, ) -> anyhow::Result> { let raw_timeline = self.raw_timeline()?; @@ -264,7 +266,7 @@ impl UninitializedTimeline<'_> { // updated it for the layers that we created during the import. 
let mut timelines = self.owning_tenant.timelines.lock().unwrap(); let tl = self.initialize_with_lock(ctx, &mut timelines, false)?; - tl.activate(ctx)?; + tl.activate(broker_client, ctx)?; Ok(tl) } @@ -613,6 +615,7 @@ impl Tenant { pub(crate) fn spawn_attach( conf: &'static PageServerConf, tenant_id: TenantId, + broker_client: storage_broker::BrokerClientChannel, remote_storage: GenericRemoteStorage, ctx: &RequestContext, ) -> anyhow::Result> { @@ -644,7 +647,7 @@ impl Tenant { async move { let doit = async { tenant_clone.attach(&ctx).await?; - tenant_clone.activate(&ctx)?; + tenant_clone.activate(broker_client, &ctx)?; anyhow::Ok(()) }; match doit.await { @@ -882,6 +885,7 @@ impl Tenant { pub fn spawn_load( conf: &'static PageServerConf, tenant_id: TenantId, + broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, ctx: &RequestContext, ) -> Arc { @@ -918,7 +922,7 @@ impl Tenant { async move { let doit = async { tenant_clone.load(&ctx).await?; - tenant_clone.activate(&ctx)?; + tenant_clone.activate(broker_client, &ctx)?; anyhow::Ok(()) }; match doit.await { @@ -1262,6 +1266,7 @@ impl Tenant { ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, pg_version: u32, + broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, ) -> anyhow::Result>> { anyhow::ensure!( @@ -1328,7 +1333,7 @@ impl Tenant { } }; - loaded_timeline.activate(ctx).context("activate timeline")?; + loaded_timeline.activate(broker_client, ctx)?; if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { // Wait for the upload of the 'index_part.json` file to finish, so that when we return @@ -1633,7 +1638,11 @@ impl Tenant { } /// Changes tenant status to active, unless shutdown was already requested. - fn activate(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { + fn activate( + self: &Arc, + broker_client: BrokerClientChannel, + ctx: &RequestContext, + ) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_id(); let mut result = Ok(()); @@ -1673,7 +1682,7 @@ impl Tenant { for timeline in not_broken_timelines { match timeline - .activate(ctx) + .activate(broker_client.clone(), ctx) .context("timeline activation for activating tenant") { Ok(()) => { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index fa9769b0f89a..dbb9577bf0f3 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -58,9 +58,10 @@ static TENANTS: Lazy> = Lazy::new(|| RwLock::new(TenantsMap:: /// Initialize repositories with locally available timelines. /// Timelines that are only partially available locally (remote storage has more data than this pageserver) /// are scheduled for download and added to the tenant once download is completed. 
-#[instrument(skip(conf, remote_storage))] +#[instrument(skip_all)] pub async fn init_tenant_mgr( conf: &'static PageServerConf, + broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, ) -> anyhow::Result<()> { // Scan local filesystem for attached tenants @@ -116,6 +117,7 @@ pub async fn init_tenant_mgr( match schedule_local_tenant_processing( conf, &tenant_dir_path, + broker_client.clone(), remote_storage.clone(), &ctx, ) { @@ -150,6 +152,7 @@ pub async fn init_tenant_mgr( pub fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, + broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, ctx: &RequestContext, ) -> anyhow::Result> { @@ -186,7 +189,7 @@ pub fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); if let Some(remote_storage) = remote_storage { - match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) { + match Tenant::spawn_attach(conf, tenant_id, broker_client, remote_storage, ctx) { Ok(tenant) => tenant, Err(e) => { error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}"); @@ -204,7 +207,7 @@ pub fn schedule_local_tenant_processing( } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); // Start loading the tenant into memory. It will initially be in Loading state. - Tenant::spawn_load(conf, tenant_id, remote_storage, ctx) + Tenant::spawn_load(conf, tenant_id, broker_client, remote_storage, ctx) }; Ok(tenant) } @@ -275,6 +278,7 @@ pub async fn create_tenant( conf: &'static PageServerConf, tenant_conf: TenantConfOpt, tenant_id: TenantId, + broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, ctx: &RequestContext, ) -> Result, TenantMapInsertError> { @@ -287,7 +291,7 @@ pub async fn create_tenant( // See https://github.com/neondatabase/neon/issues/4233 let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?; + schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. 
// See https://github.com/neondatabase/neon/issues/4233 @@ -421,6 +425,7 @@ pub async fn detach_tenant( pub async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, + broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { @@ -432,7 +437,7 @@ pub async fn load_tenant( .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; } - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx) + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -489,6 +494,7 @@ pub async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, tenant_conf: TenantConfOpt, + broker_client: storage_broker::BrokerClientChannel, remote_storage: GenericRemoteStorage, ctx: &RequestContext, ) -> Result<(), TenantMapInsertError> { @@ -504,7 +510,7 @@ pub async fn attach_tenant( .context("check for attach marker file existence")?; anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); - let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, Some(remote_storage), ctx)?; + let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3c951c11884a..9b449812ac26 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -31,7 +31,6 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; -use crate::broker_client::{get_broker_client, is_broker_client_initialized}; use crate::context::{DownloadBehavior, RequestContext}; use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::{ @@ -907,15 +906,12 @@ impl Timeline { Ok(()) } - pub fn activate(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { - if is_broker_client_initialized() { - self.launch_wal_receiver(ctx, get_broker_client().clone())?; - } else if cfg!(test) { - info!("not launching WAL receiver because broker client hasn't been initialized"); - } else { - anyhow::bail!("broker client not initialized"); - } - + pub fn activate( + self: &Arc, + broker_client: BrokerClientChannel, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.launch_wal_receiver(ctx, broker_client)?; self.set_state(TimelineState::Active); self.launch_eviction_task(); Ok(()) diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 8441aaf625f7..4bc561449dc4 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -40,6 +40,9 @@ pub type BrokerClientChannel = BrokerServiceClient; // Create connection object configured to run TLS if schema starts with https:// // and plain text otherwise. Connection is lazy, only endpoint sanity is // validated here. +// +// NB: this function is not async, but still must be run on a tokio runtime thread +// because that's a requirement of tonic_endpoint.connect_lazy()'s Channel::new call. 
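That requirement is also why the new startup code in pageserver.rs above wraps the call in WALRECEIVER_RUNTIME.block_on(async { ... }) even though nothing is awaited: block_on is only there so that the synchronous, lazy channel construction happens inside a tokio runtime. A minimal sketch of that shape, with a hypothetical connect_lazy standing in for storage_broker::connect and assuming only the tokio crate:

    use std::time::Duration;

    #[derive(Clone, Debug)]
    struct BrokerClient {
        endpoint: String,
    }

    // Stand-in for storage_broker::connect: synchronous, but must run inside a tokio runtime.
    fn connect_lazy(endpoint: &str, _keepalive: Duration) -> Result<BrokerClient, String> {
        Ok(BrokerClient { endpoint: endpoint.to_string() })
    }

    fn main() -> Result<(), String> {
        let runtime = tokio::runtime::Builder::new_multi_thread()
            .enable_all()
            .build()
            .map_err(|e| e.to_string())?;

        // Nothing is awaited in this block; block_on only provides the runtime context
        // that the lazy connection setup requires.
        let broker_client = runtime
            .block_on(async { connect_lazy("http://broker:50051", Duration::from_secs(5)) })?;

        println!("created {broker_client:?}");
        Ok(())
    }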
pub fn connect(endpoint: U, keepalive_interval: Duration) -> anyhow::Result where U: std::convert::TryInto, From ab2757f64aff973d1e408ff88eb75151d95e9195 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 25 May 2023 10:21:15 -0400 Subject: [PATCH 08/59] bump dependencies version (#4336) proceeding https://github.com/neondatabase/neon/pull/4237, this PR bumps AWS dependencies along with all other dependencies to the latest compatible semver. Signed-off-by: Alex Chi --- Cargo.lock | 614 +++++++++++++++++--------------------- Cargo.toml | 2 +- workspace_hack/Cargo.toml | 6 +- 3 files changed, 278 insertions(+), 344 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6501d9557d1e..d390df94e02e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,17 +17,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" -[[package]] -name = "ahash" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" -dependencies = [ - "getrandom", - "once_cell", - "version_check", -] - [[package]] name = "ahash" version = "0.8.3" @@ -41,9 +30,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "0.7.20" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04" dependencies = [ "memchr", ] @@ -65,9 +54,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.3.0" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e579a7752471abc2a8268df8b20005e3eadd975f585398f17efcfd8d4927371" +checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" dependencies = [ "anstyle", "anstyle-parse", @@ -104,9 +93,9 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bcd8291a340dd8ac70e18878bc4501dd7b4ff970cfa21c207d36ece51ea88fd" +checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" dependencies = [ "anstyle", "windows-sys 0.48.0", @@ -114,9 +103,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.70" +version = "1.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4" +checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" dependencies = [ "backtrace", ] @@ -188,7 +177,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -199,7 +188,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -230,9 +219,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "aws-config" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc00553f5f3c06ffd4510a9d576f92143618706c45ea6ff81e84ad9be9588abd" +checksum = "bcdcf0d683fe9c23d32cf5b53c9918ea0a500375a9fb20109802552658e576c9" dependencies = [ 
"aws-credential-types", "aws-http", @@ -256,9 +245,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cb57ac6088805821f78d282c0ba8aec809f11cbee10dda19a97b03ab040ccc2" +checksum = "1fcdb2f7acbc076ff5ad05e7864bdb191ca70a6fd07668dc3a1a8bcd051de5ae" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -270,9 +259,9 @@ dependencies = [ [[package]] name = "aws-endpoint" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c5f6f84a4f46f95a9bb71d9300b73cd67eb868bc43ae84f66ad34752299f4ac" +checksum = "8cce1c41a6cfaa726adee9ebb9a56fcd2bbfd8be49fd8a04c5e20fd968330b04" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -284,9 +273,9 @@ dependencies = [ [[package]] name = "aws-http" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a754683c322f7dc5167484266489fdebdcd04d26e53c162cad1f3f949f2c5671" +checksum = "aadbc44e7a8f3e71c8b374e03ecd972869eb91dd2bc89ed018954a52ba84bc44" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -303,9 +292,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "0.25.1" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "392b9811ca489747ac84349790e49deaa1f16631949e7dd4156000251c260eae" +checksum = "37c77060408d653d3efa6ea7b66c1389bc35a0342352984c8bf8bcb814a8fc27" dependencies = [ "aws-credential-types", "aws-endpoint", @@ -336,9 +325,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "0.27.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d0fbe3c2c342bc8dfea4bb43937405a8ec06f99140a0dcb9c7b59e54dfa93a1" +checksum = "265fac131fbfc188e5c3d96652ea90ecc676a934e3174eaaee523c6cec040b3b" dependencies = [ "aws-credential-types", "aws-endpoint", @@ -362,9 +351,9 @@ dependencies = [ [[package]] name = "aws-sig-auth" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84dc92a63ede3c2cbe43529cb87ffa58763520c96c6a46ca1ced80417afba845" +checksum = "3b94acb10af0c879ecd5c7bdf51cda6679a0a4f4643ce630905a77673bfa3c61" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -377,9 +366,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "392fefab9d6fcbd76d518eb3b1c040b84728ab50f58df0c3c53ada4bea9d327e" +checksum = "9d2ce6f507be68e968a33485ced670111d1cbad161ddbbab1e313c03d37d8f4c" dependencies = [ "aws-smithy-eventstream", "aws-smithy-http", @@ -398,9 +387,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae23b9fe7a07d0919000116c4c5c0578303fbce6fc8d32efca1f7759d4c20faf" +checksum = "13bda3996044c202d75b91afeb11a9afae9db9a721c6a7a427410018e286b880" dependencies = [ "futures-util", "pin-project-lite", @@ -410,9 +399,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6367acbd6849b8c7c659e166955531274ae147bf83ab4312885991f6b6706cb" +checksum = "07ed8b96d95402f3f6b8b57eb4e0e45ee365f78b1a924faf20ff6e97abf1eae6" dependencies = [ "aws-smithy-http", 
"aws-smithy-types", @@ -431,9 +420,9 @@ dependencies = [ [[package]] name = "aws-smithy-client" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5230d25d244a51339273b8870f0f77874cd4449fb4f8f629b21188ae10cfc0ba" +checksum = "0a86aa6e21e86c4252ad6a0e3e74da9617295d8d6e374d552be7d3059c41cedd" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -444,7 +433,7 @@ dependencies = [ "http", "http-body", "hyper", - "hyper-rustls", + "hyper-rustls 0.23.2", "lazy_static", "pin-project-lite", "rustls 0.20.8", @@ -455,9 +444,9 @@ dependencies = [ [[package]] name = "aws-smithy-eventstream" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22d2a2bcc16e5c4d949ffd2b851da852b9bbed4bb364ed4ae371b42137ca06d9" +checksum = "460c8da5110835e3d9a717c61f5556b20d03c32a1dec57f8fc559b360f733bb8" dependencies = [ "aws-smithy-types", "bytes", @@ -466,9 +455,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b60e2133beb9fe6ffe0b70deca57aaeff0a35ad24a9c6fab2fd3b4f45b99fdb5" +checksum = "2b3b693869133551f135e1f2c77cb0b8277d9e3e17feaf2213f735857c4f0d28" dependencies = [ "aws-smithy-eventstream", "aws-smithy-types", @@ -489,9 +478,9 @@ dependencies = [ [[package]] name = "aws-smithy-http-tower" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a4d94f556c86a0dd916a5d7c39747157ea8cb909ca469703e20fee33e448b67" +checksum = "3ae4f6c5798a247fac98a867698197d9ac22643596dc3777f0c76b91917616b9" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -505,18 +494,18 @@ dependencies = [ [[package]] name = "aws-smithy-json" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ce3d6e6ebb00b2cce379f079ad5ec508f9bcc3a9510d9b9c1840ed1d6f8af39" +checksum = "23f9f42fbfa96d095194a632fbac19f60077748eba536eb0b9fecc28659807f8" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-query" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d58edfca32ef9bfbc1ca394599e17ea329cb52d6a07359827be74235b64b3298" +checksum = "98819eb0b04020a1c791903533b638534ae6c12e2aceda3e6e6fba015608d51d" dependencies = [ "aws-smithy-types", "urlencoding", @@ -524,9 +513,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58db46fc1f4f26be01ebdb821751b4e2482cd43aa2b64a0348fb89762defaffa" +checksum = "16a3d0bf4f324f4ef9793b86a1701d9700fbcdbd12a846da45eed104c634c6e8" dependencies = [ "base64-simd", "itoa", @@ -537,18 +526,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb557fe4995bd9ec87fb244bbb254666a971dc902a783e9da8b7711610e9664c" +checksum = "b1b9d12875731bd07e767be7baad95700c3137b56730ec9ddeedb52a5e5ca63b" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "0.55.2" +version = "0.55.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de0869598bfe46ec44ffe17e063ed33336e59df90356ca8ff0e8da6f7c1d994b" +checksum = "6dd209616cc8d7bfb82f87811a5c655dc97537f592689b18743bddf5dc5c4829" dependencies = 
[ "aws-credential-types", "aws-smithy-async", @@ -562,9 +551,9 @@ dependencies = [ [[package]] name = "axum" -version = "0.6.15" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b32c5ea3aabaf4deb5f5ced2d688ec0844c881c9e6c696a8b769a05fc691e62" +checksum = "f8175979259124331c1d7bf6586ee7e0da434155e4b2d48ec2c8386281d8df39" dependencies = [ "async-trait", "axum-core", @@ -634,9 +623,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5" [[package]] name = "base64" -version = "0.21.0" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" +checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105" [[package]] name = "base64-simd" @@ -670,13 +659,13 @@ dependencies = [ "lazycell", "log", "peeking_take_while", - "prettyplease 0.2.4", + "prettyplease 0.2.6", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", - "syn 2.0.15", + "syn 2.0.16", "which", ] @@ -697,9 +686,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09" +checksum = "a246e68bb43f6cd9db24bea052a53e40405417c5fb372e3d1a8a7f770a564ef5" dependencies = [ "memchr", "once_cell", @@ -709,9 +698,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.12.0" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" +checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" [[package]] name = "byteorder" @@ -780,9 +769,9 @@ dependencies = [ [[package]] name = "ciborium" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f" +checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" dependencies = [ "ciborium-io", "ciborium-ll", @@ -791,15 +780,15 @@ dependencies = [ [[package]] name = "ciborium-io" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369" +checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" [[package]] name = "ciborium-ll" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b" +checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" dependencies = [ "ciborium-io", "half", @@ -818,9 +807,9 @@ dependencies = [ [[package]] name = "clap" -version = "3.2.23" +version = "3.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5" +checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" dependencies = [ "bitflags", "clap_lex 0.2.4", @@ -830,9 +819,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.2.2" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b802d85aaf3a1cdb02b224ba472ebdea62014fccfcb269b95a4d76443b5ee5a" +checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc" dependencies = [ 
"clap_builder", "clap_derive", @@ -841,27 +830,27 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.2.2" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14a1a858f532119338887a4b8e1af9c60de8249cd7bafd68036a489e261e37b6" +checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990" dependencies = [ "anstream", "anstyle", "bitflags", - "clap_lex 0.4.1", + "clap_lex 0.5.0", "strsim", ] [[package]] name = "clap_derive" -version = "4.2.0" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9644cd56d6b87dbe899ef8b053e331c0637664e9e21a33dfcdc36093f5c5c4" +checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -875,9 +864,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.4.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a2dd5a6fe8c6e3502f568a6353e5273bbb15193ad9a89e457b9970798efbea1" +checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" [[package]] name = "close_fds" @@ -889,16 +878,6 @@ dependencies = [ "libc", ] -[[package]] -name = "codespan-reporting" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" -dependencies = [ - "termcolor", - "unicode-width", -] - [[package]] name = "colorchoice" version = "1.0.0" @@ -936,7 +915,7 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.2.2", + "clap 4.3.0", "compute_api", "futures", "hyper", @@ -998,7 +977,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.2.2", + "clap 4.3.0", "comfy-table", "compute_api", "git-version", @@ -1041,9 +1020,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "280a9f2d8b3a38871a3c8a46fb80db65e5e5ed97da80c4d08bf27fb63e35e181" +checksum = "3e4c1eaa2012c47becbbad2ab175484c2a84d1185b566fb2cc5b8707343dfe58" dependencies = [ "libc", ] @@ -1076,7 +1055,7 @@ dependencies = [ "atty", "cast", "ciborium", - "clap 3.2.23", + "clap 3.2.25", "criterion-plot", "itertools", "lazy_static", @@ -1186,55 +1165,11 @@ dependencies = [ "typenum", ] -[[package]] -name = "cxx" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93" -dependencies = [ - "cc", - "cxxbridge-flags", - "cxxbridge-macro", - "link-cplusplus", -] - -[[package]] -name = "cxx-build" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b" -dependencies = [ - "cc", - "codespan-reporting", - "once_cell", - "proc-macro2", - "quote", - "scratch", - "syn 2.0.15", -] - -[[package]] -name = "cxxbridge-flags" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb" - -[[package]] -name = "cxxbridge-macro" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5" -dependencies = [ - 
"proc-macro2", - "quote", - "syn 2.0.15", -] - [[package]] name = "darling" -version = "0.14.4" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" +checksum = "0558d22a7b463ed0241e993f76f09f30b126687447751a8638587b864e4b3944" dependencies = [ "darling_core", "darling_macro", @@ -1242,27 +1177,27 @@ dependencies = [ [[package]] name = "darling_core" -version = "0.14.4" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" +checksum = "ab8bfa2e259f8ee1ce5e97824a3c55ec4404a0d772ca7fa96bf19f0752a046eb" dependencies = [ "fnv", "ident_case", "proc-macro2", "quote", "strsim", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] name = "darling_macro" -version = "0.14.4" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" +checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -1280,9 +1215,9 @@ dependencies = [ [[package]] name = "data-encoding" -version = "2.3.3" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23d8666cb01533c39dde32bcbab8e227b4ed6679b2c925eba05feabea39508fb" +checksum = "c2e66c9d817f1720209181c316d28635c050fa304f9c79e47a520882661b7308" [[package]] name = "debugid" @@ -1310,9 +1245,9 @@ dependencies = [ [[package]] name = "digest" -version = "0.10.6" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", @@ -1321,13 +1256,13 @@ dependencies = [ [[package]] name = "displaydoc" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bf95dc3f046b9da4f2d51833c0d3547d8564ef6910f5c1ed130306a75b92886" +checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -1367,23 +1302,23 @@ dependencies = [ [[package]] name = "enumset" -version = "1.0.12" +version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19be8061a06ab6f3a6cf21106c873578bf01bd42ad15e0311a9c76161cb1c753" +checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb" dependencies = [ "enumset_derive", ] [[package]] name = "enumset_derive" -version = "0.6.1" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03e7b551eba279bf0fa88b83a46330168c1560a52a94f5126f892f0b364ab3e0" +checksum = "e08b6c6ab82d70f08844964ba10c7babb716de2ecaeab9be5717918a5177d3af" dependencies = [ "darling", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -1569,7 +1504,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -1667,9 +1602,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.18" +version = "0.3.19" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f8a914c2987b688368b5138aa05321db91f4090cf26118185672ad588bce21" +checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782" dependencies = [ "bytes", "fnv", @@ -1704,9 +1639,6 @@ name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" -dependencies = [ - "ahash 0.7.6", -] [[package]] name = "hashbrown" @@ -1714,16 +1646,16 @@ version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" dependencies = [ - "ahash 0.8.3", + "ahash", ] [[package]] name = "hashlink" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69fe1fcf8b4278d860ad0548329f892a3631fb63f82574df68275f34cdbe0ffa" +checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa" dependencies = [ - "hashbrown 0.12.3", + "hashbrown 0.13.2", ] [[package]] @@ -1892,6 +1824,19 @@ dependencies = [ "tokio-rustls 0.23.4", ] +[[package]] +name = "hyper-rustls" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" +dependencies = [ + "http", + "hyper", + "rustls 0.21.1", + "tokio", + "tokio-rustls 0.24.0", +] + [[package]] name = "hyper-timeout" version = "0.4.1" @@ -1933,12 +1878,11 @@ dependencies = [ [[package]] name = "iana-time-zone-haiku" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" dependencies = [ - "cxx", - "cxx-build", + "cc", ] [[package]] @@ -1999,9 +1943,9 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" dependencies = [ "hermit-abi 0.3.1", "libc", @@ -2022,7 +1966,7 @@ checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f" dependencies = [ "hermit-abi 0.3.1", "io-lifetimes", - "rustix 0.37.11", + "rustix 0.37.19", "windows-sys 0.48.0", ] @@ -2043,9 +1987,9 @@ checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" [[package]] name = "js-sys" -version = "0.3.61" +version = "0.3.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +checksum = "2f37a4a5928311ac501dee68b3c7613a1037d0edb30c8e5427bd832d55d1b790" dependencies = [ "wasm-bindgen", ] @@ -2056,7 +2000,7 @@ version = "8.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6971da4d9c3aa03c3d8f3ff0f4155b534aad021292003895a469716b2a230378" dependencies = [ - "base64 0.21.0", + "base64 0.21.1", "pem", "ring", "serde", @@ -2098,9 +2042,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.141" +version = "0.2.144" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5" +checksum = 
"2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1" [[package]] name = "libloading" @@ -2112,15 +2056,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "link-cplusplus" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" -dependencies = [ - "cc", -] - [[package]] name = "linux-raw-sys" version = "0.1.4" @@ -2129,9 +2064,9 @@ checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" [[package]] name = "linux-raw-sys" -version = "0.3.1" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59d8c75012853d2e872fb56bc8a2e53718e2cafe1a4c823143141c6d90c322f" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "lock_api" @@ -2316,9 +2251,9 @@ dependencies = [ [[package]] name = "notify" -version = "5.1.0" +version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58ea850aa68a06e48fdb069c0ec44d0d64c8dbffa49bf3b6f7f0a901fdea1ba9" +checksum = "729f63e1ca555a43fe3efa4f3efdf4801c479da85b432242a7b726f353c88486" dependencies = [ "bitflags", "crossbeam-channel", @@ -2329,7 +2264,7 @@ dependencies = [ "libc", "mio", "walkdir", - "windows-sys 0.42.0", + "windows-sys 0.45.0", ] [[package]] @@ -2435,7 +2370,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -2593,7 +2528,7 @@ version = "0.1.0" dependencies = [ "anyhow", "bytes", - "clap 4.2.2", + "clap 4.3.0", "git-version", "pageserver", "postgres_ffi", @@ -2612,7 +2547,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.2.2", + "clap 4.3.0", "close_fds", "const_format", "consumption_metrics", @@ -2768,22 +2703,22 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.0.12" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad29a609b6bcd67fee905812e544992d216af9d755757c05ed2d0e15a74c6ecc" +checksum = "c95a7476719eab1e366eaf73d0260af3021184f18177925b07f54b30089ceead" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.0.12" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" +checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -2800,9 +2735,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" [[package]] name = "plotters" @@ -2976,12 +2911,12 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" +checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -2992,9 +2927,9 @@ checksum = 
"dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.56" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" +checksum = "fa1fb82fc0c281dd9671101b66b771ebbe1eaf967b96ac8740dcba4b70005ca8" dependencies = [ "unicode-ident", ] @@ -3009,7 +2944,7 @@ dependencies = [ "byteorder", "hex", "lazy_static", - "rustix 0.36.12", + "rustix 0.36.14", ] [[package]] @@ -3093,7 +3028,7 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 4.2.2", + "clap 4.3.0", "consumption_metrics", "futures", "git-version", @@ -3131,7 +3066,7 @@ dependencies = [ "serde", "serde_json", "sha2", - "socket2 0.5.2", + "socket2 0.5.3", "sync_wrapper", "thiserror", "tls-listener", @@ -3154,9 +3089,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.26" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" +checksum = "8f4f29d145265ec1c483c7c654450edde0bfe043d3938d6972630663356d9500" dependencies = [ "proc-macro2", ] @@ -3245,13 +3180,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.7.3" +version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" +checksum = "d1a59b5d8e97dee33696bf13c5ba8ab85341c002922fba050069326b9c498974" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.7.2", ] [[package]] @@ -3260,7 +3195,7 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" dependencies = [ - "regex-syntax", + "regex-syntax 0.6.29", ] [[package]] @@ -3269,6 +3204,12 @@ version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" +[[package]] +name = "regex-syntax" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" + [[package]] name = "remote_storage" version = "0.1.0" @@ -3298,11 +3239,11 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.16" +version = "0.11.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27b71749df584b7f4cac2c426c127a7c785a5106cc98f7a8feb044115f0fa254" +checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55" dependencies = [ - "base64 0.21.0", + "base64 0.21.1", "bytes", "encoding_rs", "futures-core", @@ -3311,7 +3252,7 @@ dependencies = [ "http", "http-body", "hyper", - "hyper-rustls", + "hyper-rustls 0.24.0", "ipnet", "js-sys", "log", @@ -3320,13 +3261,13 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.20.8", + "rustls 0.21.1", "rustls-pemfile", "serde", "serde_json", "serde_urlencoded", "tokio", - "tokio-rustls 0.23.4", + "tokio-rustls 0.24.0", "tower-service", "url", "wasm-bindgen", @@ -3338,9 +3279,9 @@ dependencies = [ [[package]] name = "reqwest-middleware" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99c50db2c7ccd815f976473dd7d0bde296f8c3b77c383acf4fc021cdcf10852b" +checksum = "4531c89d50effe1fac90d095c8b133c20c5c714204feee0bfc3fd158e784209d" dependencies = [ "anyhow", 
"async-trait", @@ -3353,12 +3294,14 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.4.1" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a71d77945a1c5ae9604f0504901e77a1e2e71f2932b1cb8103078179ca62ff8" +checksum = "783e8130d2427ddd7897dd3f814d4a3aea31b05deb42a4fdf8c18258fe5aefd1" dependencies = [ + "anyhow", "async-trait", "getrandom", + "matchit", "opentelemetry", "reqwest", "reqwest-middleware", @@ -3432,9 +3375,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4a36c42d1873f9a77c53bde094f9664d9891bc604a45b4798fd2c389ed12e5b" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" [[package]] name = "rustc-hash" @@ -3462,9 +3405,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.12" +version = "0.36.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0af200a3324fa5bcd922e84e9b55a298ea9f431a489f01961acdebc6e908f25" +checksum = "14e4d67015953998ad0eb82887a0eb0129e18a7e2f3b7b0f6c422fddcd503d62" dependencies = [ "bitflags", "errno", @@ -3476,15 +3419,15 @@ dependencies = [ [[package]] name = "rustix" -version = "0.37.11" +version = "0.37.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85597d61f83914ddeba6a47b3b8ffe7365107221c2e557ed94426489fefb5f77" +checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" dependencies = [ "bitflags", "errno", "io-lifetimes", "libc", - "linux-raw-sys 0.3.1", + "linux-raw-sys 0.3.8", "windows-sys 0.48.0", ] @@ -3502,9 +3445,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.0" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07180898a28ed6a7f7ba2311594308f595e3dd2e3c3812fa0a80a47b45f17e5d" +checksum = "c911ba11bc8433e811ce56fde130ccf32f5127cab0e0194e9c68c5a5b671791e" dependencies = [ "log", "ring", @@ -3530,7 +3473,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b" dependencies = [ - "base64 0.21.0", + "base64 0.21.1", ] [[package]] @@ -3565,7 +3508,7 @@ dependencies = [ "byteorder", "bytes", "chrono", - "clap 4.2.2", + "clap 4.3.0", "const_format", "crc32c", "fs2", @@ -3639,12 +3582,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "scratch" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" - [[package]] name = "sct" version = "0.7.0" @@ -3657,9 +3594,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "2.8.2" +version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a332be01508d814fed64bf28f798a146d73792121129962fdf335bb3c49a4254" +checksum = "1fc758eb7bffce5b308734e9b0c1468893cae9ff70ebf13e7090be8dcbcc83a8" dependencies = [ "bitflags", "core-foundation", @@ -3670,9 +3607,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.8.0" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31c9bb296072e961fcbd8853511dd39c2d8be2deb1e17c6860b1d30732b323b4" +checksum = 
"f51d0c0d83bec45f16480d0ce0058397a69e48fcdc52d1dc8855fb68acbd31a7" dependencies = [ "core-foundation-sys", "libc", @@ -3770,22 +3707,22 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.160" +version = "1.0.163" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c" +checksum = "2113ab51b87a539ae008b5c6c02dc020ffa39afd2d83cffcb3f4eb2722cebec2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.160" +version = "1.0.163" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df" +checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -3801,9 +3738,9 @@ dependencies = [ [[package]] name = "serde_spanned" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0efd8caf556a6cebd3b285caf480045fcc1ac04f6bd786b09a6f11af30c4fcf4" +checksum = "93107647184f6027e3b7dcb2e11034cf95ffa1e3a682c67951963ac69c1c007d" dependencies = [ "serde", ] @@ -3822,9 +3759,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "2.3.2" +version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331bb8c3bf9b92457ab7abecf07078c13f7d270ba490103e84e8b014490cd0b0" +checksum = "07ff71d2c147a7b57362cead5e22f772cd52f6ab31cfcd9edcd7f6aeb2a0afbe" dependencies = [ "base64 0.13.1", "chrono", @@ -3838,14 +3775,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "2.3.2" +version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "859011bddcc11f289f07f467cc1fe01c7a941daa4d8f6c40d4d1c92eb6d9319c" +checksum = "881b6f881b17d13214e5d494c939ebab463d01264ce1811e9d4ac3a882e7695f" dependencies = [ "darling", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] @@ -3959,9 +3896,9 @@ dependencies = [ [[package]] name = "socket2" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d283f86695ae989d1e18440a943880967156325ba025f05049946bff47bcc2b" +checksum = "2538b18701741680e0322a2302176d3253a35388e2e62f172f64f4f16605f877" dependencies = [ "libc", "windows-sys 0.48.0", @@ -4001,7 +3938,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.2.2", + "clap 4.3.0", "const_format", "futures", "futures-core", @@ -4015,8 +3952,8 @@ dependencies = [ "prost", "tokio", "tokio-stream", - "tonic 0.9.1", - "tonic-build 0.9.1", + "tonic 0.9.2", + "tonic-build 0.9.2", "tracing", "utils", "workspace_hack", @@ -4059,9 +3996,9 @@ dependencies = [ [[package]] name = "subtle" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "svg_fmt" @@ -4082,9 +4019,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.15" +version = "2.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822" +checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01" dependencies = [ "proc-macro2", "quote", @@ -4138,7 +4075,7 @@ dependencies = [ "cfg-if", "fastrand", 
"redox_syscall 0.3.5", - "rustix 0.37.11", + "rustix 0.37.19", "windows-sys 0.45.0", ] @@ -4205,7 +4142,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -4220,9 +4157,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.20" +version = "0.3.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" +checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" dependencies = [ "itoa", "serde", @@ -4232,15 +4169,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" +checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" [[package]] name = "time-macros" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" +checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" dependencies = [ "time-core", ] @@ -4320,7 +4257,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.16", ] [[package]] @@ -4387,15 +4324,15 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls 0.21.0", + "rustls 0.21.1", "tokio", ] [[package]] name = "tokio-stream" -version = "0.1.12" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fb52b74f05dbf495a8fba459fdc331812b96aa086d9eb78101fa0d4569c3313" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" dependencies = [ "futures-core", "pin-project-lite", @@ -4430,9 +4367,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.7" +version = "0.7.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5427d89453009325de0d8f342c9490009f76e999cb7672d77e46267448f7e6b2" +checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" dependencies = [ "bytes", "futures-core", @@ -4444,9 +4381,9 @@ dependencies = [ [[package]] name = "toml" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b403acf6f2bb0859c93c7f0d967cb4a75a7ac552100f9322faf64dc047669b21" +checksum = "d6135d499e69981f9ff0ef2167955a5333c35e36f6937d382974566b3d5b94ec" dependencies = [ "serde", "serde_spanned", @@ -4456,18 +4393,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ab8ed2edee10b50132aed5f331333428b011c99402b5a534154ed15746f9622" +checksum = "5a76a9312f5ba4c2dec6b9161fdf25d87ad8a09256ccea5a556fef03c706a10f" dependencies = [ "serde", ] [[package]] name = "toml_edit" -version = "0.19.8" +version = "0.19.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239410c8609e8125456927e6707163a3b1fdb40561e4b803bc041f466ccfdc13" +checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739" dependencies = [ "indexmap", "serde", @@ -4510,14 +4447,14 @@ 
dependencies = [ [[package]] name = "tonic" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38bd8e87955eb13c1986671838177d6792cdc52af9bffced0d2c8a9a7f741ab3" +checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a" dependencies = [ "async-stream", "async-trait", "axum", - "base64 0.21.0", + "base64 0.21.1", "bytes", "futures-core", "futures-util", @@ -4555,9 +4492,9 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f60a933bbea70c95d633c04c951197ddf084958abaa2ed502a3743bdd8d8dd7" +checksum = "a6fdaae4c2c638bb70fe42803a26fbd6fc6ac8c72f5c59f67ecc2a2dcabf4b07" dependencies = [ "prettyplease 0.1.25", "proc-macro2", @@ -4603,7 +4540,7 @@ name = "trace" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.2.2", + "clap 4.3.0", "pageserver_api", "utils", "workspace_hack", @@ -4624,20 +4561,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4017f8f45139870ca7e672686113917c71c7a6e02d4924eda67186083c03081a" +checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", ] [[package]] name = "tracing-core" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" dependencies = [ "once_cell", "valuable", @@ -4700,9 +4637,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70" +checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" dependencies = [ "matchers", "nu-ansi-term", @@ -4792,9 +4729,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" [[package]] name = "unicode-ident" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" +checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" [[package]] name = "unicode-normalization" @@ -4914,9 +4851,9 @@ dependencies = [ [[package]] name = "uuid" -version = "1.3.1" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b55a3fef2a1e3b3a00ce878640918820d3c51081576ac657d23af9fc7928fdb" +checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2" dependencies = [ "getrandom", "serde", @@ -4951,7 +4888,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.2.2", + "clap 4.3.0", "env_logger", "log", "once_cell", @@ -4989,9 +4926,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +checksum = "5bba0e8cb82ba49ff4e229459ff22a191bbe9a1cb3a341610c9c33efc27ddf73" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -4999,24 
+4936,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +checksum = "19b04bc93f9d6bdee709f6bd2118f57dd6679cf1176a1af464fca3ab0d66d8fb" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.34" +version = "0.4.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" +checksum = "2d1985d03709c53167ce907ff394f5316aa22cb4e12761295c5dc57dacb6297e" dependencies = [ "cfg-if", "js-sys", @@ -5026,9 +4963,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +checksum = "14d6b024f1a526bb0234f52840389927257beb670610081360e5a03c5df9c258" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -5036,28 +4973,28 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +checksum = "e128beba882dd1eb6200e1dc92ae6c5dbaa4311aa7bb211ca035779e5efc39f8" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.16", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.84" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" +checksum = "ed9d5b4305409d1fc9482fee2d7f9bcbf24b3972bf59817ef757e23982242a93" [[package]] name = "web-sys" -version = "0.3.61" +version = "0.3.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97" +checksum = "3bdd9ef4e984da1187bf8110c5cf5b845fbc87a23602cdf912386a76fcd3a7c2" dependencies = [ "js-sys", "wasm-bindgen", @@ -5291,9 +5228,9 @@ checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" [[package]] name = "winnow" -version = "0.4.1" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae8970b36c66498d8ff1d66685dc86b91b29db0c7739899012f63a63814b4b28" +checksum = "61de7bac303dc551fe038e2b3cef0f571087a47571ea6e79a87692ac99b99699" dependencies = [ "memchr", ] @@ -5314,7 +5251,7 @@ dependencies = [ "anyhow", "bytes", "chrono", - "clap 4.2.2", + "clap 4.3.0", "clap_builder", "crossbeam-utils", "either", @@ -5325,7 +5262,6 @@ dependencies = [ "futures-executor", "futures-sink", "futures-util", - "hashbrown 0.12.3", "itertools", "libc", "log", @@ -5337,7 +5273,7 @@ dependencies = [ "prost", "rand", "regex", - "regex-syntax", + "regex-syntax 0.7.2", "reqwest", "ring", "rustls 0.20.8", @@ -5346,7 +5282,7 @@ dependencies = [ "serde_json", "socket2 0.4.9", "syn 1.0.109", - "syn 2.0.15", + "syn 2.0.16", "tokio", "tokio-rustls 0.23.4", "tokio-util", diff --git a/Cargo.toml b/Cargo.toml index 19d178385153..1cb8d65948a1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ async-stream = "0.3" async-trait = "0.1" atty = "0.2.14" aws-config = { version = "0.55", 
default-features = false, features=["rustls"] } -aws-sdk-s3 = "0.25" +aws-sdk-s3 = "0.27" aws-smithy-http = "0.55" aws-credential-types = "0.55" aws-types = "0.55" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 3d40f5dede90..677b59f4535c 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -27,7 +27,6 @@ futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } -hashbrown = { version = "0.12", features = ["raw"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -39,7 +38,7 @@ num-traits = { version = "0.2", features = ["i128"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } -regex-syntax = { version = "0.6" } +regex-syntax = { version = "0.7" } reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] } ring = { version = "0.16", features = ["std"] } rustls = { version = "0.20", features = ["dangerous_configuration"] } @@ -62,7 +61,6 @@ url = { version = "2", features = ["serde"] } anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } either = { version = "1" } -hashbrown = { version = "0.12", features = ["raw"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -70,7 +68,7 @@ memchr = { version = "2" } nom = { version = "7" } prost = { version = "0.11" } regex = { version = "1" } -regex-syntax = { version = "0.6" } +regex-syntax = { version = "0.7" } serde = { version = "1", features = ["alloc", "derive"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit", "visit-mut"] } syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit-mut"] } From 08e7d2407b2c05b49fcc562197570fd2eb4e7bf6 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 25 May 2023 15:55:46 +0100 Subject: [PATCH 09/59] Storage: use Postgres 15 as default (#2809) --- control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/local_env.rs | 2 +- pageserver/src/lib.rs | 2 +- scripts/export_import_between_pageservers.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 30880565ab66..39551642c0b6 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -41,7 +41,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1); const DEFAULT_BRANCH_NAME: &str = "main"; project_git_version!(GIT_VERSION); -const DEFAULT_PG_VERSION: &str = "14"; +const DEFAULT_PG_VERSION: &str = "15"; fn default_conf() -> String { format!( diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 2b1eec7c4b66..9286944412dd 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -24,7 +24,7 @@ use utils::{ use crate::safekeeper::SafekeeperNode; -pub const DEFAULT_PG_VERSION: u32 = 14; +pub const DEFAULT_PG_VERSION: u32 = 15; // // This data structures represents neon_local CLI config diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 4349f0e2ea8b..36578ee4e025 100644 --- a/pageserver/src/lib.rs +++ 
b/pageserver/src/lib.rs @@ -35,7 +35,7 @@ use tracing::info; /// backwards-compatible changes to the metadata format. pub const STORAGE_FORMAT_VERSION: u16 = 3; -pub const DEFAULT_PG_VERSION: u32 = 14; +pub const DEFAULT_PG_VERSION: u32 = 15; // Magic constants used to identify different kinds of files pub const IMAGE_FILE_MAGIC: u16 = 0x5A60; diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 4b599ce9b6b3..d95878b34170 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -535,8 +535,8 @@ def export_timeline( def main(args: argparse.Namespace): - # any psql version will do here. use current DEFAULT_PG_VERSION = 14 - psql_path = str(Path(args.pg_distrib_dir) / "v14" / "bin" / "psql") + # any psql version will do here. use current DEFAULT_PG_VERSION = 15 + psql_path = str(Path(args.pg_distrib_dir) / "v15" / "bin" / "psql") old_pageserver_host = args.old_pageserver_host new_pageserver_host = args.new_pageserver_host From 85e76090eae9a22b4b9980982586cb77baf6d608 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 25 May 2023 19:22:58 +0300 Subject: [PATCH 10/59] test: fix ancestor is stopping flakyness (#4234) Flakyness most likely introduced in #4170, detected in https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4232/4980691289/index.html#suites/542b1248464b42cc5a4560f408115965/18e623585e47af33. Opted to allow it globally because it can happen in other tests as well, basically whenever compaction is enabled and we stop pageserver gracefully. --- test_runner/fixtures/neon_fixtures.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3ff5429616b0..6b97c33ae4e2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1619,6 +1619,8 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional ".*task iteration took longer than the configured period.*", # this is until #3501 ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant", + # these can happen anytime we do compactions from background task and shutdown pageserver + r".*ERROR.*ancestor timeline \S+ is being stopped", ] def start( From ae805b985ddf1ec9507d314eecb8368504d06ae3 Mon Sep 17 00:00:00 2001 From: sharnoff Date: Thu, 25 May 2023 09:33:18 -0700 Subject: [PATCH 11/59] Bump vm-builder v0.7.3-alpha3 -> v0.8.0 (#4339) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Routine `vm-builder` version bump, from autoscaling repo release. You can find the release notes here: https://github.com/neondatabase/autoscaling/releases/tag/v0.8.0 The changes are from v0.7.2 — most of them were already included in v0.7.3-alpha3. Of particular note: This (finally) fixes the cgroup issues, so we should now be able to scale up when we're about to run out of memory. **NB:** This has the effect of limit the DB's memory usage in a way it wasn't limited before. We may run into issues because of that. There is currently no way to disable that behavior, other than switching the endpoint back to the k8s-pod provisioner. 
--- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bcc02398a196..6d89ce999440 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -778,7 +778,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.7.3-alpha3 + VM_BUILDER_VERSION: v0.8.0 steps: - name: Checkout From 057cceb559623ed790ff4205bb4fdcbb7570d46d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 25 May 2023 20:26:43 +0200 Subject: [PATCH 12/59] refactor: make timeline activation infallible (#4319) Timeline::activate() was only fallible because `launch_wal_receiver` was. `launch_wal_receiver` was fallible only because of some preliminary checks in `WalReceiver::start`. Turns out these checks can be shifted to the type system by delaying creatinon of the `WalReceiver` struct to the point where we activate the timeline. The changes in this PR were enabled by my previous refactoring that funneled the broker_client from pageserver startup to the activate() call sites. Patch series: - #4316 - #4317 - #4318 - #4319 --- pageserver/src/tenant.rs | 34 ++---- pageserver/src/tenant/timeline.rs | 107 ++++++++++-------- pageserver/src/tenant/timeline/walreceiver.rs | 63 ++++------- .../walreceiver/connection_manager.rs | 5 +- 4 files changed, 91 insertions(+), 118 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e247fbf42308..2827830f02ca 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -266,7 +266,7 @@ impl UninitializedTimeline<'_> { // updated it for the layers that we created during the import. let mut timelines = self.owning_tenant.timelines.lock().unwrap(); let tl = self.initialize_with_lock(ctx, &mut timelines, false)?; - tl.activate(broker_client, ctx)?; + tl.activate(broker_client, ctx); Ok(tl) } @@ -1333,7 +1333,7 @@ impl Tenant { } }; - loaded_timeline.activate(broker_client, ctx)?; + loaded_timeline.activate(broker_client, ctx); if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { // Wait for the upload of the 'index_part.json` file to finish, so that when we return @@ -1481,7 +1481,10 @@ impl Tenant { // Stop the walreceiver first. debug!("waiting for wal receiver to shutdown"); - timeline.walreceiver.stop().await; + let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; + if let Some(walreceiver) = maybe_started_walreceiver { + walreceiver.stop().await; + } debug!("wal receiver shutdown confirmed"); // Prevent new uploads from starting. 
@@ -1678,30 +1681,10 @@ impl Tenant { tasks::start_background_loops(self); let mut activated_timelines = 0; - let mut timelines_broken_during_activation = 0; for timeline in not_broken_timelines { - match timeline - .activate(broker_client.clone(), ctx) - .context("timeline activation for activating tenant") - { - Ok(()) => { - activated_timelines += 1; - } - Err(e) => { - error!( - "Failed to activate timeline {}: {:#}", - timeline.timeline_id, e - ); - timeline.set_state(TimelineState::Broken); - *current_state = TenantState::broken_from_reason(format!( - "failed to activate timeline {}: {}", - timeline.timeline_id, e - )); - - timelines_broken_during_activation += 1; - } - } + timeline.activate(broker_client.clone(), ctx); + activated_timelines += 1; } let elapsed = self.loading_started_at.elapsed(); @@ -1713,7 +1696,6 @@ impl Tenant { since_creation_millis = elapsed.as_millis(), tenant_id = %self.tenant_id, activated_timelines, - timelines_broken_during_activation, total_timelines, post_state = <&'static str>::from(&*current_state), "activation attempt finished" diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 9b449812ac26..b0aca458826b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -226,7 +226,7 @@ pub struct Timeline { /// or None if WAL receiver has not received anything for this timeline /// yet. pub last_received_wal: Mutex>, - pub walreceiver: WalReceiver, + pub walreceiver: Mutex>, /// Relation size cache pub rel_size_cache: RwLock>, @@ -621,17 +621,27 @@ impl Timeline { .await { Ok(()) => Ok(()), - seqwait_error => { + Err(e) => { + // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo drop(_timer); - let walreceiver_status = self.walreceiver.status().await; - seqwait_error.with_context(|| format!( - "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, {}", - lsn, - self.get_last_record_lsn(), - self.get_disk_consistent_lsn(), - walreceiver_status.map(|status| status.to_human_readable_string()) - .unwrap_or_else(|| "WalReceiver status: Not active".to_string()), - )) + let walreceiver_status = { + match &*self.walreceiver.lock().unwrap() { + None => "stopping or stopped".to_string(), + Some(walreceiver) => match walreceiver.status() { + Some(status) => status.to_human_readable_string(), + None => "Not active".to_string(), + }, + } + }; + Err(anyhow::Error::new(e).context({ + format!( + "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", + lsn, + self.get_last_record_lsn(), + self.get_disk_consistent_lsn(), + walreceiver_status, + ) + })) } } } @@ -906,15 +916,10 @@ impl Timeline { Ok(()) } - pub fn activate( - self: &Arc, - broker_client: BrokerClientChannel, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - self.launch_wal_receiver(ctx, broker_client)?; + pub fn activate(self: &Arc, broker_client: BrokerClientChannel, ctx: &RequestContext) { + self.launch_wal_receiver(ctx, broker_client); self.set_state(TimelineState::Active); self.launch_eviction_task(); - Ok(()) } pub fn set_state(&self, new_state: TimelineState) { @@ -1323,15 +1328,7 @@ impl Timeline { let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); let tenant_conf_guard = tenant_conf.read().unwrap(); - let wal_connect_timeout = tenant_conf_guard - .walreceiver_connect_timeout - 
.unwrap_or(conf.default_tenant_conf.walreceiver_connect_timeout); - let lagging_wal_timeout = tenant_conf_guard - .lagging_wal_timeout - .unwrap_or(conf.default_tenant_conf.lagging_wal_timeout); - let max_lsn_wal_lag = tenant_conf_guard - .max_lsn_wal_lag - .unwrap_or(conf.default_tenant_conf.max_lsn_wal_lag); + let evictions_low_residence_duration_metric_threshold = Self::get_evictions_low_residence_duration_metric_threshold( &tenant_conf_guard, @@ -1340,18 +1337,6 @@ impl Timeline { drop(tenant_conf_guard); Arc::new_cyclic(|myself| { - let walreceiver = WalReceiver::new( - TenantTimelineId::new(tenant_id, timeline_id), - Weak::clone(myself), - WalReceiverConf { - wal_connect_timeout, - lagging_wal_timeout, - max_lsn_wal_lag, - auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), - availability_zone: conf.availability_zone.clone(), - }, - ); - let mut result = Timeline { conf, tenant_conf, @@ -1363,7 +1348,7 @@ impl Timeline { wanted_image_layers: Mutex::new(None), walredo_mgr, - walreceiver, + walreceiver: Mutex::new(None), remote_client: remote_client.map(Arc::new), @@ -1483,17 +1468,49 @@ impl Timeline { *flush_loop_state = FlushLoopState::Running; } - pub(super) fn launch_wal_receiver( - &self, + /// Creates and starts the wal receiver. + /// + /// This function is expected to be called at most once per Timeline's lifecycle + /// when the timeline is activated. + fn launch_wal_receiver( + self: &Arc, ctx: &RequestContext, broker_client: BrokerClientChannel, - ) -> anyhow::Result<()> { + ) { info!( "launching WAL receiver for timeline {} of tenant {}", self.timeline_id, self.tenant_id ); - self.walreceiver.start(ctx, broker_client)?; - Ok(()) + + let tenant_conf_guard = self.tenant_conf.read().unwrap(); + let wal_connect_timeout = tenant_conf_guard + .walreceiver_connect_timeout + .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); + let lagging_wal_timeout = tenant_conf_guard + .lagging_wal_timeout + .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); + let max_lsn_wal_lag = tenant_conf_guard + .max_lsn_wal_lag + .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); + drop(tenant_conf_guard); + + let mut guard = self.walreceiver.lock().unwrap(); + assert!( + guard.is_none(), + "multiple launches / re-launches of WAL receiver are not supported" + ); + *guard = Some(WalReceiver::start( + Arc::clone(self), + WalReceiverConf { + wal_connect_timeout, + lagging_wal_timeout, + max_lsn_wal_lag, + auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), + availability_zone: self.conf.availability_zone.clone(), + }, + broker_client, + ctx, + )); } /// diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 91f7208194d4..7ebf3cf17245 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -29,16 +29,14 @@ use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; -use anyhow::Context; use std::future::Future; use std::num::NonZeroU64; use std::ops::ControlFlow; -use std::sync::atomic::{self, AtomicBool}; -use std::sync::{Arc, Weak}; +use std::sync::Arc; use std::time::Duration; use storage_broker::BrokerClientChannel; use tokio::select; -use tokio::sync::{watch, RwLock}; +use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; @@ -62,46 +60,23 @@ pub struct WalReceiverConf { pub struct WalReceiver { timeline: TenantTimelineId, - timeline_ref: 
Weak, - conf: WalReceiverConf, - started: AtomicBool, - manager_status: Arc>>, + manager_status: Arc>>, } impl WalReceiver { - pub fn new( - timeline: TenantTimelineId, - timeline_ref: Weak, - conf: WalReceiverConf, - ) -> Self { - Self { - timeline, - timeline_ref, - conf, - started: AtomicBool::new(false), - manager_status: Arc::new(RwLock::new(None)), - } - } - pub fn start( - &self, - ctx: &RequestContext, + timeline: Arc, + conf: WalReceiverConf, mut broker_client: BrokerClientChannel, - ) -> anyhow::Result<()> { - if self.started.load(atomic::Ordering::Acquire) { - anyhow::bail!("Wal receiver is already started"); - } - - let timeline = self.timeline_ref.upgrade().with_context(|| { - format!("walreceiver start on a dropped timeline {}", self.timeline) - })?; - + ctx: &RequestContext, + ) -> Self { let tenant_id = timeline.tenant_id; let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - let wal_receiver_conf = self.conf.clone(); - let loop_status = Arc::clone(&self.manager_status); + + let loop_status = Arc::new(std::sync::RwLock::new(None)); + let manager_status = Arc::clone(&loop_status); task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverManager, @@ -113,7 +88,7 @@ impl WalReceiver { info!("WAL receiver manager started, connecting to broker"); let mut connection_manager_state = ConnectionManagerState::new( timeline, - wal_receiver_conf, + conf, ); loop { select! { @@ -137,29 +112,29 @@ impl WalReceiver { } connection_manager_state.shutdown().await; - *loop_status.write().await = None; + *loop_status.write().unwrap() = None; Ok(()) } .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id)) ); - self.started.store(true, atomic::Ordering::Release); - - Ok(()) + Self { + timeline: TenantTimelineId::new(tenant_id, timeline_id), + manager_status, + } } - pub async fn stop(&self) { + pub async fn stop(self) { task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), Some(self.timeline.tenant_id), Some(self.timeline.timeline_id), ) .await; - self.started.store(false, atomic::Ordering::Release); } - pub(super) async fn status(&self) -> Option { - self.manager_status.read().await.clone() + pub(super) fn status(&self) -> Option { + self.manager_status.read().unwrap().clone() } } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 3da1f023e1a6..6b65e1fd429c 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -29,7 +29,6 @@ use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use storage_broker::BrokerClientChannel; use storage_broker::Streaming; use tokio::select; -use tokio::sync::RwLock; use tracing::*; use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; @@ -48,7 +47,7 @@ pub(super) async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, connection_manager_state: &mut ConnectionManagerState, ctx: &RequestContext, - manager_status: &RwLock>, + manager_status: &std::sync::RwLock>, ) -> ControlFlow<(), ()> { match connection_manager_state .timeline @@ -195,7 +194,7 @@ pub(super) async fn connection_manager_loop_step( .change_connection(new_candidate, ctx) .await } - *manager_status.write().await = Some(connection_manager_state.manager_status()); + 
*manager_status.write().unwrap() = Some(connection_manager_state.manager_status()); } } From 2b25f0dfa08ec1f6d6f73fd08481571f406c437d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 25 May 2023 22:05:11 +0100 Subject: [PATCH 13/59] Fix flakiness of test_metric_collection (#4346) ## Problem Test `test_metric_collection` become flaky: ``` AssertionError: assert not ['2023-05-25T14:03:41.644042Z ERROR metrics_collection: failed to send metrics: reqwest::Error { kind: Request, url: Url { scheme: "http", cannot_be_a_base: false, username: "", password: None, host: Some(Domain("localhost")), port: Some(18022), path: "/billing/api/v1/usage_events", query: None, fragment: None }, source: hyper::Error(Connect, ConnectError("tcp connect error", Os { code: 99, kind: AddrNotAvailable, message: "Cannot assign requested address" })) }', ...] ``` I suspect it is caused by having 2 places when we define `httpserver_listen_address` fixture (which is internally used by `pytest-httpserver` plugin) ## Summary of changes - Remove the definition of `httpserver_listen_address` from `test_runner/regress/test_ddl_forwarding.py` and keep one in `test_runner/fixtures/neon_fixtures.py` - Also remote unused `httpserver_listen_address` parameter from `test_proxy_metric_collection` --- test_runner/regress/test_ddl_forwarding.py | 11 +---------- test_runner/regress/test_metric_collection.py | 1 - 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 27ebd3c18185..6bfa8fdbe7d2 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -4,21 +4,12 @@ import psycopg2 import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - PortDistributor, - VanillaPostgres, -) +from fixtures.neon_fixtures import VanillaPostgres from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response -@pytest.fixture(scope="session") -def httpserver_listen_address(port_distributor: PortDistributor): - port = port_distributor.get_port() - return ("localhost", port) - - def handle_db(dbs, roles, operation): if operation["op"] == "set": if "old_name" in operation and operation["old_name"] in dbs: diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 00ea77f2e78c..12e695bcbd82 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -228,7 +228,6 @@ def proxy_with_metric_collector( @pytest.mark.asyncio async def test_proxy_metric_collection( httpserver: HTTPServer, - httpserver_listen_address, proxy_with_metric_collector: NeonProxy, vanilla_pg: VanillaPostgres, ): From 024109fbeb533b4574976a5899c27f56891de881 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 26 May 2023 13:35:50 +0300 Subject: [PATCH 14/59] Allow for higher s3 concurrency (#4292) We currently have a semaphore based rate limiter which we hope will keep us under S3 limits. However, the semaphore does not consider time, so I've been hesitant to raise the concurrency limit of 100. See #3698. The PR Introduces a leaky-bucket based rate limiter instead of the `tokio::sync::Semaphore` which will allow us to raise the limit later on. The configuration changes are not contained here. 
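As an illustration only (not part of the diff), the leaky-bucket pattern used below
boils down to the following sketch. The builder and acquire calls mirror the
s3_bucket.rs change further down, the `rps` value just reuses the existing default
concurrency limit of 100, and the helper names are made up for this example:

    use std::{sync::Arc, time::Duration};

    // Build a limiter that hands out at most `rps` permits per second.
    fn build_limiter(rps: usize) -> Arc<leaky_bucket::RateLimiter> {
        Arc::new(
            leaky_bucket::RateLimiter::builder()
                .max(rps)                         // never accumulate more than one second of budget
                .initial(0)                       // start empty so startup bursts are smoothed too
                .refill(rps)                      // add `rps` tokens ...
                .interval(Duration::from_secs(1)) // ... once every second
                .fair(true)                       // waiters are served in FIFO order
                .build(),
        )
    }

    // Before each S3 request: wait for a token, then issue the request.
    // Nothing is released afterwards, because this limits request starts
    // per second rather than the number of in-flight requests.
    async fn rate_limited_request(limiter: &leaky_bucket::RateLimiter) {
        limiter.acquire_one().await;
        // ... send the GET/PUT/DELETE here ...
    }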
--- Cargo.lock | 12 ++++ libs/remote_storage/Cargo.toml | 2 + libs/remote_storage/src/lib.rs | 2 + libs/remote_storage/src/s3_bucket.rs | 85 +++++++++------------------- 4 files changed, 42 insertions(+), 59 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d390df94e02e..69d161d2b1bf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2040,6 +2040,17 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" +[[package]] +name = "leaky-bucket" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d615fd0b579225f0d3c8d781af50a73644b571da8b5b50053ef2dcfa60dd51e7" +dependencies = [ + "parking_lot", + "tokio", + "tracing", +] + [[package]] name = "libc" version = "0.2.144" @@ -3222,6 +3233,7 @@ dependencies = [ "aws-smithy-http", "aws-types", "hyper", + "leaky-bucket", "metrics", "once_cell", "pin-project-lite", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 0877a38dd9b7..5da02293a874 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -25,6 +25,8 @@ utils.workspace = true pin-project-lite.workspace = true workspace_hack.workspace = true +leaky-bucket = "1.0" + [dev-dependencies] tempfile.workspace = true test-context.workspace = true diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index e0cc3ca54364..f3ae2425f63e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -37,6 +37,8 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ +/// +/// IAM ratelimit should never be observed with caching credentials provider. pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. /// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 0be8c72fe03e..631caa6a489d 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -21,10 +21,7 @@ use aws_sdk_s3::{ }; use aws_smithy_http::body::SdkBody; use hyper::Body; -use tokio::{ - io::{self, AsyncRead}, - sync::Semaphore, -}; +use tokio::io; use tokio_util::io::ReaderStream; use tracing::debug; @@ -105,9 +102,8 @@ pub struct S3Bucket { prefix_in_bucket: Option, max_keys_per_list_response: Option, // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. - // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. // The helps to ensure we don't exceed the thresholds. - concurrency_limiter: Arc, + concurrency_limiter: Arc, } #[derive(Default)] @@ -158,12 +154,24 @@ impl S3Bucket { } prefix }); + + let rps = aws_config.concurrency_limit.get(); + let concurrency_limiter = leaky_bucket::RateLimiter::builder() + .max(rps) + .initial(0) + // refill it by rps every second. this means the (rps+1)th request will have to wait for + // 1 second from earliest. 
+ .refill(rps) + .interval(std::time::Duration::from_secs(1)) + .fair(true) + .build(); + Ok(Self { client, bucket_name: aws_config.bucket_name.clone(), max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, - concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())), + concurrency_limiter: Arc::new(concurrency_limiter), }) } @@ -195,13 +203,10 @@ impl S3Bucket { } async fn download_object(&self, request: GetObjectRequest) -> Result { - let permit = self - .concurrency_limiter - .clone() - .acquire_owned() - .await - .context("Concurrency limiter semaphore got closed during S3 download") - .map_err(DownloadError::Other)?; + // while the download could take a long time with `leaky_bucket` we have nothing to release + // once the download is done. this is because with "requests per second" rate limiting on + // s3, there should be no meaning for the long requests. + self.concurrency_limiter.clone().acquire_owned(1).await; metrics::inc_get_object(); @@ -219,10 +224,9 @@ impl S3Bucket { let metadata = object_output.metadata().cloned().map(StorageMetadata); Ok(Download { metadata, - download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new( - permit, + download_stream: Box::pin(io::BufReader::new( object_output.body.into_async_read(), - ))), + )), }) } Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { @@ -238,32 +242,6 @@ impl S3Bucket { } } -pin_project_lite::pin_project! { - /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. - struct RatelimitedAsyncRead { - permit: tokio::sync::OwnedSemaphorePermit, - #[pin] - inner: S, - } -} - -impl RatelimitedAsyncRead { - fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { - RatelimitedAsyncRead { permit, inner } - } -} - -impl AsyncRead for RatelimitedAsyncRead { - fn poll_read( - self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - buf: &mut io::ReadBuf<'_>, - ) -> std::task::Poll> { - let this = self.project(); - this.inner.poll_read(cx, buf) - } -} - #[async_trait::async_trait] impl RemoteStorage for S3Bucket { /// See the doc for `RemoteStorage::list_prefixes` @@ -289,12 +267,7 @@ impl RemoteStorage for S3Bucket { let mut continuation_token = None; loop { - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 list") - .map_err(DownloadError::Other)?; + self.concurrency_limiter.acquire_one().await; metrics::inc_list_objects(); @@ -339,11 +312,9 @@ impl RemoteStorage for S3Bucket { to: &RemotePath, metadata: Option, ) -> anyhow::Result<()> { - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 upload")?; + // similarly to downloads, the permit does not have live through the upload, but instead we + // are rate limiting requests per second. 
+ self.concurrency_limiter.acquire_one().await; metrics::inc_put_object(); @@ -398,11 +369,7 @@ impl RemoteStorage for S3Bucket { } async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 delete")?; + self.concurrency_limiter.acquire_one().await; metrics::inc_delete_object(); From a560b28829f25d6be033cba589c6cbbf85dc55a1 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 26 May 2023 16:19:36 +0300 Subject: [PATCH 15/59] Make new tenant/timeline IDs mandatory in create APIs. (#4304) We used to generate the ID, if the caller didn't specify it. That's bad practice, however, because network is never fully reliable, so it's possible we create a new tenant but the caller doesn't know about it, and because it doesn't know the tenant ID, it has no way of retrying or checking if it succeeded. To discourage that, make it mandatory. The web control plane has not relied on the auto-generation for a long time. --- control_plane/src/pageserver.rs | 7 ++++++ .../compute_wrapper/shell/compute.sh | 20 +++++++++++----- libs/pageserver_api/src/models.rs | 16 ++++++------- pageserver/src/http/openapi_spec.yml | 4 ++++ pageserver/src/http/routes.rs | 17 ++++---------- test_runner/fixtures/pageserver/http.py | 8 +++---- test_runner/regress/test_auth.py | 19 ++++++++------- test_runner/regress/test_tenants.py | 23 ++++++++++--------- 8 files changed, 62 insertions(+), 52 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 6309494b71a9..149cfd00cba0 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -370,6 +370,10 @@ impl PageServerNode { .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), }; + + // If tenant ID was not specified, generate one + let new_tenant_id = new_tenant_id.unwrap_or(TenantId::generate()); + let request = models::TenantCreateRequest { new_tenant_id, config, @@ -495,6 +499,9 @@ impl PageServerNode { ancestor_timeline_id: Option, pg_version: Option, ) -> anyhow::Result { + // If timeline ID was not specified, generate one + let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate()); + self.http_request( Method::POST, format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id), diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index cef2b485f33f..22660a63ce2e 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ -1,6 +1,14 @@ #!/bin/bash set -eux +# Generate a random tenant or timeline ID +# +# Takes a variable name as argument. The result is stored in that variable. +generate_id() { + local -n resvar=$1 + printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM +} + PG_VERSION=${PG_VERSION:-14} SPEC_FILE_ORG=/var/db/postgres/specs/spec.json @@ -13,29 +21,29 @@ done echo "Page server is ready." echo "Create a tenant and timeline" +generate_id tenant_id PARAMS=( -sb -X POST -H "Content-Type: application/json" - -d "{}" + -d "{\"new_tenant_id\": \"${tenant_id}\"}" http://pageserver:9898/v1/tenant/ ) -tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g') +result=$(curl "${PARAMS[@]}") +echo $result | jq . 
+generate_id timeline_id PARAMS=( -sb -X POST -H "Content-Type: application/json" - -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}" + -d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}" "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" ) result=$(curl "${PARAMS[@]}") echo $result | jq . echo "Overwrite tenant id and timeline id in spec file" -tenant_id=$(echo ${result} | jq -r .tenant_id) -timeline_id=$(echo ${result} | jq -r .timeline_id) - sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3927ba3dadf7..540633d11363 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -118,9 +118,8 @@ pub enum TimelineState { #[serde_as] #[derive(Serialize, Deserialize)] pub struct TimelineCreateRequest { - #[serde(default)] - #[serde_as(as = "Option")] - pub new_timeline_id: Option, + #[serde_as(as = "DisplayFromStr")] + pub new_timeline_id: TimelineId, #[serde(default)] #[serde_as(as = "Option")] pub ancestor_timeline_id: Option, @@ -131,12 +130,11 @@ pub struct TimelineCreateRequest { } #[serde_as] -#[derive(Serialize, Deserialize, Debug, Default)] +#[derive(Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct TenantCreateRequest { - #[serde(default)] - #[serde_as(as = "Option")] - pub new_tenant_id: Option, + #[serde_as(as = "DisplayFromStr")] + pub new_tenant_id: TenantId, #[serde(flatten)] pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it } @@ -184,10 +182,10 @@ pub struct StatusResponse { } impl TenantCreateRequest { - pub fn new(new_tenant_id: Option) -> TenantCreateRequest { + pub fn new(new_tenant_id: TenantId) -> TenantCreateRequest { TenantCreateRequest { new_tenant_id, - ..Default::default() + config: TenantConfig::default(), } } } diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index e23d3f3a207d..0d912c95e07d 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -678,6 +678,8 @@ paths: application/json: schema: type: object + required: + - new_timeline_id properties: new_timeline_id: type: string @@ -936,6 +938,8 @@ components: allOf: - $ref: '#/components/schemas/TenantConfig' - type: object + required: + - new_tenant_id properties: new_tenant_id: type: string diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 25e0d88e7062..30c219f77377 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -301,9 +301,7 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result Err(ApiError::InternalServerError(err)), } } - .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) + .instrument(info_span!("timeline_create", tenant = %tenant_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version)) .await } @@ -764,6 +762,8 @@ pub fn html_response(status: StatusCode, data: String) -> Result, } async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { + let request_data: TenantCreateRequest = json_request(&mut request).await?; + let target_tenant_id = request_data.new_tenant_id; check_permission(&request, 
None)?; let _timer = STORAGE_TIME_GLOBAL @@ -771,17 +771,10 @@ async fn tenant_create_handler(mut request: Request) -> Result List[Dict[Any, Any]]: return res_json def tenant_create( - self, new_tenant_id: Optional[TenantId] = None, conf: Optional[Dict[str, Any]] = None + self, new_tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None ) -> TenantId: if conf is not None: assert "new_tenant_id" not in conf.keys() res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ - "new_tenant_id": str(new_tenant_id) if new_tenant_id else None, + "new_tenant_id": str(new_tenant_id), **(conf or {}), }, ) @@ -293,13 +293,13 @@ def timeline_create( self, pg_version: PgVersion, tenant_id: TenantId, - new_timeline_id: Optional[TimelineId] = None, + new_timeline_id: TimelineId, ancestor_timeline_id: Optional[TimelineId] = None, ancestor_start_lsn: Optional[Lsn] = None, **kwargs, ) -> Dict[Any, Any]: body: Dict[str, Any] = { - "new_timeline_id": str(new_timeline_id) if new_timeline_id else None, + "new_timeline_id": str(new_timeline_id), "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, } diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 3e4a0bfbbb78..fb7974883266 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -3,7 +3,7 @@ import pytest from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol from fixtures.pageserver.http import PageserverApiException -from fixtures.types import TenantId +from fixtures.types import TenantId, TimelineId def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): @@ -25,21 +25,19 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): ps.safe_psql("set FOO", password=tenant_token) ps.safe_psql("set FOO", password=pageserver_token) - new_timeline_id = env.neon_cli.create_branch( - "test_pageserver_auth", tenant_id=env.initial_tenant - ) - # tenant can create branches tenant_http_client.timeline_create( pg_version=env.pg_version, tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id, + new_timeline_id=TimelineId.generate(), + ancestor_timeline_id=env.initial_timeline, ) # console can create branches for tenant pageserver_http_client.timeline_create( pg_version=env.pg_version, tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id, + new_timeline_id=TimelineId.generate(), + ancestor_timeline_id=env.initial_timeline, ) # fail to create branch using token with different tenant_id @@ -49,18 +47,19 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder): invalid_tenant_http_client.timeline_create( pg_version=env.pg_version, tenant_id=env.initial_tenant, - ancestor_timeline_id=new_timeline_id, + new_timeline_id=TimelineId.generate(), + ancestor_timeline_id=env.initial_timeline, ) # create tenant using management token - pageserver_http_client.tenant_create() + pageserver_http_client.tenant_create(TenantId.generate()) # fail to create tenant using tenant token with pytest.raises( PageserverApiException, match="Forbidden: Attempt to access management api with tenant scope. 
Permission denied", ): - tenant_http_client.tenant_create() + tenant_http_client.tenant_create(TenantId.generate()) def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 5642449ce6a3..6599fa7ba59c 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -314,21 +314,22 @@ def test_pageserver_with_empty_tenants( client = env.pageserver.http_client() - tenant_with_empty_timelines_dir = client.tenant_create() - temp_timelines = client.timeline_list(tenant_with_empty_timelines_dir) + tenant_with_empty_timelines = TenantId.generate() + client.tenant_create(tenant_with_empty_timelines) + temp_timelines = client.timeline_list(tenant_with_empty_timelines) for temp_timeline in temp_timelines: client.timeline_delete( - tenant_with_empty_timelines_dir, TimelineId(temp_timeline["timeline_id"]) + tenant_with_empty_timelines, TimelineId(temp_timeline["timeline_id"]) ) files_in_timelines_dir = sum( 1 for _p in Path.iterdir( - Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines_dir) / "timelines" + Path(env.repo_dir) / "tenants" / str(tenant_with_empty_timelines) / "timelines" ) ) assert ( files_in_timelines_dir == 0 - ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" + ), f"Tenant {tenant_with_empty_timelines} should have an empty timelines/ directory" # Trigger timeline re-initialization after pageserver restart env.endpoints.stop_all() @@ -356,15 +357,15 @@ def test_pageserver_with_empty_tenants( assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*") - [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)] + [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines)] assert ( loaded_tenant["state"]["slug"] == "Active" - ), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation" + ), "Tenant {tenant_with_empty_timelines} with empty timelines dir should be active and ready for timeline creation" - loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir) + loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines) assert ( loaded_tenant_status["state"]["slug"] == "Active" - ), f"Tenant {tenant_with_empty_timelines_dir} without timelines dir should be active" + ), f"Tenant {tenant_with_empty_timelines} without timelines dir should be active" time.sleep(1) # to allow metrics propagation @@ -374,7 +375,7 @@ def test_pageserver_with_empty_tenants( "state": "Broken", } active_tenants_metric_filter = { - "tenant_id": str(tenant_with_empty_timelines_dir), + "tenant_id": str(tenant_with_empty_timelines), "state": "Active", } @@ -386,7 +387,7 @@ def test_pageserver_with_empty_tenants( assert ( tenant_active_count == 1 - ), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active" + ), f"Tenant {tenant_with_empty_timelines} should have metric as active" tenant_broken_count = int( ps_metrics.query_one( From 339a3e314609ed00e675b455d0fdb98e908394d2 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 26 May 2023 14:49:42 +0100 Subject: [PATCH 16/59] GitHub Autocomment: comment commits for branches (#4335) ## Problem GitHub Autocomment script posts a comment only for PRs. It's harder to debug failed tests on main or release branches. 
## Summary of changes - Change the GitHub Autocomment script to be able to post a comment to either a PR or a commit of a branch --- .github/workflows/build_and_test.yml | 6 +-- ...-test-report.js => comment-test-report.js} | 37 +++++++++++++++---- 2 files changed, 31 insertions(+), 12 deletions(-) rename scripts/{pr-comment-test-report.js => comment-test-report.js} (85%) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 6d89ce999440..336dea04eb61 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -407,9 +407,7 @@ jobs: uses: ./.github/actions/allure-report-generate - uses: actions/github-script@v6 - if: > - !cancelled() && - github.event_name == 'pull_request' + if: ${{ !cancelled() }} with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries retries: 5 @@ -419,7 +417,7 @@ jobs: reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}", } - const script = require("./scripts/pr-comment-test-report.js") + const script = require("./scripts/comment-test-report.js") await script({ github, context, diff --git a/scripts/pr-comment-test-report.js b/scripts/comment-test-report.js similarity index 85% rename from scripts/pr-comment-test-report.js rename to scripts/comment-test-report.js index 3a7bba0daa15..a7fd5b0bef1a 100644 --- a/scripts/pr-comment-test-report.js +++ b/scripts/comment-test-report.js @@ -1,5 +1,5 @@ // -// The script parses Allure reports and posts a comment with a summary of the test results to the PR. +// The script parses Allure reports and posts a comment with a summary of the test results to the PR or to the latest commit in the branch. // // The comment is updated on each run with the latest results. // @@ -7,7 +7,7 @@ // - uses: actions/github-script@v6 // with: // script: | -// const script = require("./scripts/pr-comment-test-report.js") +// const script = require("./scripts/comment-test-report.js") // await script({ // github, // context, @@ -35,8 +35,12 @@ class DefaultMap extends Map { module.exports = async ({ github, context, fetch, report }) => { // Marker to find the comment in the subsequent runs const startMarker = `` + // If we run the script in the PR or in the branch (main/release/...) + const isPullRequest = !!context.payload.pull_request + // Latest commit in PR or in the branch + const commitSha = isPullRequest ? context.payload.pull_request.head.sha : context.sha // Let users know that the comment is updated automatically - const autoupdateNotice = `
The comment gets automatically updated with the latest test results ${context.payload.pull_request.head.sha} at ${new Date().toISOString()} :recycle:`
+ const autoupdateNotice = `The comment gets automatically updated with the latest test results ${commitSha} at ${new Date().toISOString()} :recycle:
` // GitHub bot id taken from (https://api.github.com/users/github-actions[bot]) const githubActionsBotId = 41898282 // Commend body itself @@ -166,22 +170,39 @@ module.exports = async ({ github, context, fetch, report }) => { commentBody += autoupdateNotice - const { data: comments } = await github.rest.issues.listComments({ - issue_number: context.payload.number, + let createCommentFn, listCommentsFn, updateCommentFn, issueNumberOrSha + if (isPullRequest) { + createCommentFn = github.rest.issues.createComment + listCommentsFn = github.rest.issues.listComments + updateCommentFn = github.rest.issues.updateComment + issueNumberOrSha = { + issue_number: context.payload.number, + } + } else { + updateCommentFn = github.rest.repos.updateCommitComment + listCommentsFn = github.rest.repos.listCommentsForCommit + createCommentFn = github.rest.repos.createCommitComment + issueNumberOrSha = { + commit_sha: commitSha, + } + } + + const { data: comments } = await listCommentsFn({ + ...issueNumberOrSha, ...ownerRepoParams, }) const comment = comments.find(comment => comment.user.id === githubActionsBotId && comment.body.startsWith(startMarker)) if (comment) { - await github.rest.issues.updateComment({ + await updateCommentFn({ comment_id: comment.id, body: commentBody, ...ownerRepoParams, }) } else { - await github.rest.issues.createComment({ - issue_number: context.payload.number, + await createCommentFn({ body: commentBody, + ...issueNumberOrSha, ...ownerRepoParams, }) } From be177f82dc5c9aa8166a3fdfbc03dbd8105d0c59 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 26 May 2023 18:37:17 +0300 Subject: [PATCH 17/59] Revert "Allow for higher s3 concurrency (#4292)" (#4356) This reverts commit 024109fbeb533b4574976a5899c27f56891de881 for it failing to be speed up anything, but run into more errors. See: #3698. 
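For contrast, a minimal sketch of the semaphore-based limiting that this revert restores (again not the pageserver code; the names and the sleep standing in for an S3 request are illustrative, and it assumes only the `tokio` crate with the full feature set). Here the bound is on concurrency rather than on request rate: a permit is taken before the operation and held until it finishes, which is also why the reverted-to code wraps download streams in an adapter that carries the permit for the stream's lifetime.

```rust
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() {
    // Illustrative bound: at most 3 requests in flight at any moment.
    let limiter = Arc::new(Semaphore::new(3));

    let mut tasks = Vec::new();
    for i in 0..10 {
        let limiter = Arc::clone(&limiter);
        tasks.push(tokio::spawn(async move {
            // `acquire_owned` moves a permit into the task; it is released when
            // `_permit` is dropped, i.e. only after the whole operation is done.
            let _permit = limiter.acquire_owned().await.expect("semaphore closed");
            tokio::time::sleep(Duration::from_millis(100)).await; // the "S3 request"
            println!("request {i} finished");
        }));
    }
    for task in tasks {
        task.await.unwrap();
    }
}
```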
--- Cargo.lock | 12 ---- libs/remote_storage/Cargo.toml | 2 - libs/remote_storage/src/lib.rs | 2 - libs/remote_storage/src/s3_bucket.rs | 85 +++++++++++++++++++--------- 4 files changed, 59 insertions(+), 42 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 69d161d2b1bf..d390df94e02e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2040,17 +2040,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" -[[package]] -name = "leaky-bucket" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d615fd0b579225f0d3c8d781af50a73644b571da8b5b50053ef2dcfa60dd51e7" -dependencies = [ - "parking_lot", - "tokio", - "tracing", -] - [[package]] name = "libc" version = "0.2.144" @@ -3233,7 +3222,6 @@ dependencies = [ "aws-smithy-http", "aws-types", "hyper", - "leaky-bucket", "metrics", "once_cell", "pin-project-lite", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 5da02293a874..0877a38dd9b7 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -25,8 +25,6 @@ utils.workspace = true pin-project-lite.workspace = true workspace_hack.workspace = true -leaky-bucket = "1.0" - [dev-dependencies] tempfile.workspace = true test-context.workspace = true diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index f3ae2425f63e..e0cc3ca54364 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -37,8 +37,6 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ -/// -/// IAM ratelimit should never be observed with caching credentials provider. pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; /// No limits on the client side, which currenltly means 1000 for AWS S3. /// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 631caa6a489d..0be8c72fe03e 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -21,7 +21,10 @@ use aws_sdk_s3::{ }; use aws_smithy_http::body::SdkBody; use hyper::Body; -use tokio::io; +use tokio::{ + io::{self, AsyncRead}, + sync::Semaphore, +}; use tokio_util::io::ReaderStream; use tracing::debug; @@ -102,8 +105,9 @@ pub struct S3Bucket { prefix_in_bucket: Option, max_keys_per_list_response: Option, // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. + // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. // The helps to ensure we don't exceed the thresholds. - concurrency_limiter: Arc, + concurrency_limiter: Arc, } #[derive(Default)] @@ -154,24 +158,12 @@ impl S3Bucket { } prefix }); - - let rps = aws_config.concurrency_limit.get(); - let concurrency_limiter = leaky_bucket::RateLimiter::builder() - .max(rps) - .initial(0) - // refill it by rps every second. this means the (rps+1)th request will have to wait for - // 1 second from earliest. 
- .refill(rps) - .interval(std::time::Duration::from_secs(1)) - .fair(true) - .build(); - Ok(Self { client, bucket_name: aws_config.bucket_name.clone(), max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, - concurrency_limiter: Arc::new(concurrency_limiter), + concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())), }) } @@ -203,10 +195,13 @@ impl S3Bucket { } async fn download_object(&self, request: GetObjectRequest) -> Result { - // while the download could take a long time with `leaky_bucket` we have nothing to release - // once the download is done. this is because with "requests per second" rate limiting on - // s3, there should be no meaning for the long requests. - self.concurrency_limiter.clone().acquire_owned(1).await; + let permit = self + .concurrency_limiter + .clone() + .acquire_owned() + .await + .context("Concurrency limiter semaphore got closed during S3 download") + .map_err(DownloadError::Other)?; metrics::inc_get_object(); @@ -224,9 +219,10 @@ impl S3Bucket { let metadata = object_output.metadata().cloned().map(StorageMetadata); Ok(Download { metadata, - download_stream: Box::pin(io::BufReader::new( + download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new( + permit, object_output.body.into_async_read(), - )), + ))), }) } Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { @@ -242,6 +238,32 @@ impl S3Bucket { } } +pin_project_lite::pin_project! { + /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. + struct RatelimitedAsyncRead { + permit: tokio::sync::OwnedSemaphorePermit, + #[pin] + inner: S, + } +} + +impl RatelimitedAsyncRead { + fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { + RatelimitedAsyncRead { permit, inner } + } +} + +impl AsyncRead for RatelimitedAsyncRead { + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut io::ReadBuf<'_>, + ) -> std::task::Poll> { + let this = self.project(); + this.inner.poll_read(cx, buf) + } +} + #[async_trait::async_trait] impl RemoteStorage for S3Bucket { /// See the doc for `RemoteStorage::list_prefixes` @@ -267,7 +289,12 @@ impl RemoteStorage for S3Bucket { let mut continuation_token = None; loop { - self.concurrency_limiter.acquire_one().await; + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 list") + .map_err(DownloadError::Other)?; metrics::inc_list_objects(); @@ -312,9 +339,11 @@ impl RemoteStorage for S3Bucket { to: &RemotePath, metadata: Option, ) -> anyhow::Result<()> { - // similarly to downloads, the permit does not have live through the upload, but instead we - // are rate limiting requests per second. 
- self.concurrency_limiter.acquire_one().await; + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 upload")?; metrics::inc_put_object(); @@ -369,7 +398,11 @@ impl RemoteStorage for S3Bucket { } async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { - self.concurrency_limiter.acquire_one().await; + let _guard = self + .concurrency_limiter + .acquire() + .await + .context("Concurrency limiter semaphore got closed during S3 delete")?; metrics::inc_delete_object(); From a1ae23b827ad2de80fda216f732ee0e7aef18253 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 10 May 2023 19:43:50 +0200 Subject: [PATCH 18/59] controversial but necessary: keep holding layer map lock inside compact_level0_phase1 Without this, the seocnd read().unwrap() becomes an await point, which makes the future not-Send, but, we require it to be Send because it runs inside task_mgr::spawn, which requires the Fut's to be Send --- pageserver/src/tenant/timeline.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ba0cd2936167..d0d915adb406 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3167,7 +3167,6 @@ impl Timeline { ) -> Result { let layers = self.layers.read().unwrap(); let mut level0_deltas = layers.get_level0_deltas()?; - drop(layers); // Only compact if enough layers have accumulated. let threshold = self.get_compaction_threshold(); @@ -3288,7 +3287,6 @@ impl Timeline { // Determine N largest holes where N is number of compacted layers. let max_holes = deltas_to_compact.len(); let last_record_lsn = self.get_last_record_lsn(); - let layers = self.layers.read().unwrap(); // Is'n it better to hold original layers lock till here? let min_hole_range = (target_file_size / page_cache::PAGE_SZ as u64) as i128; let min_hole_coverage_size = 3; // TODO: something more flexible? From a1680b185f1bcd3f9b893568f27af591d7454995 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 12 May 2023 11:27:30 +0200 Subject: [PATCH 19/59] basebackup import: pre-lock the layer map for the `flush()` calls The checkpointer loop isn't running anyway, so, there's no risk of blocking it through the pre-lock. (cherry picked from commit 1b2663350cc36dfb1ef41131836c9273b7a0de22) --- pageserver/src/pgdatadir_mapping.rs | 4 +++- pageserver/src/tenant/timeline.rs | 35 ++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7857efd03cf8..0b20186efb32 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1118,11 +1118,13 @@ impl<'a> DatadirModification<'a> { let writer = self.tline.writer(); + let mut layer_map = self.tline.layers.write().unwrap(); + // Flush relation and SLRU data blocks, keep metadata. 
let mut result: anyhow::Result<()> = Ok(()); self.pending_updates.retain(|&key, value| { if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) { - result = writer.put(key, self.lsn, value); + result = writer.put_locked(key, self.lsn, value, &mut layer_map); false } else { true diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d0d915adb406..440a3dcd5680 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -119,7 +119,7 @@ pub struct Timeline { pub pg_version: u32, - pub(super) layers: RwLock>, + pub(crate) layers: RwLock>, /// Set of key ranges which should be covered by image layers to /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored. @@ -238,6 +238,8 @@ pub struct Timeline { eviction_task_timeline_state: tokio::sync::Mutex, } +type LayerMapWriteLockGuard<'t> = std::sync::RwLockWriteGuard<'t, LayerMap>; + /// Internal structure to hold all data needed for logical size calculation. /// /// Calculation consists of two stages: @@ -2541,7 +2543,14 @@ impl Timeline { /// fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { let mut layers = self.layers.write().unwrap(); + self.get_layer_for_write_locked(lsn, &mut layers) + } + fn get_layer_for_write_locked( + &self, + lsn: Lsn, + layers: &mut LayerMapWriteLockGuard, + ) -> anyhow::Result> { ensure!(lsn.is_aligned()); let last_record_lsn = self.get_last_record_lsn(); @@ -2591,6 +2600,19 @@ impl Timeline { Ok(()) } + fn put_value_locked( + &self, + key: Key, + lsn: Lsn, + val: &Value, + pre_locked_layer_map: &mut LayerMapWriteLockGuard, + ) -> anyhow::Result<()> { + //info!("PUT: key {} at {}", key, lsn); + let layer = self.get_layer_for_write_locked(lsn, pre_locked_layer_map)?; + layer.put_value(key, lsn, val)?; + Ok(()) + } + fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { let layer = self.get_layer_for_write(lsn)?; layer.put_tombstone(key_range, lsn)?; @@ -4432,6 +4454,17 @@ impl<'a> TimelineWriter<'a> { self.tl.put_value(key, lsn, value) } + pub fn put_locked( + &self, + key: Key, + lsn: Lsn, + value: &Value, + pre_locked_layer_map: &mut LayerMapWriteLockGuard, + ) -> anyhow::Result<()> { + self.tl + .put_value_locked(key, lsn, value, pre_locked_layer_map) + } + pub fn delete(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { self.tl.put_tombstone(key_range, lsn) } From 7de3799e66946556e5ea3d2ac77fe594a4964316 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 12 May 2023 10:53:57 +0200 Subject: [PATCH 20/59] (does not compile): make TimelineWriter `Send` by using tokio::sync Mutex internally fails with cs@devvm:[~/src/neon]: cargo check -p pageserver --features testing Checking pageserver v0.1.0 (/home/cs/src/neon/pageserver) error: future cannot be sent between threads safely --> pageserver/src/tenant/timeline/walreceiver/connection_manager.rs:426:33 | 426 | let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { | ^^^^^^^^^^^^^^^^^ future created by async block is not `Send` | = help: within `Instrumented<[async block@pageserver/src/tenant/timeline/walreceiver/connection_manager.rs:427:13: 439:14]>`, the trait `std::marker::Send` is not implemented for `std::sync::RwLockReadGuard<'_, LayerMap>` note: future is not `Send` as this value is used across an await --> pageserver/src/tenant/timeline.rs:872:46 | 850 | let layers = self.layers.read().unwrap(); | ------ has type `std::sync::RwLockReadGuard<'_, LayerMap>` which is not 
`Send` ... 872 | self.freeze_inmem_layer(true).await; | ^^^^^^ await occurs here, with `layers` maybe used later ... 881 | } | - `layers` is later dropped here note: required by a bound in `TaskHandle::::spawn` --> pageserver/src/tenant/timeline/walreceiver.rs:196:52 | 192 | fn spawn( | ----- required by a bound in this ... 196 | Fut: Future> + Send, | ^^^^ required by this bound in `TaskHandle::::spawn` error: could not compile `pageserver` due to previous error --- pageserver/src/import_datadir.rs | 10 ++--- pageserver/src/pgdatadir_mapping.rs | 10 ++--- pageserver/src/tenant.rs | 32 +++++++-------- pageserver/src/tenant/timeline.rs | 22 +++++----- .../walreceiver/walreceiver_connection.rs | 15 ++++--- pageserver/src/walingest.rs | 40 +++++++++---------- 6 files changed, 66 insertions(+), 63 deletions(-) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 936de35eb99d..9ad0124a8037 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir( { pg_control = Some(control_file); } - modification.flush()?; + modification.flush().await?; } } // We're done importing all the data files. - modification.commit()?; + modification.commit().await?; // We expect the Postgres server to be shut down cleanly. let pg_control = pg_control.context("pg_control file not found")?; @@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar( // We found the pg_control file. pg_control = Some(res); } - modification.flush()?; + modification.flush().await?; } tokio_tar::EntryType::Directory => { debug!("directory {:?}", file_path); @@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar( // sanity check: ensure that pg_control is loaded let _pg_control = pg_control.context("pg_control file not found")?; - modification.commit()?; + modification.commit().await?; Ok(()) } @@ -594,7 +594,7 @@ async fn import_file( // zenith.signal is not necessarily the last file, that we handle // but it is ok to call `finish_write()`, because final `modification.commit()` // will update lsn once more to the final one. - let writer = modification.tline.writer(); + let writer = modification.tline.writer().await; writer.finish_write(prev_lsn); debug!("imported zenith signal {}", prev_lsn); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 0b20186efb32..89f58c049e8c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1108,7 +1108,7 @@ impl<'a> DatadirModification<'a> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub fn flush(&mut self) -> anyhow::Result<()> { + pub async fn flush(&mut self) -> anyhow::Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -1116,7 +1116,7 @@ impl<'a> DatadirModification<'a> { return Ok(()); } - let writer = self.tline.writer(); + let writer = self.tline.writer().await; let mut layer_map = self.tline.layers.write().unwrap(); @@ -1145,8 +1145,8 @@ impl<'a> DatadirModification<'a> { /// underlying timeline. /// All the modifications in this atomic update are stamped by the specified LSN. 
/// - pub fn commit(&mut self) -> anyhow::Result<()> { - let writer = self.tline.writer(); + pub async fn commit(&mut self) -> anyhow::Result<()> { + let writer = self.tline.writer().await; let lsn = self.lsn; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; @@ -1607,7 +1607,7 @@ pub async fn create_test_timeline( .await?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; - m.commit()?; + m.commit().await?; Ok(tline) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 15c420701822..9352168778d2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3306,12 +3306,12 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; writer.finish_write(Lsn(0x10)); drop(writer); - let writer = tline.writer(); + let writer = tline.writer().await; writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -3376,7 +3376,7 @@ mod tests { let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer(); + let writer = tline.writer().await; #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); @@ -3403,7 +3403,7 @@ mod tests { .get_timeline(NEW_TIMELINE_ID, true) .await .expect("Should have a local timeline"); - let new_writer = newtline.writer(); + let new_writer = newtline.writer().await; new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; new_writer.finish_write(Lsn(0x40)); @@ -3430,7 +3430,7 @@ mod tests { let mut lsn = start_lsn; #[allow(non_snake_case)] { - let writer = tline.writer(); + let writer = tline.writer().await; // Create a relation on the timeline writer.put( *TEST_KEY, @@ -3449,7 +3449,7 @@ mod tests { } tline.freeze_and_flush().await?; { - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( *TEST_KEY, lsn, @@ -3783,7 +3783,7 @@ mod tests { .create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) .await?; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; writer.finish_write(Lsn(0x10)); drop(writer); @@ -3791,7 +3791,7 @@ mod tests { tline.freeze_and_flush().await?; tline.compact(&ctx).await?; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -3799,7 +3799,7 @@ mod tests { tline.freeze_and_flush().await?; tline.compact(&ctx).await?; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; writer.finish_write(Lsn(0x30)); drop(writer); @@ -3807,7 +3807,7 @@ mod tests { tline.freeze_and_flush().await?; tline.compact(&ctx).await?; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; writer.finish_write(Lsn(0x40)); drop(writer); @@ -3859,7 +3859,7 @@ mod tests { for _ in 0..50 { for _ in 0..10000 { test_key.field6 = blknum; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( test_key, lsn, @@ -3909,7 +3909,7 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); 
test_key.field6 = blknum as u32; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( test_key, lsn, @@ -3927,7 +3927,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( test_key, lsn, @@ -3984,7 +3984,7 @@ mod tests { for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( test_key, lsn, @@ -4011,7 +4011,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( test_key, lsn, @@ -4078,7 +4078,7 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; - let writer = tline.writer(); + let writer = tline.writer().await; writer.put( test_key, lsn, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 440a3dcd5680..0908119b6347 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -28,7 +28,7 @@ use std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; use std::pin::pin; use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; +use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; use crate::context::{DownloadBehavior, RequestContext}; @@ -179,7 +179,7 @@ pub struct Timeline { /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. - write_lock: Mutex<()>, + write_lock: tokio::sync::Mutex<()>, /// Used to avoid multiple `flush_loop` tasks running flush_loop_state: Mutex, @@ -666,7 +666,7 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))] pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { - self.freeze_inmem_layer(false); + self.freeze_inmem_layer(false).await; self.flush_frozen_layers_and_wait().await } @@ -846,10 +846,10 @@ impl Timeline { } /// Mutate the timeline with a [`TimelineWriter`]. - pub fn writer(&self) -> TimelineWriter<'_> { + pub async fn writer(&self) -> TimelineWriter<'_> { TimelineWriter { tl: self, - _write_guard: self.write_lock.lock().unwrap(), + _write_guard: self.write_lock.lock().await, } } @@ -883,7 +883,7 @@ impl Timeline { /// /// Also flush after a period of time without new data -- it helps /// safekeepers to regard pageserver as caught up and suspend activity. 
- pub fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { + pub async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { let last_lsn = self.get_last_record_lsn(); let layers = self.layers.read().unwrap(); if let Some(open_layer) = &layers.open_layer { @@ -907,7 +907,7 @@ impl Timeline { last_freeze_ts.elapsed() ); - self.freeze_inmem_layer(true); + self.freeze_inmem_layer(true).await; self.last_freeze_at.store(last_lsn); *(self.last_freeze_ts.write().unwrap()) = Instant::now(); @@ -1381,7 +1381,7 @@ impl Timeline { layer_flush_start_tx, layer_flush_done_tx, - write_lock: Mutex::new(()), + write_lock: tokio::sync::Mutex::new(()), layer_removal_cs: Default::default(), gc_info: std::sync::RwLock::new(GcInfo { @@ -2627,13 +2627,13 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } - fn freeze_inmem_layer(&self, write_lock_held: bool) { + async fn freeze_inmem_layer(&self, write_lock_held: bool) { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. let _write_guard = if write_lock_held { None } else { - Some(self.write_lock.lock().unwrap()) + Some(self.write_lock.lock().await) }; let mut layers = self.layers.write().unwrap(); if let Some(open_layer) = &layers.open_layer { @@ -4434,7 +4434,7 @@ fn layer_traversal_error(msg: String, path: Vec) -> PageRecon // but will cause large code changes. pub struct TimelineWriter<'a> { tl: &'a Timeline, - _write_guard: MutexGuard<'a, ()>, + _write_guard: tokio::sync::MutexGuard<'a, ()>, } impl Deref for TimelineWriter<'_> { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 1cbed3416c23..91ff60603abb 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -313,12 +313,15 @@ pub(super) async fn handle_walreceiver_connection( } } - timeline.check_checkpoint_distance().with_context(|| { - format!( - "Failed to check checkpoint distance for timeline {}", - timeline.timeline_id - ) - })?; + timeline + .check_checkpoint_distance() + .await + .with_context(|| { + format!( + "Failed to check checkpoint distance for timeline {}", + timeline.timeline_id + ) + })?; if let Some(last_lsn) = status_update { let timeline_remote_consistent_lsn = diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index eb74f7584658..fc0cc5c81ea4 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -333,7 +333,7 @@ impl<'a> WalIngest<'a> { // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - modification.commit()?; + modification.commit().await?; Ok(()) } @@ -1200,7 +1200,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file - m.commit()?; + m.commit().await?; let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?; Ok(walingest) @@ -1217,22 +1217,22 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; - m.commit()?; + m.commit().await?; let mut m = tline.begin_modification(Lsn(0x30)); walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx) .await?; - m.commit()?; + m.commit().await?; let mut m = tline.begin_modification(Lsn(0x40)); walingest 
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx) .await?; - m.commit()?; + m.commit().await?; let mut m = tline.begin_modification(Lsn(0x50)); walingest .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx) .await?; - m.commit()?; + m.commit().await?; assert_current_logical_size(&tline, Lsn(0x50)); @@ -1318,7 +1318,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx) .await?; - m.commit()?; + m.commit().await?; assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation @@ -1360,7 +1360,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx) .await?; - m.commit()?; + m.commit().await?; assert_eq!( tline .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) @@ -1373,7 +1373,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx) .await?; - m.commit()?; + m.commit().await?; assert_eq!( tline .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx) @@ -1398,7 +1398,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx) .await?; - m.commit()?; + m.commit().await?; assert_eq!( tline .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) @@ -1435,7 +1435,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; - m.commit()?; + m.commit().await?; // Check that rel exists and size is correct assert_eq!( @@ -1454,7 +1454,7 @@ mod tests { // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?; - m.commit()?; + m.commit().await?; // Check that rel is not visible anymore assert_eq!( @@ -1472,7 +1472,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx) .await?; - m.commit()?; + m.commit().await?; // Check that rel exists and size is correct assert_eq!( @@ -1509,7 +1509,7 @@ mod tests { .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } - m.commit()?; + m.commit().await?; // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( @@ -1554,7 +1554,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx) .await?; - m.commit()?; + m.commit().await?; // Check reported size and contents after truncation assert_eq!( @@ -1603,7 +1603,7 @@ mod tests { .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } - m.commit()?; + m.commit().await?; assert_eq!( tline @@ -1648,7 +1648,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; - m.commit()?; + m.commit().await?; } assert_current_logical_size(&tline, Lsn(lsn)); @@ -1664,7 +1664,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx) .await?; - m.commit()?; + m.commit().await?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE @@ -1677,7 +1677,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx) .await?; - m.commit()?; + m.commit().await?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE - 1 @@ -1693,7 +1693,7 @@ mod tests { walingest .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx) .await?; - m.commit()?; + m.commit().await?; assert_eq!( tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, size as BlockNumber From 2001c31a14e723c25afef3afe00ba0111eb708a9 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 26 May 2023 18:49:39 +0200 Subject: [PATCH 21/59] turn Timeline::layers into tokio::sync::RwLock --- pageserver/src/disk_usage_eviction_task.rs | 2 +- pageserver/src/http/routes.rs | 11 +- pageserver/src/pgdatadir_mapping.rs | 6 +- pageserver/src/tenant.rs | 180 +++++++++++------- .../tenant/storage_layer/inmemory_layer.rs | 2 +- pageserver/src/tenant/timeline.rs | 96 +++++----- .../src/tenant/timeline/eviction_task.rs | 2 +- 7 files changed, 175 insertions(+), 124 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 7f8691d81efe..9e5f644759c8 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -512,7 +512,7 @@ async fn collect_eviction_candidates( if !tl.is_active() { continue; } - let info = tl.get_local_layers_for_disk_usage_eviction(); + let info = tl.get_local_layers_for_disk_usage_eviction().await; debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); tenant_candidates.extend( info.resident_layers diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e944d6b8ff4c..2e9f230ace93 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -180,7 +180,7 @@ async fn build_timeline_info( ) -> anyhow::Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); - let mut info = build_timeline_info_common(timeline, ctx)?; + let mut info = build_timeline_info_common(timeline, ctx).await?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. 
// Otherwise, if someone deletes the timeline / detaches the tenant while @@ -198,7 +198,7 @@ async fn build_timeline_info( Ok(info) } -fn build_timeline_info_common( +async fn build_timeline_info_common( timeline: &Arc, ctx: &RequestContext, ) -> anyhow::Result { @@ -229,7 +229,7 @@ fn build_timeline_info_common( None } }; - let current_physical_size = Some(timeline.layer_size_sum()); + let current_physical_size = Some(timeline.layer_size_sum().await); let state = timeline.current_state(); let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); @@ -291,6 +291,7 @@ async fn timeline_create_handler(mut request: Request) -> Result { // Created. Construct a TimelineInfo for it. let timeline_info = build_timeline_info_common(&new_timeline, &ctx) + .await .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } @@ -522,7 +523,7 @@ async fn tenant_status(request: Request) -> Result, ApiErro // Calculate total physical size of all timelines let mut current_physical_size = 0; for timeline in tenant.list_timelines().await.iter() { - current_physical_size += timeline.layer_size_sum(); + current_physical_size += timeline.layer_size_sum().await; } let state = tenant.current_state(); @@ -627,7 +628,7 @@ async fn layer_map_info_handler(request: Request) -> Result check_permission(&request, Some(tenant_id))?; let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; - let layer_map_info = timeline.layer_map_info(reset); + let layer_map_info = timeline.layer_map_info(reset).await; json_response(StatusCode::OK, layer_map_info) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 89f58c049e8c..bf4204738417 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1118,7 +1118,7 @@ impl<'a> DatadirModification<'a> { let writer = self.tline.writer().await; - let mut layer_map = self.tline.layers.write().unwrap(); + let mut layer_map = self.tline.layers.write().await; // Flush relation and SLRU data blocks, keep metadata. let mut result: anyhow::Result<()> = Ok(()); @@ -1152,10 +1152,10 @@ impl<'a> DatadirModification<'a> { self.pending_nblocks = 0; for (key, value) in self.pending_updates.drain() { - writer.put(key, lsn, &value)?; + writer.put(key, lsn, &value).await?; } for key_range in self.pending_deletions.drain(..) { - writer.delete(key_range, lsn)?; + writer.delete(key_range, lsn).await?; } writer.finish_write(lsn); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 9352168778d2..7f589cb971e2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -213,6 +213,7 @@ impl UninitializedTimeline<'_> { if load_layer_map { new_timeline .load_layer_map(new_disk_consistent_lsn) + .await .with_context(|| { format!( "Failed to load layermap for timeline {tenant_id}/{timeline_id}" @@ -556,7 +557,7 @@ impl Tenant { || timeline .layers .read() - .unwrap() + .await .iter_historic_layers() .next() .is_some(), @@ -1206,6 +1207,7 @@ impl Tenant { true, None, ) + .await } /// Helper for unit tests to create an emtpy timeline. @@ -2475,7 +2477,8 @@ impl Tenant { timeline_uninit_mark, false, Some(Arc::clone(src_timeline)), - )? + ) + .await? .initialize_with_lock(ctx, &mut timelines, true) .await? 
}; @@ -2554,8 +2557,9 @@ impl Tenant { pgdata_lsn, pg_version, ); - let raw_timeline = - self.prepare_timeline(timeline_id, &new_metadata, timeline_uninit_mark, true, None)?; + let raw_timeline = self + .prepare_timeline(timeline_id, &new_metadata, timeline_uninit_mark, true, None) + .await?; let tenant_id = raw_timeline.owning_tenant.tenant_id; let unfinished_timeline = raw_timeline.raw_timeline()?; @@ -2610,7 +2614,7 @@ impl Tenant { /// Creates intermediate timeline structure and its files, without loading it into memory. /// It's up to the caller to import the necesary data and import the timeline into memory. - fn prepare_timeline( + async fn prepare_timeline( &self, new_timeline_id: TimelineId, new_metadata: &TimelineMetadata, @@ -2642,7 +2646,7 @@ impl Tenant { ) { Ok(new_timeline) => { if init_layers { - new_timeline.layers.write().unwrap().next_open_layer_at = + new_timeline.layers.write().await.next_open_layer_at = Some(new_timeline.initdb_lsn); } debug!( @@ -3307,12 +3311,16 @@ mod tests { .await?; let writer = tline.writer().await; - writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; + writer + .put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10"))) + .await?; writer.finish_write(Lsn(0x10)); drop(writer); let writer = tline.writer().await; - writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; + writer + .put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20"))) + .await?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -3384,13 +3392,21 @@ mod tests { let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap(); // Insert a value on the timeline - writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?; - writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?; + writer + .put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20")) + .await?; + writer + .put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20")) + .await?; writer.finish_write(Lsn(0x20)); - writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?; + writer + .put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30")) + .await?; writer.finish_write(Lsn(0x30)); - writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?; + writer + .put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40")) + .await?; writer.finish_write(Lsn(0x40)); //assert_current_logical_size(&tline, Lsn(0x40)); @@ -3404,7 +3420,9 @@ mod tests { .await .expect("Should have a local timeline"); let new_writer = newtline.writer().await; - new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?; + new_writer + .put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40")) + .await?; new_writer.finish_write(Lsn(0x40)); // Check page contents on both branches @@ -3432,36 +3450,44 @@ mod tests { { let writer = tline.writer().await; // Create a relation on the timeline - writer.put( - *TEST_KEY, - lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), - )?; + writer + .put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + ) + .await?; writer.finish_write(lsn); lsn += 0x10; - writer.put( - *TEST_KEY, - lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), - )?; + writer + .put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + ) + .await?; writer.finish_write(lsn); lsn += 0x10; } tline.freeze_and_flush().await?; { let writer = tline.writer().await; - writer.put( - *TEST_KEY, - lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), - )?; + writer + .put( + *TEST_KEY, + lsn, + 
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + ) + .await?; writer.finish_write(lsn); lsn += 0x10; - writer.put( - *TEST_KEY, - lsn, - &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), - )?; + writer + .put( + *TEST_KEY, + lsn, + &Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + ) + .await?; writer.finish_write(lsn); } tline.freeze_and_flush().await @@ -3784,7 +3810,9 @@ mod tests { .await?; let writer = tline.writer().await; - writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; + writer + .put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10"))) + .await?; writer.finish_write(Lsn(0x10)); drop(writer); @@ -3792,7 +3820,9 @@ mod tests { tline.compact(&ctx).await?; let writer = tline.writer().await; - writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; + writer + .put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20"))) + .await?; writer.finish_write(Lsn(0x20)); drop(writer); @@ -3800,7 +3830,9 @@ mod tests { tline.compact(&ctx).await?; let writer = tline.writer().await; - writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; + writer + .put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30"))) + .await?; writer.finish_write(Lsn(0x30)); drop(writer); @@ -3808,7 +3840,9 @@ mod tests { tline.compact(&ctx).await?; let writer = tline.writer().await; - writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; + writer + .put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40"))) + .await?; writer.finish_write(Lsn(0x40)); drop(writer); @@ -3860,11 +3894,13 @@ mod tests { for _ in 0..10000 { test_key.field6 = blknum; let writer = tline.writer().await; - writer.put( - test_key, - lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), - )?; + writer + .put( + test_key, + lsn, + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + ) + .await?; writer.finish_write(lsn); drop(writer); @@ -3910,11 +3946,13 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; let writer = tline.writer().await; - writer.put( - test_key, - lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), - )?; + writer + .put( + test_key, + lsn, + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + ) + .await?; writer.finish_write(lsn); updated[blknum] = lsn; drop(writer); @@ -3928,11 +3966,13 @@ mod tests { let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; let writer = tline.writer().await; - writer.put( - test_key, - lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), - )?; + writer + .put( + test_key, + lsn, + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + ) + .await?; writer.finish_write(lsn); drop(writer); updated[blknum] = lsn; @@ -3985,11 +4025,13 @@ mod tests { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; let writer = tline.writer().await; - writer.put( - test_key, - lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), - )?; + writer + .put( + test_key, + lsn, + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + ) + .await?; writer.finish_write(lsn); updated[blknum] = lsn; drop(writer); @@ -4012,11 +4054,13 @@ mod tests { let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; let writer = tline.writer().await; - writer.put( - test_key, - lsn, - &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), - )?; + writer + .put( + test_key, + lsn, + &Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + 
) + .await?; println!("updating {} at {}", blknum, lsn); writer.finish_write(lsn); drop(writer); @@ -4079,11 +4123,13 @@ mod tests { let blknum = thread_rng().gen_range(0..NUM_KEYS); test_key.field6 = blknum as u32; let writer = tline.writer().await; - writer.put( - test_key, - lsn, - &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), - )?; + writer + .put( + test_key, + lsn, + &Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))), + ) + .await?; println!("updating [{}][{}] at {}", idx, blknum, lsn); writer.finish_write(lsn); drop(writer); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index c453683fea53..78bcfdafc0d2 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -304,7 +304,7 @@ impl InMemoryLayer { Ok(()) } - pub fn put_tombstone(&self, _key_range: Range, _lsn: Lsn) -> Result<()> { + pub async fn put_tombstone(&self, _key_range: Range, _lsn: Lsn) -> Result<()> { // TODO: Currently, we just leak the storage for any deleted keys Ok(()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0908119b6347..ecb01708acdc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -119,7 +119,7 @@ pub struct Timeline { pub pg_version: u32, - pub(crate) layers: RwLock>, + pub(crate) layers: tokio::sync::RwLock>, /// Set of key ranges which should be covered by image layers to /// allow GC to remove old layers. This set is created by GC and its cutoff LSN is also stored. @@ -238,7 +238,7 @@ pub struct Timeline { eviction_task_timeline_state: tokio::sync::Mutex, } -type LayerMapWriteLockGuard<'t> = std::sync::RwLockWriteGuard<'t, LayerMap>; +type LayerMapWriteLockGuard<'t> = tokio::sync::RwLockWriteGuard<'t, LayerMap>; /// Internal structure to hold all data needed for logical size calculation. /// @@ -574,8 +574,8 @@ impl Timeline { /// The sum of the file size of all historic layers in the layer map. /// This method makes no distinction between local and remote layers. /// Hence, the result **does not represent local filesystem usage**. - pub fn layer_size_sum(&self) -> u64 { - let layer_map = self.layers.read().unwrap(); + pub async fn layer_size_sum(&self) -> u64 { + let layer_map = self.layers.read().await; let mut size = 0; for l in layer_map.iter_historic_layers() { size += l.file_size(); @@ -885,7 +885,7 @@ impl Timeline { /// safekeepers to regard pageserver as caught up and suspend activity. 
pub async fn check_checkpoint_distance(self: &Arc) -> anyhow::Result<()> { let last_lsn = self.get_last_record_lsn(); - let layers = self.layers.read().unwrap(); + let layers = self.layers.read().await; if let Some(open_layer) = &layers.open_layer { let open_layer_size = open_layer.size()?; drop(layers); @@ -981,8 +981,8 @@ impl Timeline { } } - pub fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { - let layer_map = self.layers.read().unwrap(); + pub async fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo { + let layer_map = self.layers.read().await; let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1); if let Some(open_layer) = &layer_map.open_layer { in_memory_layers.push(open_layer.info()); @@ -1004,7 +1004,7 @@ impl Timeline { #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))] pub async fn download_layer(&self, layer_file_name: &str) -> anyhow::Result> { - let Some(layer) = self.find_layer(layer_file_name) else { return Ok(None) }; + let Some(layer) = self.find_layer(layer_file_name).await else { return Ok(None) }; let Some(remote_layer) = layer.downcast_remote_layer() else { return Ok(Some(false)) }; if self.remote_client.is_none() { return Ok(Some(false)); @@ -1017,7 +1017,7 @@ impl Timeline { /// Like [`evict_layer_batch`], but for just one layer. /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`. pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result> { - let Some(local_layer) = self.find_layer(layer_file_name) else { return Ok(None) }; + let Some(local_layer) = self.find_layer(layer_file_name).await else { return Ok(None) }; let remote_client = self .remote_client .as_ref() @@ -1102,7 +1102,7 @@ impl Timeline { } // start the batch update - let mut layer_map = self.layers.write().unwrap(); + let mut layer_map = self.layers.write().await; let mut batch_updates = layer_map.batch_update(); let mut results = Vec::with_capacity(layers_to_evict.len()); @@ -1346,7 +1346,7 @@ impl Timeline { timeline_id, tenant_id, pg_version, - layers: RwLock::new(LayerMap::default()), + layers: tokio::sync::RwLock::new(LayerMap::default()), wanted_image_layers: Mutex::new(None), walredo_mgr, @@ -1519,8 +1519,8 @@ impl Timeline { /// Scan the timeline directory to populate the layer map. /// Returns all timeline-related files that were found and loaded. /// - pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { - let mut layers = self.layers.write().unwrap(); + pub(super) async fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { + let mut layers = self.layers.write().await; let mut updates = layers.batch_update(); let mut num_layers = 0; @@ -1649,7 +1649,7 @@ impl Timeline { // We're holding a layer map lock for a while but this // method is only called during init so it's fine. 
- let mut layer_map = self.layers.write().unwrap(); + let mut layer_map = self.layers.write().await; let mut updates = layer_map.batch_update(); for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1802,7 +1802,7 @@ impl Timeline { let local_layers = self .layers .read() - .unwrap() + .await .iter_historic_layers() .map(|l| (l.filename(), l)) .collect::>(); @@ -2154,8 +2154,8 @@ impl Timeline { } } - fn find_layer(&self, layer_file_name: &str) -> Option> { - for historic_layer in self.layers.read().unwrap().iter_historic_layers() { + async fn find_layer(&self, layer_file_name: &str) -> Option> { + for historic_layer in self.layers.read().await.iter_historic_layers() { let historic_layer_name = historic_layer.filename().file_name(); if layer_file_name == historic_layer_name { return Some(historic_layer); @@ -2362,7 +2362,7 @@ impl Timeline { #[allow(clippy::never_loop)] // see comment at bottom of this loop 'layer_map_search: loop { let remote_layer = { - let layers = timeline.layers.read().unwrap(); + let layers = timeline.layers.read().await; // Check the open and frozen in-memory layers first, in order from newest // to oldest. @@ -2541,8 +2541,8 @@ impl Timeline { /// /// Get a handle to the latest layer for appending. /// - fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { - let mut layers = self.layers.write().unwrap(); + async fn get_layer_for_write(&self, lsn: Lsn) -> anyhow::Result> { + let mut layers = self.layers.write().await; self.get_layer_for_write_locked(lsn, &mut layers) } @@ -2593,9 +2593,9 @@ impl Timeline { Ok(layer) } - fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> { + async fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> anyhow::Result<()> { //info!("PUT: key {} at {}", key, lsn); - let layer = self.get_layer_for_write(lsn)?; + let layer = self.get_layer_for_write(lsn).await?; layer.put_value(key, lsn, val)?; Ok(()) } @@ -2613,9 +2613,9 @@ impl Timeline { Ok(()) } - fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { - let layer = self.get_layer_for_write(lsn)?; - layer.put_tombstone(key_range, lsn)?; + async fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { + let layer = self.get_layer_for_write(lsn).await?; + layer.put_tombstone(key_range, lsn).await?; Ok(()) } @@ -2635,7 +2635,7 @@ impl Timeline { } else { Some(self.write_lock.lock().await) }; - let mut layers = self.layers.write().unwrap(); + let mut layers = self.layers.write().await; if let Some(open_layer) = &layers.open_layer { let open_layer_rc = Arc::clone(open_layer); // Does this layer need freezing? @@ -2673,7 +2673,7 @@ impl Timeline { let flush_counter = *layer_flush_start_rx.borrow(); let result = loop { let layer_to_flush = { - let layers = self.layers.read().unwrap(); + let layers = self.layers.read().await; layers.frozen_layers.front().cloned() // drop 'layers' lock to allow concurrent reads and writes }; @@ -2765,7 +2765,7 @@ impl Timeline { .await? } else { // normal case, write out a L0 delta layer file. - let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?; + let (delta_path, metadata) = self.create_delta_layer(&frozen_layer).await?; HashMap::from([(delta_path, metadata)]) }; @@ -2774,7 +2774,7 @@ impl Timeline { // The new on-disk layers are now in the layer map. We can remove the // in-memory layer from the map now. 
{ - let mut layers = self.layers.write().unwrap(); + let mut layers = self.layers.write().await; let l = layers.frozen_layers.pop_front(); // Only one thread may call this function at a time (for this @@ -2868,7 +2868,7 @@ impl Timeline { } // Write out the given frozen in-memory layer as a new L0 delta file - fn create_delta_layer( + async fn create_delta_layer( &self, frozen_layer: &InMemoryLayer, ) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> { @@ -2892,7 +2892,7 @@ impl Timeline { // Add it to the layer map let l = Arc::new(new_delta); - let mut layers = self.layers.write().unwrap(); + let mut layers = self.layers.write().await; let mut batch_updates = layers.batch_update(); l.access_stats().record_residence_event( &batch_updates, @@ -2944,10 +2944,14 @@ impl Timeline { } // Is it time to create a new image layer for the given partition? - fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result { + async fn time_for_new_image_layer( + &self, + partition: &KeySpace, + lsn: Lsn, + ) -> anyhow::Result { let threshold = self.get_image_creation_threshold(); - let layers = self.layers.read().unwrap(); + let layers = self.layers.read().await; let mut max_deltas = 0; { @@ -3042,7 +3046,7 @@ impl Timeline { for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; start = img_range.end; - if force || self.time_for_new_image_layer(partition, lsn)? { + if force || self.time_for_new_image_layer(partition, lsn).await? { let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, @@ -3120,7 +3124,7 @@ impl Timeline { let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); - let mut layers = self.layers.write().unwrap(); + let mut layers = self.layers.write().await; let mut updates = layers.batch_update(); let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); for l in image_layers { @@ -3187,7 +3191,7 @@ impl Timeline { target_file_size: u64, ctx: &RequestContext, ) -> Result { - let layers = self.layers.read().unwrap(); + let layers = self.layers.read().await; let mut level0_deltas = layers.get_level0_deltas()?; // Only compact if enough layers have accumulated. @@ -3545,7 +3549,7 @@ impl Timeline { .context("wait for layer upload ops to complete")?; } - let mut layers = self.layers.write().unwrap(); + let mut layers = self.layers.write().await; let mut updates = layers.batch_update(); let mut new_layer_paths = HashMap::with_capacity(new_layers.len()); for l in new_layers { @@ -3805,7 +3809,7 @@ impl Timeline { // 4. newer on-disk image layers cover the layer's whole key range // // TODO holding a write lock is too agressive and avoidable - let mut layers = self.layers.write().unwrap(); + let mut layers = self.layers.write().await; 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -4101,7 +4105,7 @@ impl Timeline { // Download complete. Replace the RemoteLayer with the corresponding // Delta- or ImageLayer in the layer map. 
- let mut layers = self_clone.layers.write().unwrap(); + let mut layers = self_clone.layers.write().await; let mut updates = layers.batch_update(); let new_layer = remote_layer.create_downloaded_layer(&updates, self_clone.conf, *size); { @@ -4259,7 +4263,7 @@ impl Timeline { ) { let mut downloads = Vec::new(); { - let layers = self.layers.read().unwrap(); + let layers = self.layers.read().await; layers .iter_historic_layers() .filter_map(|l| l.downcast_remote_layer()) @@ -4361,8 +4365,8 @@ impl LocalLayerInfoForDiskUsageEviction { } impl Timeline { - pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { - let layers = self.layers.read().unwrap(); + pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { + let layers = self.layers.read().await; let mut max_layer_size: Option = None; let mut resident_layers = Vec::new(); @@ -4450,8 +4454,8 @@ impl<'a> TimelineWriter<'a> { /// /// This will implicitly extend the relation, if the page is beyond the /// current end-of-file. - pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> { - self.tl.put_value(key, lsn, value) + pub async fn put(&self, key: Key, lsn: Lsn, value: &Value) -> anyhow::Result<()> { + self.tl.put_value(key, lsn, value).await } pub fn put_locked( @@ -4465,8 +4469,8 @@ impl<'a> TimelineWriter<'a> { .put_value_locked(key, lsn, value, pre_locked_layer_map) } - pub fn delete(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { - self.tl.put_tombstone(key_range, lsn) + pub async fn delete(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { + self.tl.put_tombstone(key_range, lsn).await } /// Track the end of the latest digested WAL record. diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 558600692ec7..a7f24c52edc7 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -185,7 +185,7 @@ impl Timeline { // We don't want to hold the layer map lock during eviction. // So, we just need to deal with this. let candidates: Vec> = { - let layers = self.layers.read().unwrap(); + let layers = self.layers.read().await; let mut candidates = Vec::new(); for hist_layer in layers.iter_historic_layers() { if hist_layer.is_remote_layer() { From 4e359db4c78e0fd3d3e8f6a69baac4fb5b80b752 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 26 May 2023 17:15:47 -0400 Subject: [PATCH 22/59] pgserver: spawn_blocking in compaction (#4265) Compaction is usually a compute-heavy process and might affect other futures running on the thread of the compaction. Therefore, we add `block_in_place` as a temporary solution to avoid blocking other futures on the same thread as compaction in the runtime. As we are migrating towards a fully-async-style pageserver, we can revert this change when everything is async and when we move compaction to a separate runtime. --------- Signed-off-by: Alex Chi --- pageserver/src/context.rs | 3 +- pageserver/src/tenant/par_fsync.rs | 52 +++++++++++++---- pageserver/src/tenant/timeline.rs | 89 ++++++++++++++++++------------ 3 files changed, 98 insertions(+), 46 deletions(-) diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index e826d28e6d38..f53b7736abe5 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -88,6 +88,7 @@ use crate::task_mgr::TaskKind; // The main structure of this module, see module-level comment. 
+#[derive(Clone, Debug)] pub struct RequestContext { task_kind: TaskKind, download_behavior: DownloadBehavior, @@ -95,7 +96,7 @@ pub struct RequestContext { /// Desired behavior if the operation requires an on-demand download /// to proceed. -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum DownloadBehavior { /// Download the layer file. It can take a while. Download, diff --git a/pageserver/src/tenant/par_fsync.rs b/pageserver/src/tenant/par_fsync.rs index 0b0217ab58ca..3cbcfe8774d2 100644 --- a/pageserver/src/tenant/par_fsync.rs +++ b/pageserver/src/tenant/par_fsync.rs @@ -19,14 +19,8 @@ fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result Ok(()) } -pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> { - const PARALLEL_PATH_THRESHOLD: usize = 1; - if paths.len() <= PARALLEL_PATH_THRESHOLD { - for path in paths { - fsync_path(path)?; - } - return Ok(()); - } +fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> { + // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything. /// Use at most this number of threads. /// Increasing this limit will @@ -36,11 +30,11 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> { let num_threads = paths.len().min(MAX_NUM_THREADS); let next_path_idx = AtomicUsize::new(0); - crossbeam_utils::thread::scope(|s| -> io::Result<()> { + std::thread::scope(|s| -> io::Result<()> { let mut handles = vec![]; // Spawn `num_threads - 1`, as the current thread is also a worker. for _ in 1..num_threads { - handles.push(s.spawn(|_| parallel_worker(paths, &next_path_idx))); + handles.push(s.spawn(|| parallel_worker(paths, &next_path_idx))); } parallel_worker(paths, &next_path_idx)?; @@ -51,5 +45,41 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> { Ok(()) }) - .unwrap() +} + +/// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool. +pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> { + if paths.len() == 1 { + fsync_path(&paths[0])?; + return Ok(()); + } + + fsync_in_thread_pool(paths) +} + +/// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current +/// execution thread. Otherwise, we will spawn_blocking and run it in tokio. +pub async fn par_fsync_async(paths: &[PathBuf]) -> io::Result<()> { + const MAX_CONCURRENT_FSYNC: usize = 64; + let mut next = paths.iter().peekable(); + let mut js = tokio::task::JoinSet::new(); + loop { + while js.len() < MAX_CONCURRENT_FSYNC && next.peek().is_some() { + let next = next.next().expect("just peeked"); + let next = next.to_owned(); + js.spawn_blocking(move || fsync_path(&next)); + } + + // now the joinset has been filled up, wait for next to complete + if let Some(res) = js.join_next().await { + res??; + } else { + // last item had already completed + assert!( + next.peek().is_none(), + "joinset emptied, we shouldn't have more work" + ); + return Ok(()); + } + } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b0aca458826b..4bfebd93df66 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -195,8 +195,9 @@ pub struct Timeline { /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], - /// and [`Tenant::delete_timeline`]. 
- pub(super) layer_removal_cs: tokio::sync::Mutex<()>, + /// and [`Tenant::delete_timeline`]. This is an `Arc` lock because we need an owned + /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`). + pub(super) layer_removal_cs: Arc>, // Needed to ensure that we can't create a branch at a point that was already garbage collected pub latest_gc_cutoff_lsn: Rcu, @@ -669,7 +670,7 @@ impl Timeline { } /// Outermost timeline compaction operation; downloads needed layers. - pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> { + pub async fn compact(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { const ROUNDS: usize = 2; let last_record_lsn = self.get_last_record_lsn(); @@ -758,7 +759,7 @@ impl Timeline { } /// Compaction which might need to be retried after downloading remote layers. - async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> { + async fn compact_inner(self: &Arc, ctx: &RequestContext) -> Result<(), CompactionError> { // // High level strategy for compaction / image creation: // @@ -793,7 +794,7 @@ impl Timeline { // Below are functions compact_level0() and create_image_layers() // but they are a bit ad hoc and don't quite work like it's explained // above. Rewrite it. - let layer_removal_cs = self.layer_removal_cs.lock().await; + let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await); // Is the timeline being deleted? let state = *self.state.borrow(); if state == TimelineState::Stopping { @@ -827,7 +828,7 @@ impl Timeline { // 3. Compact let timer = self.metrics.compact_time_histo.start_timer(); - self.compact_level0(&layer_removal_cs, target_file_size, ctx) + self.compact_level0(layer_removal_cs.clone(), target_file_size, ctx) .await?; timer.stop_and_record(); } @@ -2168,7 +2169,7 @@ impl Timeline { fn delete_historic_layer( &self, // we cannot remove layers otherwise, since gc and compaction will race - _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + _layer_removal_cs: Arc>, layer: Arc, updates: &mut BatchedUpdates<'_, dyn PersistentLayer>, ) -> anyhow::Result<()> { @@ -2632,7 +2633,7 @@ impl Timeline { /// Layer flusher task's main loop. async fn flush_loop( - &self, + self: &Arc, mut layer_flush_start_rx: tokio::sync::watch::Receiver, ctx: &RequestContext, ) { @@ -2723,7 +2724,7 @@ impl Timeline { /// Flush one frozen in-memory layer to disk, as a new delta layer. #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] async fn flush_frozen_layer( - &self, + self: &Arc, frozen_layer: Arc, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -2743,7 +2744,11 @@ impl Timeline { .await? } else { // normal case, write out a L0 delta layer file. 
- let (delta_path, metadata) = self.create_delta_layer(&frozen_layer)?; + let this = self.clone(); + let frozen_layer = frozen_layer.clone(); + let (delta_path, metadata) = + tokio::task::spawn_blocking(move || this.create_delta_layer(&frozen_layer)) + .await??; HashMap::from([(delta_path, metadata)]) }; @@ -2847,7 +2852,7 @@ impl Timeline { // Write out the given frozen in-memory layer as a new L0 delta file fn create_delta_layer( - &self, + self: &Arc, frozen_layer: &InMemoryLayer, ) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> { // Write it out @@ -2863,10 +2868,13 @@ impl Timeline { // TODO: If we're running inside 'flush_frozen_layers' and there are multiple // files to flush, it might be better to first write them all, and then fsync // them all in parallel. - par_fsync::par_fsync(&[ - new_delta_path.clone(), - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - ])?; + + // First sync the delta layer. We still use par_fsync here to keep everything consistent. Feel free to replace + // this with a single fsync in future refactors. + par_fsync::par_fsync(&[new_delta_path.clone()]).context("fsync of delta layer")?; + // Then sync the parent directory. + par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)]) + .context("fsync of timeline dir")?; // Add it to the layer map let l = Arc::new(new_delta); @@ -3090,11 +3098,15 @@ impl Timeline { let all_paths = image_layers .iter() .map(|layer| layer.path()) - .chain(std::iter::once( - self.conf.timeline_path(&self.timeline_id, &self.tenant_id), - )) .collect::>(); - par_fsync::par_fsync(&all_paths).context("fsync of newly created layer files")?; + + par_fsync::par_fsync_async(&all_paths) + .await + .context("fsync of newly created layer files")?; + + par_fsync::par_fsync_async(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)]) + .await + .context("fsync of timeline dir")?; let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); @@ -3159,9 +3171,9 @@ impl Timeline { /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the /// start of level0 files compaction, the on-demand download should be revisited as well. - async fn compact_level0_phase1( + fn compact_level0_phase1( &self, - _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + _layer_removal_cs: Arc>, target_file_size: u64, ctx: &RequestContext, ) -> Result { @@ -3474,13 +3486,13 @@ impl Timeline { if !new_layers.is_empty() { let mut layer_paths: Vec = new_layers.iter().map(|l| l.path()).collect(); - // also sync the directory - layer_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id)); - // Fsync all the layer files and directory using multiple threads to // minimize latency. par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?; + par_fsync::par_fsync(&[self.conf.timeline_path(&self.timeline_id, &self.tenant_id)]) + .context("fsync of timeline dir")?; + layer_paths.pop().unwrap(); } @@ -3497,17 +3509,22 @@ impl Timeline { /// as Level 1 files. 
/// async fn compact_level0( - &self, - layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + self: &Arc, + layer_removal_cs: Arc>, target_file_size: u64, ctx: &RequestContext, ) -> Result<(), CompactionError> { + let this = self.clone(); + let ctx_inner = ctx.clone(); + let layer_removal_cs_inner = layer_removal_cs.clone(); let CompactLevel0Phase1Result { new_layers, deltas_to_compact, - } = self - .compact_level0_phase1(layer_removal_cs, target_file_size, ctx) - .await?; + } = tokio::task::spawn_blocking(move || { + this.compact_level0_phase1(layer_removal_cs_inner, target_file_size, &ctx_inner) + }) + .await + .unwrap()?; if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do @@ -3565,7 +3582,7 @@ impl Timeline { let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len()); for l in deltas_to_compact { layer_names_to_delete.push(l.filename()); - self.delete_historic_layer(layer_removal_cs, l, &mut updates)?; + self.delete_historic_layer(layer_removal_cs.clone(), l, &mut updates)?; } updates.flush(); drop(layers); @@ -3685,7 +3702,7 @@ impl Timeline { fail_point!("before-timeline-gc"); - let layer_removal_cs = self.layer_removal_cs.lock().await; + let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await); // Is the timeline being deleted? let state = *self.state.borrow(); if state == TimelineState::Stopping { @@ -3705,7 +3722,7 @@ impl Timeline { let res = self .gc_timeline( - &layer_removal_cs, + layer_removal_cs.clone(), horizon_cutoff, pitr_cutoff, retain_lsns, @@ -3724,7 +3741,7 @@ impl Timeline { async fn gc_timeline( &self, - layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>, + layer_removal_cs: Arc>, horizon_cutoff: Lsn, pitr_cutoff: Lsn, retain_lsns: Vec, @@ -3897,7 +3914,11 @@ impl Timeline { { for doomed_layer in layers_to_remove { layer_names_to_delete.push(doomed_layer.filename()); - self.delete_historic_layer(layer_removal_cs, doomed_layer, &mut updates)?; // FIXME: schedule succeeded deletions before returning? + self.delete_historic_layer( + layer_removal_cs.clone(), + doomed_layer, + &mut updates, + )?; // FIXME: schedule succeeded deletions before returning? result.layers_removed += 1; } } From 200a520e6c73ce028a4964e9f9d1c1eb92515415 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 27 May 2023 01:07:00 +0300 Subject: [PATCH 23/59] Minor refactoring in RequestSpan Require the error type to be ApiError. It implicitly required that anyway, because the function used error::handler, which downcasted the error to an ApiError. If the error was in fact anything else than ApiError, it would just panic. Better to check it at compilation time. Also make the last-resort error handler more forgiving, so that it returns an 500 Internal Server error response, instead of panicking, if a request handler returns some other error than an ApiError. 
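As a rough illustration of what the stricter signature means for a handler (the
handler name and its inner do_work call are hypothetical, not part of this patch),
a function wrapped in RequestSpan now returns ApiError directly and converts any
internal failure itself, instead of relying on the last-resort handler to downcast:

    async fn my_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
        // Fallible work is mapped to ApiError at the call site; the last-resort
        // error handler no longer needs to downcast (and no longer panics on a miss).
        let value = do_work(&request).await.map_err(ApiError::InternalServerError)?;
        json_response(StatusCode::OK, value)
    }
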
--- libs/utils/src/http/endpoint.rs | 39 +++++++++++++++++++-------------- libs/utils/src/http/error.rs | 21 +++++++++++++----- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 4bfb5bf994bb..4a78f16cfb63 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -1,5 +1,5 @@ use crate::auth::{Claims, JwtAuth}; -use crate::http::error; +use crate::http::error::{api_error_handler, route_error_handler, ApiError}; use anyhow::{anyhow, Context}; use hyper::header::{HeaderName, AUTHORIZATION}; use hyper::http::HeaderValue; @@ -16,8 +16,6 @@ use std::future::Future; use std::net::TcpListener; use std::str::FromStr; -use super::error::ApiError; - static SERVE_METRICS_COUNT: Lazy = Lazy::new(|| { register_int_counter!( "libmetrics_metric_handler_requests_total", @@ -38,6 +36,8 @@ struct RequestId(String); /// Use this to distinguish between logs of different HTTP requests: every request handler wrapped /// in this type will get request info logged in the wrapping span, including the unique request ID. /// +/// This also handles errors, logging them and converting them to an HTTP error response. +/// /// There could be other ways to implement similar functionality: /// /// * procmacros placed on top of all handler methods @@ -54,21 +54,19 @@ struct RequestId(String); /// tries to achive with its `.instrument` used in the current approach. /// /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced. -pub struct RequestSpan(pub H) +pub struct RequestSpan(pub H) where - E: Into> + 'static, - R: Future, E>> + Send + 'static, + R: Future, ApiError>> + Send + 'static, H: Fn(Request) -> R + Send + Sync + 'static; -impl RequestSpan +impl RequestSpan where - E: Into> + 'static, - R: Future, E>> + Send + 'static, + R: Future, ApiError>> + Send + 'static, H: Fn(Request) -> R + Send + Sync + 'static, { /// Creates a tracing span around inner request handler and executes the request handler in the contex of that span. /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled. - pub async fn handle(self, request: Request) -> Result, E> { + pub async fn handle(self, request: Request) -> Result, ApiError> { let request_id = request.context::().unwrap_or_default().0; let method = request.method(); let path = request.uri().path(); @@ -83,15 +81,22 @@ where info!("Handling request"); } - // Note that we reuse `error::handler` here and not returning and error at all, - // yet cannot use `!` directly in the method signature due to `routerify::RouterBuilder` limitation. - // Usage of the error handler also means that we expect only the `ApiError` errors to be raised in this call. - // - // Panics are not handled separately, there's a `tracing_panic_hook` from another module to do that globally. + // No special handling for panics here. There's a `tracing_panic_hook` from another + // module to do that globally. let res = (self.0)(request).await; cancellation_guard.disarm(); + // Log the result if needed. + // + // We also convert any errors into an Ok response with HTTP error code here. + // `make_router` sets a last-resort error handler that would do the same, but + // we prefer to do it here, before we exit the request span, so that the error + // is still logged with the span. 
+ // + // (Because we convert errors to Ok response, we never actually return an error, + // and we could declare the function to return the never type (`!`). However, + // using `routerify::RouterBuilder` requires a proper error type.) match res { Ok(response) => { let response_status = response.status(); @@ -102,7 +107,7 @@ where } Ok(response) } - Err(e) => Ok(error::handler(e.into()).await), + Err(err) => Ok(api_error_handler(err)), } } .instrument(request_span) @@ -210,7 +215,7 @@ pub fn make_router() -> RouterBuilder { .get("/metrics", |r| { RequestSpan(prometheus_metrics_handler).handle(r) }) - .err_handler(error::handler) + .err_handler(route_error_handler) } pub fn attach_openapi_ui( diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 3c6023eb80dd..4eff16b6a390 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -83,13 +83,24 @@ impl HttpErrorBody { } } -pub async fn handler(err: routerify::RouteError) -> Response { - let api_error = err - .downcast::() - .expect("handler should always return api error"); +pub async fn route_error_handler(err: routerify::RouteError) -> Response { + match err.downcast::() { + Ok(api_error) => api_error_handler(*api_error), + Err(other_error) => { + // We expect all the request handlers to return an ApiError, so this should + // not be reached. But just in case. + error!("Error processing HTTP request: {other_error:?}"); + HttpErrorBody::response_from_msg_and_status( + other_error.to_string(), + StatusCode::INTERNAL_SERVER_ERROR, + ) + } + } +} +pub fn api_error_handler(api_error: ApiError) -> Response { // Print a stack trace for Internal Server errors - if let ApiError::InternalServerError(_) = api_error.as_ref() { + if let ApiError::InternalServerError(_) = api_error { error!("Error processing HTTP request: {api_error:?}"); } else { error!("Error processing HTTP request: {api_error:#}"); From 2cdf07f12c5d1fc637e0acfdd512ab88f801907d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 27 May 2023 01:11:58 +0300 Subject: [PATCH 24/59] Refactor RequestSpan into a function. Previously, you used it like this: |r| RequestSpan(my_handler).handle(r) But I don't see the point of the RequestSpan struct. It's just a wrapper around the handler function. With this commit, the call becomes: |r| request_span(r, my_handler) Which seems a little simpler. At first I thought that the RequestSpan struct would allow "chaining" other kinds of decorators like RequestSpan, so that you could do something like this: |r| CheckPermissions(RequestSpan(my_handler)).handle(r) But it doesn't work like that. If each of those structs wrap a handler *function*, it would actually look like this: |r| CheckPermissions(|r| RequestSpan(my_handler).handle(r))).handle(r) This commit doesn't make that kind of chaining any easier, but seems a little more straightforward anyway. --- libs/utils/src/http/endpoint.rs | 118 +++++++++++++++----------------- pageserver/src/http/routes.rs | 56 +++++++-------- 2 files changed, 82 insertions(+), 92 deletions(-) diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 4a78f16cfb63..db3642b507e7 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -33,8 +33,10 @@ struct RequestId(String); /// Adds a tracing info_span! instrumentation around the handler events, /// logs the request start and end events for non-GET requests and non-200 responses. 
/// +/// Usage: Replace `my_handler` with `|r| request_span(r, my_handler)` +/// /// Use this to distinguish between logs of different HTTP requests: every request handler wrapped -/// in this type will get request info logged in the wrapping span, including the unique request ID. +/// with this will get request info logged in the wrapping span, including the unique request ID. /// /// This also handles errors, logging them and converting them to an HTTP error response. /// @@ -54,65 +56,56 @@ struct RequestId(String); /// tries to achive with its `.instrument` used in the current approach. /// /// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced. -pub struct RequestSpan(pub H) -where - R: Future, ApiError>> + Send + 'static, - H: Fn(Request) -> R + Send + Sync + 'static; - -impl RequestSpan +pub async fn request_span(request: Request, handler: H) -> R::Output where R: Future, ApiError>> + Send + 'static, - H: Fn(Request) -> R + Send + Sync + 'static, + H: FnOnce(Request) -> R + Send + Sync + 'static, { - /// Creates a tracing span around inner request handler and executes the request handler in the contex of that span. - /// Use as `|r| RequestSpan(my_handler).handle(r)` instead of `my_handler` as the request handler to get the span enabled. - pub async fn handle(self, request: Request) -> Result, ApiError> { - let request_id = request.context::().unwrap_or_default().0; - let method = request.method(); - let path = request.uri().path(); - let request_span = info_span!("request", %method, %path, %request_id); - - let log_quietly = method == Method::GET; - async move { - let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding(); - if log_quietly { - debug!("Handling request"); - } else { - info!("Handling request"); - } - - // No special handling for panics here. There's a `tracing_panic_hook` from another - // module to do that globally. - let res = (self.0)(request).await; - - cancellation_guard.disarm(); + let request_id = request.context::().unwrap_or_default().0; + let method = request.method(); + let path = request.uri().path(); + let request_span = info_span!("request", %method, %path, %request_id); + + let log_quietly = method == Method::GET; + async move { + let cancellation_guard = RequestCancelled::warn_when_dropped_without_responding(); + if log_quietly { + debug!("Handling request"); + } else { + info!("Handling request"); + } - // Log the result if needed. - // - // We also convert any errors into an Ok response with HTTP error code here. - // `make_router` sets a last-resort error handler that would do the same, but - // we prefer to do it here, before we exit the request span, so that the error - // is still logged with the span. - // - // (Because we convert errors to Ok response, we never actually return an error, - // and we could declare the function to return the never type (`!`). However, - // using `routerify::RouterBuilder` requires a proper error type.) - match res { - Ok(response) => { - let response_status = response.status(); - if log_quietly && response_status.is_success() { - debug!("Request handled, status: {response_status}"); - } else { - info!("Request handled, status: {response_status}"); - } - Ok(response) + // No special handling for panics here. There's a `tracing_panic_hook` from another + // module to do that globally. + let res = handler(request).await; + + cancellation_guard.disarm(); + + // Log the result if needed. 
+ // + // We also convert any errors into an Ok response with HTTP error code here. + // `make_router` sets a last-resort error handler that would do the same, but + // we prefer to do it here, before we exit the request span, so that the error + // is still logged with the span. + // + // (Because we convert errors to Ok response, we never actually return an error, + // and we could declare the function to return the never type (`!`). However, + // using `routerify::RouterBuilder` requires a proper error type.) + match res { + Ok(response) => { + let response_status = response.status(); + if log_quietly && response_status.is_success() { + debug!("Request handled, status: {response_status}"); + } else { + info!("Request handled, status: {response_status}"); } - Err(err) => Ok(api_error_handler(err)), + Ok(response) } + Err(err) => Ok(api_error_handler(err)), } - .instrument(request_span) - .await } + .instrument(request_span) + .await } /// Drop guard to WARN in case the request was dropped before completion. @@ -212,9 +205,7 @@ pub fn make_router() -> RouterBuilder { .middleware(Middleware::post_with_info( add_request_id_header_to_response, )) - .get("/metrics", |r| { - RequestSpan(prometheus_metrics_handler).handle(r) - }) + .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .err_handler(route_error_handler) } @@ -225,12 +216,14 @@ pub fn attach_openapi_ui( ui_mount_path: &'static str, ) -> RouterBuilder { router_builder - .get(spec_mount_path, move |r| { - RequestSpan(move |_| async move { Ok(Response::builder().body(Body::from(spec)).unwrap()) }) - .handle(r) - }) - .get(ui_mount_path, move |r| RequestSpan( move |_| async move { - Ok(Response::builder().body(Body::from(format!(r#" + .get(spec_mount_path, + move |r| request_span(r, move |_| async move { + Ok(Response::builder().body(Body::from(spec)).unwrap()) + }) + ) + .get(ui_mount_path, + move |r| request_span(r, move |_| async move { + Ok(Response::builder().body(Body::from(format!(r#" @@ -260,7 +253,8 @@ pub fn attach_openapi_ui( "#, spec_mount_path))).unwrap()) - }).handle(r)) + }) + ) } fn parse_token(header_value: &str) -> Result<&str, ApiError> { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 30c219f77377..279f069be7ca 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -11,7 +11,7 @@ use storage_broker::BrokerClientChannel; use tenant_size_model::{SizeResult, StorageModel}; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::http::endpoint::RequestSpan; +use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -1179,7 +1179,7 @@ pub fn make_router( #[cfg(not(feature = "testing"))] let handler = cfg_disabled; - move |r| RequestSpan(handler).handle(r) + move |r| request_span(r, handler) }}; } @@ -1194,54 +1194,50 @@ pub fn make_router( ) .context("Failed to initialize router state")?, )) - .get("/v1/status", |r| RequestSpan(status_handler).handle(r)) + .get("/v1/status", |r| request_span(r, status_handler)) .put( "/v1/failpoints", testing_api!("manage failpoints", failpoints_handler), ) - .get("/v1/tenant", |r| RequestSpan(tenant_list_handler).handle(r)) - .post("/v1/tenant", |r| { - RequestSpan(tenant_create_handler).handle(r) - }) - .get("/v1/tenant/:tenant_id", |r| { - RequestSpan(tenant_status).handle(r) - }) + .get("/v1/tenant", |r| request_span(r, tenant_list_handler)) + .post("/v1/tenant", |r| 
request_span(r, tenant_create_handler)) + .get("/v1/tenant/:tenant_id", |r| request_span(r, tenant_status)) .get("/v1/tenant/:tenant_id/synthetic_size", |r| { - RequestSpan(tenant_size_handler).handle(r) + request_span(r, tenant_size_handler) }) .put("/v1/tenant/config", |r| { - RequestSpan(update_tenant_config_handler).handle(r) + request_span(r, update_tenant_config_handler) }) .get("/v1/tenant/:tenant_id/config", |r| { - RequestSpan(get_tenant_config_handler).handle(r) + request_span(r, get_tenant_config_handler) }) .get("/v1/tenant/:tenant_id/timeline", |r| { - RequestSpan(timeline_list_handler).handle(r) + request_span(r, timeline_list_handler) }) .post("/v1/tenant/:tenant_id/timeline", |r| { - RequestSpan(timeline_create_handler).handle(r) + request_span(r, timeline_create_handler) }) .post("/v1/tenant/:tenant_id/attach", |r| { - RequestSpan(tenant_attach_handler).handle(r) + request_span(r, tenant_attach_handler) }) .post("/v1/tenant/:tenant_id/detach", |r| { - RequestSpan(tenant_detach_handler).handle(r) + request_span(r, tenant_detach_handler) }) .post("/v1/tenant/:tenant_id/load", |r| { - RequestSpan(tenant_load_handler).handle(r) + request_span(r, tenant_load_handler) }) .post("/v1/tenant/:tenant_id/ignore", |r| { - RequestSpan(tenant_ignore_handler).handle(r) + request_span(r, tenant_ignore_handler) }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - RequestSpan(timeline_detail_handler).handle(r) + request_span(r, timeline_detail_handler) }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", - |r| RequestSpan(get_lsn_by_timestamp_handler).handle(r), + |r| request_span(r, get_lsn_by_timestamp_handler), ) .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| { - RequestSpan(timeline_gc_handler).handle(r) + request_span(r, timeline_gc_handler) }) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", @@ -1253,34 +1249,34 @@ pub fn make_router( ) .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - |r| RequestSpan(timeline_download_remote_layers_handler_post).handle(r), + |r| request_span(r, timeline_download_remote_layers_handler_post), ) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - |r| RequestSpan(timeline_download_remote_layers_handler_get).handle(r), + |r| request_span(r, timeline_download_remote_layers_handler_get), ) .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - RequestSpan(timeline_delete_handler).handle(r) + request_span(r, timeline_delete_handler) }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| { - RequestSpan(layer_map_info_handler).handle(r) + request_span(r, layer_map_info_handler) }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - |r| RequestSpan(layer_download_handler).handle(r), + |r| request_span(r, layer_download_handler), ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - |r| RequestSpan(evict_timeline_layer_handler).handle(r), + |r| request_span(r, evict_timeline_layer_handler), ) .put("/v1/disk_usage_eviction/run", |r| { - RequestSpan(disk_usage_eviction_run).handle(r) + request_span(r, disk_usage_eviction_run) }) .put( "/v1/tenant/:tenant_id/break", testing_api!("set tenant state to broken", handle_tenant_break), ) - .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r)) + .get("/v1/panic", |r| request_span(r, always_panic_handler)) .post( "/v1/tracing/event", testing_api!("emit a tracing event", post_tracing_event_handler), From 
2d6a022bb819aca9fe9689ac23184b01b070e12f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sat, 27 May 2023 15:55:43 +0300 Subject: [PATCH 25/59] Don't allow two timeline_delete operations to run concurrently. (#4313) If the timeline is already being deleted, return an error. We used to notice the duplicate request and error out in persist_index_part_with_deleted_flag(), but it's better to detect it earlier. Add an explicit lock for the deletion. Note: This doesn't do anything about the async cancellation problem (github issue #3478): if the original HTTP request dropped, because the client disconnected, the timeline deletion stops half-way through the operation. That needs to be fixed, too, but that's a separate story. (This is a simpler replacement for PR #4194. I'm also working on the cancellation shielding, see PR #4314.) --- pageserver/src/tenant.rs | 106 +++++++++++--------- pageserver/src/tenant/timeline.rs | 5 + test_runner/regress/test_timeline_delete.py | 2 +- 3 files changed, 67 insertions(+), 46 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2827830f02ca..991f5ca1c64e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1436,7 +1436,11 @@ impl Tenant { Ok(()) } - /// Removes timeline-related in-memory data + /// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its + /// data from disk. + /// + /// This doesn't currently delete all data from S3, but sets a flag in its + /// index_part.json file to mark it as deleted. pub async fn delete_timeline( &self, timeline_id: TimelineId, @@ -1446,7 +1450,11 @@ impl Tenant { // Transition the timeline into TimelineState::Stopping. // This should prevent new operations from starting. - let timeline = { + // + // Also grab the Timeline's delete_lock to prevent another deletion from starting. + let timeline; + let mut delete_lock_guard; + { let mut timelines = self.timelines.lock().unwrap(); // Ensure that there are no child timelines **attached to that pageserver**, @@ -1464,20 +1472,36 @@ impl Tenant { Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound), }; - let timeline = Arc::clone(timeline_entry.get()); + timeline = Arc::clone(timeline_entry.get()); + + // Prevent two tasks from trying to delete the timeline at the same time. + // + // XXX: We should perhaps return an HTTP "202 Accepted" to signal that the caller + // needs to poll until the operation has finished. But for now, we return an + // error, because the control plane knows to retry errors. + delete_lock_guard = timeline.delete_lock.try_lock().map_err(|_| { + DeleteTimelineError::Other(anyhow::anyhow!( + "timeline deletion is already in progress" + )) + })?; + + // If another task finished the deletion just before we acquired the lock, + // return success. + if *delete_lock_guard { + return Ok(()); + } + timeline.set_state(TimelineState::Stopping); drop(timelines); - timeline - }; + } // Now that the Timeline is in Stopping state, request all the related tasks to // shut down. // - // NB: If you call delete_timeline multiple times concurrently, they will - // all go through the motions here. Make sure the code here is idempotent, - // and don't error out if some of the shutdown tasks have already been - // completed! + // NB: If this fails half-way through, and is retried, the retry will go through + // all the same steps again. Make sure the code here is idempotent, and don't + // error out if some of the shutdown tasks have already been completed! 
// Stop the walreceiver first. debug!("waiting for wal receiver to shutdown"); @@ -1518,6 +1542,10 @@ impl Tenant { // If we (now, or already) marked it successfully as deleted, we can proceed Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), // Bail out otherwise + // + // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents + // two tasks from performing the deletion at the same time. The first task + // that starts deletion should run it to completion. Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); @@ -1528,14 +1556,12 @@ impl Tenant { { // Grab the layer_removal_cs lock, and actually perform the deletion. // - // This lock prevents multiple concurrent delete_timeline calls from - // stepping on each other's toes, while deleting the files. It also - // prevents GC or compaction from running at the same time. + // This lock prevents prevents GC or compaction from running at the same time. + // The GC task doesn't register itself with the timeline it's operating on, + // so it might still be running even though we called `shutdown_tasks`. // // Note that there are still other race conditions between - // GC, compaction and timeline deletion. GC task doesn't - // register itself properly with the timeline it's - // operating on. See + // GC, compaction and timeline deletion. See // https://github.com/neondatabase/neon/issues/2671 // // No timeout here, GC & Compaction should be responsive to the @@ -1597,37 +1623,27 @@ impl Tenant { }); // Remove the timeline from the map. - let mut timelines = self.timelines.lock().unwrap(); - let children_exist = timelines - .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); - // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. - // We already deleted the layer files, so it's probably best to panic. - // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) - if children_exist { - panic!("Timeline grew children while we removed layer files"); - } - let removed_timeline = timelines.remove(&timeline_id); - if removed_timeline.is_none() { - // This can legitimately happen if there's a concurrent call to this function. - // T1 T2 - // lock - // unlock - // lock - // unlock - // remove files - // lock - // remove from map - // unlock - // return - // remove files - // lock - // remove from map observes empty map - // unlock - // return - debug!("concurrent call to this function won the race"); + { + let mut timelines = self.timelines.lock().unwrap(); + + let children_exist = timelines + .iter() + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); + // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. + // We already deleted the layer files, so it's probably best to panic. + // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) + if children_exist { + panic!("Timeline grew children while we removed layer files"); + } + + timelines.remove(&timeline_id).expect( + "timeline that we were deleting was concurrently removed from 'timelines' map", + ); } - drop(timelines); + + // All done! 
Mark the deletion as completed and release the delete_lock + *delete_lock_guard = true; + drop(delete_lock_guard); Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4bfebd93df66..9dd5352a540c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -236,6 +236,10 @@ pub struct Timeline { state: watch::Sender, + /// Prevent two tasks from deleting the timeline at the same time. If held, the + /// timeline is being deleted. If 'true', the timeline has already been deleted. + pub delete_lock: tokio::sync::Mutex, + eviction_task_timeline_state: tokio::sync::Mutex, } @@ -1414,6 +1418,7 @@ impl Timeline { eviction_task_timeline_state: tokio::sync::Mutex::new( EvictionTaskTimelineState::default(), ), + delete_lock: tokio::sync::Mutex::new(false), }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 7135b621cbed..99bf4002079f 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -371,7 +371,7 @@ def first_call_hit_failpoint(): # make the second call and assert behavior log.info("second call start") - error_msg_re = "another task is already setting the deleted_flag, started at" + error_msg_re = "timeline deletion is already in progress" with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err: ps_http.timeline_delete(env.initial_tenant, child_timeline_id) assert second_call_err.value.status_code == 500 From ccf653c1f47ab357cd7f43ca959cf3a8f6627ee9 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Sun, 28 May 2023 10:22:45 -0700 Subject: [PATCH 26/59] re-enable file cache integration for VM compute node (#4338) #4155 inadvertently switched to a version of the VM builder that leaves the file cache integration disabled by default. This re-enables the vm-informant's file cache integration. (as a refresher: The vm-informant is the autoscaling component that sits inside the VM and manages postgres / compute_ctl) See also: https://github.com/neondatabase/autoscaling/pull/265 --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 336dea04eb61..e00b98250c33 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -797,7 +797,7 @@ jobs: - name: Build vm image run: | - ./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + ./vm-builder -enable-file-cache -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} - name: Pushing vm-compute-node image run: | From f4f300732a433eb4873fc2210421bf5e4446a893 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 29 May 2023 16:52:41 +0200 Subject: [PATCH 27/59] refactor TenantState transitions (#4321) This is preliminary work for/from #4220 (async `Layer::get_value_reconstruct_data`). 
The motivation is to avoid locking `Tenant::timelines` in places that can't be `async`, because in #4333 we want to convert Tenant::timelines from `std::sync::Mutex` to `tokio::sync::Mutex`. But, the changes here are useful in general because they clean up & document tenant state transitions. That also paves the way for #4350, which is an alternative to #4333 that refactors the pageserver code so that we can keep the `Tenant::timelines` mutex sync. This patch consists of the following core insights and changes: * spawn_load and spawn_attach own the tenant state until they're done * once load()/attach() calls are done ... * if they failed, transition them to Broken directly (we know that there's no background activity because we didn't call activate yet) * if they succeed, call activate. We can make it infallible. How? Later. * set_broken() and set_stopping() are changed to wait for spawn_load() / spawn_attach() to finish. * This sounds scary because it might hinder detach or shutdown, but actually, concurrent attach+detach, or attach+shutdown, or load+shutdown, or attach+shutdown were just racy before this PR. So, with this change, they're not anymore. In the future, we can add a `CancellationToken` stored in Tenant to cancel `load` and `attach` faster, i.e., make `spawn_load` / `spawn_attach` transition them to Broken state sooner. See the doc comments on TenantState for the state transitions that are now possible. It might seem scary, but actually, this patch reduces the possible state transitions. We introduce a new state `TenantState::Activating` to avoid grabbing the `Tenant::timelines` lock inside the `send_modify` closure. These were the humble beginnings of this PR (see Motivation section), and I think it's still the right thing to have this `Activating` state, even if we decide against async `Tenant::timelines` mutex. The reason is that `send_modify` locks internally, and by moving locking of Tenant::timelines out of the closure, the internal locking of `send_modify` becomes a leaf of the lock graph, and so, we eliminate deadlock risk. Co-authored-by: Joonas Koivunen --- libs/pageserver_api/src/models.rs | 125 ++++++++- pageserver/src/http/routes.rs | 2 +- pageserver/src/lib.rs | 1 + pageserver/src/tenant.rs | 292 ++++++++++++-------- pageserver/src/tenant/mgr.rs | 116 ++++++-- test_runner/regress/test_broken_timeline.py | 2 +- test_runner/regress/test_remote_storage.py | 2 +- test_runner/regress/test_tenant_detach.py | 4 +- test_runner/regress/test_tenants.py | 17 +- 9 files changed, 413 insertions(+), 148 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 540633d11363..0b4457a9a5b9 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -18,7 +18,29 @@ use crate::reltag::RelTag; use anyhow::bail; use bytes::{BufMut, Bytes, BytesMut}; -/// A state of a tenant in pageserver's memory. +/// The state of a tenant in this pageserver. 
+/// +/// ```mermaid +/// stateDiagram-v2 +/// +/// [*] --> Loading: spawn_load() +/// [*] --> Attaching: spawn_attach() +/// +/// Loading --> Activating: activate() +/// Attaching --> Activating: activate() +/// Activating --> Active: infallible +/// +/// Loading --> Broken: load() failure +/// Attaching --> Broken: attach() failure +/// +/// Active --> Stopping: set_stopping(), part of shutdown & detach +/// Stopping --> Broken: late error in remove_tenant_from_memory +/// +/// Broken --> [*]: ignore / detach / shutdown +/// Stopping --> [*]: remove_from_memory complete +/// +/// Active --> Broken: cfg(testing)-only tenant break point +/// ``` #[derive( Clone, PartialEq, @@ -26,40 +48,63 @@ use bytes::{BufMut, Bytes, BytesMut}; serde::Serialize, serde::Deserialize, strum_macros::Display, - strum_macros::EnumString, strum_macros::EnumVariantNames, strum_macros::AsRefStr, strum_macros::IntoStaticStr, )] #[serde(tag = "slug", content = "data")] pub enum TenantState { - /// This tenant is being loaded from local disk + /// This tenant is being loaded from local disk. + /// + /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. Loading, - /// This tenant is being downloaded from cloud storage. + /// This tenant is being attached to the pageserver. + /// + /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. Attaching, - /// Tenant is fully operational + /// The tenant is transitioning from Loading/Attaching to Active. + /// + /// While in this state, the individual timelines are being activated. + /// + /// `set_stopping()` and `set_broken()` do not work in this state and wait for it to pass. + Activating(ActivatingFrom), + /// The tenant has finished activating and is open for business. + /// + /// Transitions out of this state are possible through `set_stopping()` and `set_broken()`. Active, - /// A tenant is recognized by pageserver, but it is being detached or the + /// The tenant is recognized by pageserver, but it is being detached or the /// system is being shut down. + /// + /// Transitions out of this state are possible through `set_broken()`. Stopping, - /// A tenant is recognized by the pageserver, but can no longer be used for - /// any operations, because it failed to be activated. + /// The tenant is recognized by the pageserver, but can no longer be used for + /// any operations. + /// + /// If the tenant fails to load or attach, it will transition to this state + /// and it is guaranteed that no background tasks are running in its name. + /// + /// The other way to transition into this state is from `Stopping` state + /// through `set_broken()` called from `remove_tenant_from_memory()`. That happens + /// if the cleanup future executed by `remove_tenant_from_memory()` fails. Broken { reason: String, backtrace: String }, } impl TenantState { pub fn attachment_status(&self) -> TenantAttachmentStatus { use TenantAttachmentStatus::*; + + // Below TenantState::Activating is used as "transient" or "transparent" state for + // attachment_status determining. match self { // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map. // So, technically, we can return Attached here. // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check. // But, our attach task might still be fetching the remote timelines, etc. // So, return `Maybe` while Attaching, making Console wait for the attach task to finish. 
- Self::Attaching => Maybe, + Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe, // tenant mgr startup distinguishes attaching from loading via marker file. // If it's loading, there is no attach marker file, i.e., attach had finished in the past. - Self::Loading => Attached, + Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached, // We only reach Active after successful load / attach. // So, call atttachment status Attached. Self::Active => Attached, @@ -98,6 +143,15 @@ impl std::fmt::Debug for TenantState { } } +/// The only [`TenantState`] variants we could be `TenantState::Activating` from. +#[derive(Clone, Copy, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub enum ActivatingFrom { + /// Arrived to [`TenantState::Activating`] from [`TenantState::Loading`] + Loading, + /// Arrived to [`TenantState::Activating`] from [`TenantState::Attaching`] + Attaching, +} + /// A state of a timeline in pageserver's memory. #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TimelineState { @@ -829,4 +883,55 @@ mod tests { err ); } + + #[test] + fn tenantstatus_activating_serde() { + let states = [ + TenantState::Activating(ActivatingFrom::Loading), + TenantState::Activating(ActivatingFrom::Attaching), + ]; + let expected = "[{\"slug\":\"Activating\",\"data\":\"Loading\"},{\"slug\":\"Activating\",\"data\":\"Attaching\"}]"; + + let actual = serde_json::to_string(&states).unwrap(); + + assert_eq!(actual, expected); + + let parsed = serde_json::from_str::>(&actual).unwrap(); + + assert_eq!(states.as_slice(), &parsed); + } + + #[test] + fn tenantstatus_activating_strum() { + // tests added, because we use these for metrics + let examples = [ + (line!(), TenantState::Loading, "Loading"), + (line!(), TenantState::Attaching, "Attaching"), + ( + line!(), + TenantState::Activating(ActivatingFrom::Loading), + "Activating", + ), + ( + line!(), + TenantState::Activating(ActivatingFrom::Attaching), + "Activating", + ), + (line!(), TenantState::Active, "Active"), + (line!(), TenantState::Stopping, "Stopping"), + ( + line!(), + TenantState::Broken { + reason: "Example".into(), + backtrace: "Looooong backtrace".into(), + }, + "Broken", + ), + ]; + + for (line, rendered, expected) in examples { + let actual: &'static str = rendered.into(); + assert_eq!(actual, expected, "example on {line}"); + } + } } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 279f069be7ca..61028e23fea0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -859,7 +859,7 @@ async fn handle_tenant_break(r: Request) -> Result, ApiErro .await .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; - tenant.set_broken("broken from test".to_owned()); + tenant.set_broken("broken from test".to_owned()).await; json_response(StatusCode::OK, ()) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 36578ee4e025..776cf0dac1b4 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -45,6 +45,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; +#[tracing::instrument] pub async fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint task. This prevents new connections from // being accepted. 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 991f5ca1c64e..4c8101af8d3d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -447,6 +447,11 @@ pub enum DeleteTimelineError { Other(#[from] anyhow::Error), } +pub enum SetStoppingError { + AlreadyStopping, + Broken, +} + struct RemoteStartupData { index_part: IndexPart, remote_metadata: TimelineMetadata, @@ -645,16 +650,17 @@ impl Tenant { "attach tenant", false, async move { - let doit = async { - tenant_clone.attach(&ctx).await?; - tenant_clone.activate(broker_client, &ctx)?; - anyhow::Ok(()) - }; - match doit.await { - Ok(_) => {} + match tenant_clone.attach(&ctx).await { + Ok(()) => { + info!("attach finished, activating"); + tenant_clone.activate(broker_client, &ctx); + } Err(e) => { - tenant_clone.set_broken(e.to_string()); - error!("error attaching tenant: {:?}", e); + error!("attach failed, setting tenant state to Broken: {:?}", e); + tenant_clone.state.send_modify(|state| { + assert_eq!(*state, TenantState::Attaching, "the attach task owns the tenant state until activation is complete"); + *state = TenantState::broken_from_reason(e.to_string()); + }); } } Ok(()) @@ -671,6 +677,8 @@ impl Tenant { /// /// Background task that downloads all data for a tenant and brings it to Active state. /// + /// No background tasks are started as part of this routine. + /// async fn attach(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_id(); @@ -920,20 +928,20 @@ impl Tenant { "initial tenant load", false, async move { - let doit = async { - tenant_clone.load(&ctx).await?; - tenant_clone.activate(broker_client, &ctx)?; - anyhow::Ok(()) - }; - match doit.await { - Ok(()) => {} + match tenant_clone.load(&ctx).await { + Ok(()) => { + info!("load finished, activating"); + tenant_clone.activate(broker_client, &ctx); + } Err(err) => { - tenant_clone.set_broken(err.to_string()); - error!("could not load tenant {tenant_id}: {err:?}"); + error!("load failed, setting tenant state to Broken: {err:?}"); + tenant_clone.state.send_modify(|state| { + assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete"); + *state = TenantState::broken_from_reason(err.to_string()); + }); } } - info!("initial load for tenant {tenant_id} finished!"); - Ok(()) + Ok(()) } .instrument({ let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id); @@ -951,6 +959,7 @@ impl Tenant { /// Background task to load in-memory data structures for this tenant, from /// files on disk. Used at pageserver startup. /// + /// No background tasks are started as part of this routine. async fn load(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_id(); @@ -1657,130 +1666,191 @@ impl Tenant { } /// Changes tenant status to active, unless shutdown was already requested. - fn activate( - self: &Arc, - broker_client: BrokerClientChannel, - ctx: &RequestContext, - ) -> anyhow::Result<()> { + fn activate(self: &Arc, broker_client: BrokerClientChannel, ctx: &RequestContext) { debug_assert_current_span_has_tenant_id(); - let mut result = Ok(()); + let mut activating = false; self.state.send_modify(|current_state| { + use pageserver_api::models::ActivatingFrom; match &*current_state { - TenantState::Active => { - // activate() was called on an already Active tenant. Shouldn't happen. 
- result = Err(anyhow::anyhow!("Tenant is already active")); + TenantState::Activating(_) | TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping => { + panic!("caller is responsible for calling activate() only on Loading / Attaching tenants, got {state:?}", state = current_state); } - TenantState::Broken { reason, .. } => { - // This shouldn't happen either - result = Err(anyhow::anyhow!( - "Could not activate tenant because it is in broken state due to: {reason}", - )); + TenantState::Loading => { + *current_state = TenantState::Activating(ActivatingFrom::Loading); } - TenantState::Stopping => { - // The tenant was detached, or system shutdown was requested, while we were - // loading or attaching the tenant. - info!("Tenant is already in Stopping state, skipping activation"); + TenantState::Attaching => { + *current_state = TenantState::Activating(ActivatingFrom::Attaching); } - TenantState::Loading | TenantState::Attaching => { - *current_state = TenantState::Active; + } + debug!(tenant_id = %self.tenant_id, "Activating tenant"); + activating = true; + // Continue outside the closure. We need to grab timelines.lock() + // and we plan to turn it into a tokio::sync::Mutex in a future patch. + }); - debug!(tenant_id = %self.tenant_id, "Activating tenant"); + if activating { + let timelines_accessor = self.timelines.lock().unwrap(); + let not_broken_timelines = timelines_accessor + .values() + .filter(|timeline| timeline.current_state() != TimelineState::Broken); - let timelines_accessor = self.timelines.lock().unwrap(); - let not_broken_timelines = timelines_accessor - .values() - .filter(|timeline| timeline.current_state() != TimelineState::Broken); + // Spawn gc and compaction loops. The loops will shut themselves + // down when they notice that the tenant is inactive. + tasks::start_background_loops(self); - // Spawn gc and compaction loops. The loops will shut themselves - // down when they notice that the tenant is inactive. - tasks::start_background_loops(self); + let mut activated_timelines = 0; - let mut activated_timelines = 0; + for timeline in not_broken_timelines { + timeline.activate(broker_client.clone(), ctx); + activated_timelines += 1; + } - for timeline in not_broken_timelines { - timeline.activate(broker_client.clone(), ctx); - activated_timelines += 1; - } + self.state.send_modify(move |current_state| { + assert!( + matches!(current_state, TenantState::Activating(_)), + "set_stopping and set_broken wait for us to leave Activating state", + ); + *current_state = TenantState::Active; - let elapsed = self.loading_started_at.elapsed(); - let total_timelines = timelines_accessor.len(); + let elapsed = self.loading_started_at.elapsed(); + let total_timelines = timelines_accessor.len(); - // log a lot of stuff, because some tenants sometimes suffer from user-visible - // times to activate. see https://github.com/neondatabase/neon/issues/4025 - info!( - since_creation_millis = elapsed.as_millis(), - tenant_id = %self.tenant_id, - activated_timelines, - total_timelines, - post_state = <&'static str>::from(&*current_state), - "activation attempt finished" - ); - } - } - }); - result + // log a lot of stuff, because some tenants sometimes suffer from user-visible + // times to activate. 
see https://github.com/neondatabase/neon/issues/4025 + info!( + since_creation_millis = elapsed.as_millis(), + tenant_id = %self.tenant_id, + activated_timelines, + total_timelines, + post_state = <&'static str>::from(&*current_state), + "activation attempt finished" + ); + }); + } } - /// Change tenant status to Stopping, to mark that it is being shut down - pub fn set_stopping(&self) { - self.state.send_modify(|current_state| { - match current_state { - TenantState::Active | TenantState::Loading | TenantState::Attaching => { - *current_state = TenantState::Stopping; - - // FIXME: If the tenant is still Loading or Attaching, new timelines - // might be created after this. That's harmless, as the Timelines - // won't be accessible to anyone, when the Tenant is in Stopping - // state. - let timelines_accessor = self.timelines.lock().unwrap(); - let not_broken_timelines = timelines_accessor - .values() - .filter(|timeline| timeline.current_state() != TimelineState::Broken); - for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Stopping); - } - } - TenantState::Broken { reason, .. } => { - info!("Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"); - } - TenantState::Stopping => { - // The tenant was detached, or system shutdown was requested, while we were - // loading or attaching the tenant. - info!("Tenant is already in Stopping state"); - } + /// Change tenant status to Stopping, to mark that it is being shut down. + /// + /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state. + /// + /// This function is not cancel-safe! + pub async fn set_stopping(&self) -> Result<(), SetStoppingError> { + let mut rx = self.state.subscribe(); + + // cannot stop before we're done activating, so wait out until we're done activating + rx.wait_for(|state| match state { + TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + info!( + "waiting for {} to turn Active|Broken|Stopping", + <&'static str>::from(state) + ); + false + } + TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true, + }) + .await + .expect("cannot drop self.state while on a &self method"); + + // we now know we're done activating, let's see whether this task is the winner to transition into Stopping + let mut err = None; + let stopping = self.state.send_if_modified(|current_state| match current_state { + TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + unreachable!("we ensured above that we're done with activation, and, there is no re-activation") + } + TenantState::Active => { + // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines + // are created after the transition to Stopping. That's harmless, as the Timelines + // won't be accessible to anyone afterwards, because the Tenant is in Stopping state. + *current_state = TenantState::Stopping; + // Continue stopping outside the closure. We need to grab timelines.lock() + // and we plan to turn it into a tokio::sync::Mutex in a future patch. + true + } + TenantState::Broken { reason, .. 
} => { + info!( + "Cannot set tenant to Stopping state, it is in Broken state due to: {reason}" + ); + err = Some(SetStoppingError::Broken); + false + } + TenantState::Stopping => { + info!("Tenant is already in Stopping state"); + err = Some(SetStoppingError::AlreadyStopping); + false } }); + match (stopping, err) { + (true, None) => {} // continue + (false, Some(err)) => return Err(err), + (true, Some(_)) => unreachable!( + "send_if_modified closure must error out if not transitioning to Stopping" + ), + (false, None) => unreachable!( + "send_if_modified closure must return true if transitioning to Stopping" + ), + } + + let timelines_accessor = self.timelines.lock().unwrap(); + let not_broken_timelines = timelines_accessor + .values() + .filter(|timeline| timeline.current_state() != TimelineState::Broken); + for timeline in not_broken_timelines { + timeline.set_state(TimelineState::Stopping); + } + Ok(()) } - pub fn set_broken(&self, reason: String) { + /// Method for tenant::mgr to transition us into Broken state in case of a late failure in + /// `remove_tenant_from_memory` + /// + /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state. + /// + /// In tests, we also use this to set tenants to Broken state on purpose. + pub(crate) async fn set_broken(&self, reason: String) { + let mut rx = self.state.subscribe(); + + // The load & attach routines own the tenant state until it has reached `Active`. + // So, wait until it's done. + rx.wait_for(|state| match state { + TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + info!( + "waiting for {} to turn Active|Broken|Stopping", + <&'static str>::from(state) + ); + false + } + TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping {} => true, + }) + .await + .expect("cannot drop self.state while on a &self method"); + + // we now know we're done activating, let's see whether this task is the winner to transition into Broken self.state.send_modify(|current_state| { match *current_state { + TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => { + unreachable!("we ensured above that we're done with activation, and, there is no re-activation") + } TenantState::Active => { - // Broken tenants can currently only used for fatal errors that happen - // while loading or attaching a tenant. A tenant that has already been - // activated should never be marked as broken. We cope with it the best - // we can, but it shouldn't happen. - warn!("Changing Active tenant to Broken state, reason: {}", reason); - *current_state = TenantState::broken_from_reason(reason); + if cfg!(feature = "testing") { + warn!("Changing Active tenant to Broken state, reason: {}", reason); + *current_state = TenantState::broken_from_reason(reason); + } else { + unreachable!("not allowed to call set_broken on Active tenants in non-testing builds") + } } TenantState::Broken { .. } => { - // This shouldn't happen either warn!("Tenant is already in Broken state"); } + // This is the only "expected" path, any other path is a bug. 
TenantState::Stopping => { - // This shouldn't happen either warn!( "Marking Stopping tenant as Broken state, reason: {}", reason ); *current_state = TenantState::broken_from_reason(reason); } - TenantState::Loading | TenantState::Attaching => { - info!("Setting tenant as Broken state, reason: {}", reason); - *current_state = TenantState::broken_from_reason(reason); - } - } + } }); } @@ -1793,7 +1863,7 @@ impl Tenant { loop { let current_state = receiver.borrow_and_update().clone(); match current_state { - TenantState::Loading | TenantState::Attaching => { + TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => { // in these states, there's a chance that we can reach ::Active receiver.changed().await.map_err( |_e: tokio::sync::watch::error::RecvError| { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index dbb9577bf0f3..c0bd81ebfc1e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -10,6 +10,7 @@ use tokio::fs; use anyhow::Context; use once_cell::sync::Lazy; use tokio::sync::RwLock; +use tokio::task::JoinSet; use tracing::*; use remote_storage::GenericRemoteStorage; @@ -19,7 +20,9 @@ use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; -use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState}; +use crate::tenant::{ + create_tenant_files, CreateTenantFilesMode, SetStoppingError, Tenant, TenantState, +}; use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; @@ -222,6 +225,7 @@ pub fn schedule_local_tenant_processing( /// That could be easily misinterpreted by control plane, the consumer of the /// management API. For example, it could attach the tenant on a different pageserver. /// We would then be in split-brain once this pageserver restarts. +#[instrument] pub async fn shutdown_all_tenants() { // Prevent new tenants from being created. let tenants_to_shut_down = { @@ -244,15 +248,65 @@ pub async fn shutdown_all_tenants() { } }; + // Set tenant (and its timlines) to Stoppping state. + // + // Since we can only transition into Stopping state after activation is complete, + // run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed. + // + // Transitioning tenants to Stopping state has a couple of non-obvious side effects: + // 1. Lock out any new requests to the tenants. + // 2. Signal cancellation to WAL receivers (we wait on it below). + // 3. Signal cancellation for other tenant background loops. + // 4. ??? + // + // The waiting for the cancellation is not done uniformly. + // We certainly wait for WAL receivers to shut down. + // That is necessary so that no new data comes in before the freeze_and_flush. + // But the tenant background loops are joined-on in our caller. + // It's mesed up. 
+ let mut join_set = JoinSet::new(); let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len()); - for (_, tenant) in tenants_to_shut_down { - if tenant.is_active() { - // updates tenant state, forbidding new GC and compaction iterations from starting - tenant.set_stopping(); - tenants_to_freeze_and_flush.push(tenant); + for (tenant_id, tenant) in tenants_to_shut_down { + join_set.spawn( + async move { + match tenant.set_stopping().await { + Ok(()) => debug!("tenant successfully stopped"), + Err(SetStoppingError::Broken) => { + info!("tenant is broken, so stopping failed, freeze_and_flush is likely going to make noise as well"); + }, + Err(SetStoppingError::AlreadyStopping) => { + // our task_mgr::shutdown_tasks are going to coalesce on that just fine + } + } + + tenant + } + .instrument(info_span!("set_stopping", %tenant_id)), + ); + } + + let mut panicked = 0; + + while let Some(res) = join_set.join_next().await { + match res { + Err(join_error) if join_error.is_cancelled() => { + unreachable!("we are not cancelling any of the futures"); + } + Err(join_error) if join_error.is_panic() => { + // cannot really do anything, as this panic is likely a bug + panicked += 1; + } + Err(join_error) => { + warn!("unknown kind of JoinError: {join_error}"); + } + Ok(tenant) => tenants_to_freeze_and_flush.push(tenant), } } + if panicked > 0 { + warn!(panicked, "observed panicks while stopping tenants"); + } + // Shut down all existing walreceiver connections and stop accepting the new ones. task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; @@ -264,12 +318,30 @@ pub async fn shutdown_all_tenants() { // should be no more activity in any of the repositories. // // On error, log it but continue with the shutdown for other tenants. + + let mut join_set = tokio::task::JoinSet::new(); + for tenant in tenants_to_freeze_and_flush { let tenant_id = tenant.tenant_id(); - debug!("shutdown tenant {tenant_id}"); - if let Err(err) = tenant.freeze_and_flush().await { - error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}"); + join_set.spawn( + async move { + if let Err(err) = tenant.freeze_and_flush().await { + warn!("Could not checkpoint tenant during shutdown: {err:?}"); + } + } + .instrument(info_span!("freeze_and_flush", %tenant_id)), + ); + } + + while let Some(next) = join_set.join_next().await { + match next { + Ok(()) => {} + Err(join_error) if join_error.is_cancelled() => { + unreachable!("no cancelling") + } + Err(join_error) if join_error.is_panic() => { /* reported already */ } + Err(join_error) => warn!("unknown kind of JoinError: {join_error}"), } } } @@ -589,13 +661,23 @@ where { let tenants_accessor = TENANTS.write().await; match tenants_accessor.get(&tenant_id) { - Some(tenant) => match tenant.current_state() { - TenantState::Attaching - | TenantState::Loading - | TenantState::Broken { .. 
} - | TenantState::Active => tenant.set_stopping(), - TenantState::Stopping => return Err(TenantStateError::IsStopping(tenant_id)), - }, + Some(tenant) => { + let tenant = Arc::clone(tenant); + // don't hold TENANTS lock while set_stopping waits for activation to finish + drop(tenants_accessor); + match tenant.set_stopping().await { + Ok(()) => { + // we won, continue stopping procedure + } + Err(SetStoppingError::Broken) => { + // continue the procedure, let's hope the closure can deal with broken tenants + } + Err(SetStoppingError::AlreadyStopping) => { + // the tenant is already stopping or broken, don't do anything + return Err(TenantStateError::IsStopping(tenant_id)); + } + } + } None => return Err(TenantStateError::NotFound(tenant_id)), } } @@ -620,7 +702,7 @@ where let tenants_accessor = TENANTS.read().await; match tenants_accessor.get(&tenant_id) { Some(tenant) => { - tenant.set_broken(e.to_string()); + tenant.set_broken(e.to_string()).await; } None => { warn!("Tenant {tenant_id} got removed from memory"); diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index fb592bfbc355..0fb3b4f26209 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -20,7 +20,7 @@ def test_broken_timeline(neon_env_builder: NeonEnvBuilder): ".*is not active. Current state: Broken.*", ".*will not become active. Current state: Broken.*", ".*failed to load metadata.*", - ".*could not load tenant.*load local timeline.*", + ".*load failed.*load local timeline.*", ] ) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 02f1aac99ca6..aefc8befeb4d 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -140,7 +140,7 @@ def test_remote_storage_backup_and_restore( # This is before the failures injected by test_remote_failures, so it's a permanent error. pageserver_http.configure_failpoints(("storage-sync-list-remote-timelines", "return")) env.pageserver.allowed_errors.append( - ".*error attaching tenant: storage-sync-list-remote-timelines", + ".*attach failed.*: storage-sync-list-remote-timelines", ) # Attach it. This HTTP request will succeed and launch a # background task to load the tenant. 
In that background task, diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 82664cff94f7..f5e0e34bc9a4 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -647,7 +647,9 @@ def test_ignored_tenant_stays_broken_without_metadata( metadata_removed = True assert metadata_removed, f"Failed to find metadata file in {tenant_timeline_dir}" - env.pageserver.allowed_errors.append(".*could not load tenant .*?: failed to load metadata.*") + env.pageserver.allowed_errors.append( + f".*{tenant_id}.*: load failed.*: failed to load metadata.*" + ) # now, load it from the local files and expect it to be broken due to inability to load tenant files into memory pageserver_http.tenant_load(tenant_id=tenant_id) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 6599fa7ba59c..59b7b574cdf2 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -22,6 +22,7 @@ available_remote_storages, ) from fixtures.types import Lsn, TenantId, TimelineId +from fixtures.utils import wait_until from prometheus_client.samples import Sample @@ -308,9 +309,7 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*marking .* as locally complete, while it doesnt exist in remote index.*" ) - env.pageserver.allowed_errors.append( - ".*could not load tenant.*Failed to list timelines directory.*" - ) + env.pageserver.allowed_errors.append(".*load failed.*Failed to list timelines directory.*") client = env.pageserver.http_client() @@ -341,9 +340,15 @@ def test_pageserver_with_empty_tenants( env.pageserver.start() client = env.pageserver.http_client() - tenants = client.tenant_list() - assert len(tenants) == 2 + def not_loading(): + tenants = client.tenant_list() + assert len(tenants) == 2 + assert all(t["state"]["slug"] != "Loading" for t in tenants) + + wait_until(10, 0.2, not_loading) + + tenants = client.tenant_list() [broken_tenant] = [t for t in tenants if t["id"] == str(tenant_without_timelines_dir)] assert ( @@ -355,7 +360,7 @@ def test_pageserver_with_empty_tenants( broken_tenant_status["state"]["slug"] == "Broken" ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" - assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*") + assert env.pageserver.log_contains(".*load failed, setting tenant state to Broken:.*") [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines)] assert ( From cb834957446b39bf67e0fbacaeef669f572c9ca4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 29 May 2023 21:48:38 +0300 Subject: [PATCH 28/59] try: startup speedup (#4366) Startup can take a long time. We suspect it's the initial logical size calculations. Long term solution is to not block the tokio executors but do most of I/O in spawn_blocking. See: #4025, #4183 Short-term solution to above: - Delay global background tasks until initial tenant loads complete - Just limit how many init logical size calculations can we have at the same time to `cores / 2` This PR is for trying in staging. 
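The "delay global background tasks until initial tenant loads complete" part is implemented in the diff below with a plain `tokio::sync::mpsc` channel used only as a completion signal: every tenant-load task holds a clone of the sender and never sends on it, and the waiters simply await `recv()`, which resolves once the last sender clone is dropped. A minimal, self-contained sketch of that idea (the loop of three fake "loads" and the variable names are illustrative, not the pageserver's actual code):

```rust
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    // Capacity 1 is enough: nothing is ever sent, we rely purely on
    // sender-drop semantics to signal completion.
    let (init_done_tx, mut init_done_rx) = mpsc::channel::<()>(1);

    for i in 0..3 {
        // Each load task keeps a clone of the sender alive while it runs.
        let guard = init_done_tx.clone();
        tokio::spawn(async move {
            let _guard = guard; // dropped when this task finishes
            // ... load tenant `i` here ...
            println!("tenant {i} loaded");
        });
    }
    // Drop the original so only the load tasks keep the channel open.
    drop(init_done_tx);

    // recv() returns None once every sender clone has been dropped,
    // i.e. once all initial loads have finished one way or another.
    init_done_rx.recv().await;
    println!("all initial loads done, starting global background tasks");
}
```

The same trick gates the disk-usage eviction task and consumption metrics collection in the diff below; limiting the number of concurrent initial logical size calculations is handled separately with a semaphore.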
--- pageserver/src/bin/pageserver.rs | 33 ++++++++++++++++++++++ pageserver/src/disk_usage_eviction_task.rs | 5 ++++ pageserver/src/tenant.rs | 4 +++ pageserver/src/tenant/mgr.rs | 18 +++++++++--- pageserver/src/tenant/timeline.rs | 17 +++++++++++ 5 files changed, 73 insertions(+), 4 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index d9d3d9d66244..cbc97e722816 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -335,13 +335,36 @@ fn start_pageserver( // Set up remote storage client let remote_storage = create_remote_storage_client(conf)?; + // All tenant load operations carry this while they are ongoing; it will be dropped once those + // operations finish either successfully or in some other manner. However, the initial load + // will be then done, and we can start the global background tasks. + let (init_done_tx, init_done_rx) = tokio::sync::mpsc::channel::<()>(1); + let init_done_rx = Arc::new(tokio::sync::Mutex::new(init_done_rx)); + // Scan the local 'tenants/' directory and start loading the tenants + let init_started_at = std::time::Instant::now(); BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, broker_client.clone(), remote_storage.clone(), + init_done_tx, ))?; + BACKGROUND_RUNTIME.spawn({ + let init_done_rx = init_done_rx.clone(); + async move { + let init_done = async move { init_done_rx.lock().await.recv().await }; + init_done.await; + + let elapsed = init_started_at.elapsed(); + + tracing::info!( + elapsed_millis = elapsed.as_millis(), + "Initial load completed." + ); + } + }); + // shared state between the disk-usage backed eviction background task and the http endpoint // that allows triggering disk-usage based eviction manually. note that the http endpoint // is still accessible even if background task is not configured as long as remote storage has @@ -353,6 +376,7 @@ fn start_pageserver( conf, remote_storage.clone(), disk_usage_eviction_state.clone(), + init_done_rx.clone(), )?; } @@ -390,6 +414,7 @@ fn start_pageserver( ); if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + let init_done_rx = init_done_rx; let metrics_ctx = RequestContext::todo_child( TaskKind::MetricsCollection, // This task itself shouldn't download anything. @@ -405,6 +430,14 @@ fn start_pageserver( "consumption metrics collection", true, async move { + // first wait for initial load to complete before first iteration. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. 
+ let init_done = async move { init_done_rx.lock().await.recv().await }; + init_done.await; + pageserver::consumption_metrics::collect_metrics( metric_collection_endpoint, conf.metric_collection_interval, diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index f4a0f3f18eb0..03589691994a 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -82,6 +82,7 @@ pub fn launch_disk_usage_global_eviction_task( conf: &'static PageServerConf, storage: GenericRemoteStorage, state: Arc, + init_done_rx: Arc>>, ) -> anyhow::Result<()> { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); @@ -98,6 +99,10 @@ pub fn launch_disk_usage_global_eviction_task( "disk usage based eviction", false, async move { + // wait until initial load is complete, because we cannot evict from loading tenants. + let init_done = async move { init_done_rx.lock().await.recv().await }; + init_done.await; + disk_usage_eviction_task( &state, task_config, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 4c8101af8d3d..d6eb82410755 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -895,6 +895,7 @@ impl Tenant { tenant_id: TenantId, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, + init_done_tx: Option>, ctx: &RequestContext, ) -> Arc { let tenant_conf = match Self::load_tenant_config(conf, tenant_id) { @@ -928,6 +929,9 @@ impl Tenant { "initial tenant load", false, async move { + // keep the sender alive as long as we have the initial load ongoing; it will be + // None for loads spawned after init_tenant_mgr. + let _init_done_tx = init_done_tx; match tenant_clone.load(&ctx).await { Ok(()) => { info!("load finished, activating"); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index c0bd81ebfc1e..d74a025bbb11 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -66,6 +66,7 @@ pub async fn init_tenant_mgr( conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, + init_done_tx: tokio::sync::mpsc::Sender<()>, ) -> anyhow::Result<()> { // Scan local filesystem for attached tenants let tenants_dir = conf.tenants_path(); @@ -122,6 +123,7 @@ pub async fn init_tenant_mgr( &tenant_dir_path, broker_client.clone(), remote_storage.clone(), + Some(init_done_tx.clone()), &ctx, ) { Ok(tenant) => { @@ -157,6 +159,7 @@ pub fn schedule_local_tenant_processing( tenant_path: &Path, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, + init_done_tx: Option>, ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( @@ -210,7 +213,14 @@ pub fn schedule_local_tenant_processing( } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); // Start loading the tenant into memory. It will initially be in Loading state. 
- Tenant::spawn_load(conf, tenant_id, broker_client, remote_storage, ctx) + Tenant::spawn_load( + conf, + tenant_id, + broker_client, + remote_storage, + init_done_tx, + ctx, + ) }; Ok(tenant) } @@ -363,7 +373,7 @@ pub async fn create_tenant( // See https://github.com/neondatabase/neon/issues/4233 let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, ctx)?; + schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 @@ -509,7 +519,7 @@ pub async fn load_tenant( .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; } - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, ctx) + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -582,7 +592,7 @@ pub async fn attach_tenant( .context("check for attach marker file existence")?; anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file"); - let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), ctx)?; + let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. // See https://github.com/neondatabase/neon/issues/4233 diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 9dd5352a540c..0569bd45e0ce 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1910,6 +1910,23 @@ impl Timeline { // no cancellation here, because nothing really waits for this to complete compared // to spawn_ondemand_logical_size_calculation. let cancel = CancellationToken::new(); + + /// Ugly, but necessary until `spawn_blocking` is used for blocking I/O, otherwise + /// we could lock up all worker threads. + static GLOBAL_INITIAL_LOGICAL_SIZES_AT_ONCE: once_cell::sync::Lazy> = once_cell::sync::Lazy::new(|| { + let cores = std::thread::available_parallelism(); + // half rationale: we have other blocking work which will start later: + // consumption metrics and per timeline eviction task. we however need to + // be fast to accept page reads, so perhaps this is a suitable middle ground? + let max_blocked_threads = cores.map(|count| count.get() / 2); + let max_blocked_threads = max_blocked_threads.unwrap_or(1); + let max_blocked_threads = std::cmp::max(1, max_blocked_threads); + tracing::info!("using max {max_blocked_threads} threads for initial logical size"); + Arc::new(tokio::sync::Semaphore::new(max_blocked_threads)) + }); + + let _permit = GLOBAL_INITIAL_LOGICAL_SIZES_AT_ONCE.clone().acquire_owned().await.expect("global semaphore is never closed"); + let calculated_size = match self_clone .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel) .await From db1435536779b37a37da57774f07a233975bdd25 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 30 May 2023 10:40:37 +0300 Subject: [PATCH 29/59] revert: static global init logical size limiter (#4368) added in #4366. 
revert for testing without it; it may have unintenteded side-effects, and it's very difficult to understand the results from the 10k load testing environments. earlier results: https://github.com/neondatabase/neon/pull/4366#issuecomment-1567491064 --- pageserver/src/tenant/timeline.rs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0569bd45e0ce..5c889e804ccc 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1911,22 +1911,6 @@ impl Timeline { // to spawn_ondemand_logical_size_calculation. let cancel = CancellationToken::new(); - /// Ugly, but necessary until `spawn_blocking` is used for blocking I/O, otherwise - /// we could lock up all worker threads. - static GLOBAL_INITIAL_LOGICAL_SIZES_AT_ONCE: once_cell::sync::Lazy> = once_cell::sync::Lazy::new(|| { - let cores = std::thread::available_parallelism(); - // half rationale: we have other blocking work which will start later: - // consumption metrics and per timeline eviction task. we however need to - // be fast to accept page reads, so perhaps this is a suitable middle ground? - let max_blocked_threads = cores.map(|count| count.get() / 2); - let max_blocked_threads = max_blocked_threads.unwrap_or(1); - let max_blocked_threads = std::cmp::max(1, max_blocked_threads); - tracing::info!("using max {max_blocked_threads} threads for initial logical size"); - Arc::new(tokio::sync::Semaphore::new(max_blocked_threads)) - }); - - let _permit = GLOBAL_INITIAL_LOGICAL_SIZES_AT_ONCE.clone().acquire_owned().await.expect("global semaphore is never closed"); - let calculated_size = match self_clone .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel) .await From daa79b150f8d81430d9d3aa92c9cdc8c5568e377 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 30 May 2023 14:05:41 +0100 Subject: [PATCH 30/59] Code Coverage: store lcov report (#4358) ## Problem In the future, we want to compare code coverage on a PR with coverage on the main branch. Currently, we store only code coverage HTML reports, I suggest we start storing reports in "lcov info" format that we can use/parse in the future. 
Currently, the file size is ~7Mb (it's a text-based format and could be compressed into a ~400Kb archive) - More about "lcov info" format: https://manpages.ubuntu.com/manpages/jammy/man1/geninfo.1.html#files - Part of https://github.com/neondatabase/neon/issues/3543 ## Summary of changes - Change `scripts/coverage` to output lcov coverage to `report/lcov.info` file instead of stdout (we already upload the whole `report/` directory to S3) --- .github/workflows/build_and_test.yml | 11 ++++++++--- scripts/coverage | 21 +++++++++++++++------ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e00b98250c33..b732095f8fb2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -492,19 +492,24 @@ jobs: env: COMMIT_URL: ${{ github.server_url }}/${{ github.repository }}/commit/${{ github.event.pull_request.head.sha || github.sha }} run: | - scripts/coverage \ - --dir=/tmp/coverage report \ + scripts/coverage --dir=/tmp/coverage \ + report \ --input-objects=/tmp/coverage/binaries.list \ --commit-url=${COMMIT_URL} \ --format=github + scripts/coverage --dir=/tmp/coverage \ + report \ + --input-objects=/tmp/coverage/binaries.list \ + --format=lcov + - name: Upload coverage report id: upload-coverage-report env: BUCKET: neon-github-public-dev COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} run: | - aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://neon-github-public-dev/code-coverage/${COMMIT_SHA} + aws s3 cp --only-show-errors --recursive /tmp/coverage/report s3://${BUCKET}/code-coverage/${COMMIT_SHA} REPORT_URL=https://${BUCKET}.s3.amazonaws.com/code-coverage/${COMMIT_SHA}/index.html echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT diff --git a/scripts/coverage b/scripts/coverage index 1dc92e57cc4e..52a69c93b945 100755 --- a/scripts/coverage +++ b/scripts/coverage @@ -156,7 +156,9 @@ class LLVM: profdata: Path, objects: List[str], sources: List[str], - demangler: Optional[Path] = None) -> None: + demangler: Optional[Path] = None, + output_file: Optional[Path] = None, + ) -> None: cwd = self.cargo.cwd objects = list(intersperse('-object', objects)) @@ -180,14 +182,18 @@ class LLVM: *objects, *sources, ] - subprocess.check_call(cmd, cwd=cwd) + if output_file is not None: + with output_file.open('w') as outfile: + subprocess.check_call(cmd, cwd=cwd, stdout=outfile) + else: + subprocess.check_call(cmd, cwd=cwd) def cov_report(self, **kwargs) -> None: self._cov(subcommand='report', **kwargs) - def cov_export(self, *, kind: str, **kwargs) -> None: + def cov_export(self, *, kind: str, output_file: Optional[Path], **kwargs) -> None: extras = (f'-format={kind}', ) - self._cov(subcommand='export', *extras, **kwargs) + self._cov(subcommand='export', *extras, output_file=output_file, **kwargs) def cov_show(self, *, kind: str, output_dir: Optional[Path] = None, **kwargs) -> None: extras = [f'-format={kind}'] @@ -283,9 +289,12 @@ class TextReport(Report): self.llvm.cov_show(kind='text', **self._common_kwargs()) +@dataclass class LcovReport(Report): + output_file: Path + def generate(self) -> None: - self.llvm.cov_export(kind='lcov', **self._common_kwargs()) + self.llvm.cov_export(kind='lcov', output_file=self.output_file, **self._common_kwargs()) @dataclass @@ -475,7 +484,7 @@ class State: 'text': lambda: TextReport(**params), 'lcov': - lambda: LcovReport(**params), + lambda: LcovReport(**params, output_file=self.report_dir / 'lcov.info'), 
'summary': lambda: SummaryReport(**params), 'github': From 210be6b6aba377592aa9aaefcea51c6427d22dac Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 30 May 2023 16:08:02 +0300 Subject: [PATCH 31/59] Replace broker duration logs with metrics (#4370) I've added logs for broker push duration after every iteration in https://github.com/neondatabase/neon/pull/4142. This log has not found any real issues, so we can replace it with metrics, to slightly reduce log volume. LogQL query found that pushes longer that 500ms happened only 90 times for the last month. https://neonprod.grafana.net/goto/KTNj9UwVg?orgId=1 `{unit="safekeeper.service"} |= "timeline updates to broker in" | regexp "to broker in (?P.*)" | duration > 500ms` --- safekeeper/src/broker.rs | 12 ++++++++++-- safekeeper/src/metrics.rs | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/broker.rs b/safekeeper/src/broker.rs index 5e25d22ec157..48c56ee58f57 100644 --- a/safekeeper/src/broker.rs +++ b/safekeeper/src/broker.rs @@ -19,8 +19,10 @@ use tokio::task::JoinHandle; use tokio::{runtime, time::sleep}; use tracing::*; +use crate::metrics::BROKER_ITERATION_TIMELINES; use crate::metrics::BROKER_PULLED_UPDATES; use crate::metrics::BROKER_PUSHED_UPDATES; +use crate::metrics::BROKER_PUSH_ALL_UPDATES_SECONDS; use crate::GlobalTimelines; use crate::SafeKeeperConf; @@ -61,8 +63,14 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> { BROKER_PUSHED_UPDATES.inc(); } let elapsed = now.elapsed(); - // Log duration every second. Should be about 10MB of logs per day. - info!("pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed); + + BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64()); + BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64); + + if elapsed > push_interval / 2 { + info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed); + } + sleep(push_interval).await; } }; diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 189af2b0449d..235a88501d20 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -125,6 +125,25 @@ pub static BACKUP_ERRORS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_backup_errors_total counter") }); +pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_broker_push_update_seconds", + "Seconds to push all timeline updates to the broker", + DISK_WRITE_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_broker_push_update_seconds histogram vec") +}); +pub const TIMELINES_COUNT_BUCKETS: &[f64] = &[ + 1.0, 10.0, 50.0, 100.0, 200.0, 500.0, 1000.0, 2000.0, 5000.0, 10000.0, 20000.0, 50000.0, +]; +pub static BROKER_ITERATION_TIMELINES: Lazy = Lazy::new(|| { + register_histogram!( + "safekeeper_broker_iteration_timelines", + "Count of timelines pushed to the broker in a single iteration", + TIMELINES_COUNT_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_broker_iteration_timelines histogram vec") +}); pub const LABEL_UNKNOWN: &str = "unknown"; From f4db85de404f2bba8d8cede7440a1fa654d53ce6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 30 May 2023 16:25:07 +0300 Subject: [PATCH 32/59] Continued startup speedup (#4372) Startup continues to be slow, work towards to alleviate it. 
Summary of changes: - pretty the functional improvements from #4366 into `utils::completion::{Completion, Barrier}` - extend "initial load completion" usage up to tenant background tasks - previously only global background tasks - spawn_blocking the tenant load directory traversal - demote some logging - remove some unwraps - propagate some spans to `spawn_blocking` Runtime effects should be major speedup to loading, but after that, the `BACKGROUND_RUNTIME` will be blocked for a long time (minutes). Possible follow-ups: - complete initial tenant sizes before allowing background tasks to block the `BACKGROUND_RUNTIME` --- libs/utils/src/completion.rs | 33 ++++ libs/utils/src/lib.rs | 3 + pageserver/src/bin/pageserver.rs | 11 +- pageserver/src/disk_usage_eviction_task.rs | 6 +- pageserver/src/tenant.rs | 207 +++++++++++---------- pageserver/src/tenant/mgr.rs | 9 +- pageserver/src/tenant/tasks.rs | 7 +- pageserver/src/tenant/timeline.rs | 19 +- test_runner/regress/test_tenants.py | 2 +- 9 files changed, 181 insertions(+), 116 deletions(-) create mode 100644 libs/utils/src/completion.rs diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs new file mode 100644 index 000000000000..2cdaee548e65 --- /dev/null +++ b/libs/utils/src/completion.rs @@ -0,0 +1,33 @@ +use std::sync::Arc; + +use tokio::sync::{mpsc, Mutex}; + +/// While a reference is kept around, the associated [`Barrier::wait`] will wait. +/// +/// Can be cloned, moved and kept around in futures as "guard objects". +#[derive(Clone)] +pub struct Completion(mpsc::Sender<()>); + +/// Barrier will wait until all clones of [`Completion`] have been dropped. +#[derive(Clone)] +pub struct Barrier(Arc>>); + +impl Barrier { + pub async fn wait(self) { + self.0.lock().await.recv().await; + } + + pub async fn maybe_wait(barrier: Option) { + if let Some(b) = barrier { + b.wait().await + } + } +} + +/// Create new Guard and Barrier pair. +pub fn channel() -> (Completion, Barrier) { + let (tx, rx) = mpsc::channel::<()>(1); + let rx = Mutex::new(rx); + let rx = Arc::new(rx); + (Completion(tx), Barrier(rx)) +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 4e4f79ab6b0e..69d3a1b9f2db 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -60,6 +60,9 @@ pub mod tracing_span_assert; pub mod rate_limit; +/// Simple once-barrier and a guard which keeps barrier awaiting. +pub mod completion; + mod failpoint_macro_helpers { /// use with fail::cfg("$name", "return(2000)") diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index cbc97e722816..a2cebffc83e2 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -338,8 +338,7 @@ fn start_pageserver( // All tenant load operations carry this while they are ongoing; it will be dropped once those // operations finish either successfully or in some other manner. However, the initial load // will be then done, and we can start the global background tasks. 
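A sketch, not part of the patch: the `utils::completion` pair introduced above replaces the hand-rolled `mpsc` sender/receiver plus `Arc<Mutex<Receiver>>` from #4366. The practical difference is that `Barrier` is cheaply cloneable, so each background task can hold its own copy and await it instead of sharing one locked receiver. Assuming the `utils` crate from this patch is on the dependency path (variable names here are illustrative), usage looks roughly like this:

```rust
use utils::completion;

#[tokio::main]
async fn main() {
    let (init_done, init_done_barrier) = completion::channel();

    for i in 0..3 {
        // Each startup task keeps a Completion clone alive while it runs.
        let guard = init_done.clone();
        tokio::spawn(async move {
            let _guard = guard; // dropped when the task finishes
            println!("load {i} done");
        });
    }
    drop(init_done); // only the startup tasks now hold the barrier closed

    // Barrier is Clone: hand one copy to each waiter; every copy resolves
    // once the last Completion has been dropped.
    init_done_barrier.clone().wait().await;
    init_done_barrier.wait().await; // later waiters resolve immediately
    println!("all startup tasks finished");
}
```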
- let (init_done_tx, init_done_rx) = tokio::sync::mpsc::channel::<()>(1); - let init_done_rx = Arc::new(tokio::sync::Mutex::new(init_done_rx)); + let (init_done_tx, init_done_rx) = utils::completion::channel(); // Scan the local 'tenants/' directory and start loading the tenants let init_started_at = std::time::Instant::now(); @@ -347,14 +346,13 @@ fn start_pageserver( conf, broker_client.clone(), remote_storage.clone(), - init_done_tx, + (init_done_tx, init_done_rx.clone()), ))?; BACKGROUND_RUNTIME.spawn({ let init_done_rx = init_done_rx.clone(); async move { - let init_done = async move { init_done_rx.lock().await.recv().await }; - init_done.await; + init_done_rx.wait().await; let elapsed = init_started_at.elapsed(); @@ -435,8 +433,7 @@ fn start_pageserver( // this is because we only process active tenants and timelines, and the // Timeline::get_current_logical_size will spawn the logical size calculation, // which will not be rate-limited. - let init_done = async move { init_done_rx.lock().await.recv().await }; - init_done.await; + init_done_rx.wait().await; pageserver::consumption_metrics::collect_metrics( metric_collection_endpoint, diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 03589691994a..1a8886935c8b 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -54,6 +54,7 @@ use serde::{Deserialize, Serialize}; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn, Instrument}; +use utils::completion; use utils::serde_percent::Percent; use crate::{ @@ -82,7 +83,7 @@ pub fn launch_disk_usage_global_eviction_task( conf: &'static PageServerConf, storage: GenericRemoteStorage, state: Arc, - init_done_rx: Arc>>, + init_done: completion::Barrier, ) -> anyhow::Result<()> { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); @@ -100,8 +101,7 @@ pub fn launch_disk_usage_global_eviction_task( false, async move { // wait until initial load is complete, because we cannot evict from loading tenants. - let init_done = async move { init_done_rx.lock().await.recv().await }; - init_done.await; + init_done.wait().await; disk_usage_eviction_task( &state, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d6eb82410755..ff975db6018d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -20,6 +20,7 @@ use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio::task::JoinSet; use tracing::*; +use utils::completion; use utils::crashsafe::path_with_suffix_extension; use std::cmp::min; @@ -653,7 +654,7 @@ impl Tenant { match tenant_clone.attach(&ctx).await { Ok(()) => { info!("attach finished, activating"); - tenant_clone.activate(broker_client, &ctx); + tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { error!("attach failed, setting tenant state to Broken: {:?}", e); @@ -889,15 +890,17 @@ impl Tenant { /// If the loading fails for some reason, the Tenant will go into Broken /// state. 
/// - #[instrument(skip(conf, remote_storage, ctx), fields(tenant_id=%tenant_id))] + #[instrument(skip_all, fields(tenant_id=%tenant_id))] pub fn spawn_load( conf: &'static PageServerConf, tenant_id: TenantId, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, - init_done_tx: Option>, + init_done: Option<(completion::Completion, completion::Barrier)>, ctx: &RequestContext, ) -> Arc { + debug_assert_current_span_has_tenant_id(); + let tenant_conf = match Self::load_tenant_config(conf, tenant_id) { Ok(conf) => conf, Err(e) => { @@ -931,11 +934,15 @@ impl Tenant { async move { // keep the sender alive as long as we have the initial load ongoing; it will be // None for loads spawned after init_tenant_mgr. - let _init_done_tx = init_done_tx; + let (_tx, rx) = if let Some((tx, rx)) = init_done { + (Some(tx), Some(rx)) + } else { + (None, None) + }; match tenant_clone.load(&ctx).await { Ok(()) => { - info!("load finished, activating"); - tenant_clone.activate(broker_client, &ctx); + debug!("load finished, activating"); + tenant_clone.activate(broker_client, rx.as_ref(), &ctx); } Err(err) => { error!("load failed, setting tenant state to Broken: {err:?}"); @@ -954,8 +961,6 @@ impl Tenant { }), ); - info!("spawned load into background"); - tenant } @@ -967,7 +972,7 @@ impl Tenant { async fn load(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { debug_assert_current_span_has_tenant_id(); - info!("loading tenant task"); + debug!("loading tenant task"); utils::failpoint_sleep_millis_async!("before-loading-tenant"); @@ -977,102 +982,109 @@ impl Tenant { // // Scan the directory, peek into the metadata file of each timeline, and // collect a list of timelines and their ancestors. - let mut timelines_to_load: HashMap = HashMap::new(); - let timelines_dir = self.conf.timelines_path(&self.tenant_id); - for entry in std::fs::read_dir(&timelines_dir).with_context(|| { - format!( - "Failed to list timelines directory for tenant {}", - self.tenant_id - ) - })? { - let entry = entry.with_context(|| { - format!("cannot read timeline dir entry for {}", self.tenant_id) - })?; - let timeline_dir = entry.path(); + let tenant_id = self.tenant_id; + let conf = self.conf; + let span = info_span!("blocking"); - if crate::is_temporary(&timeline_dir) { - info!( - "Found temporary timeline directory, removing: {}", - timeline_dir.display() - ); - if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { - error!( - "Failed to remove temporary directory '{}': {:?}", - timeline_dir.display(), - e + let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || { + let _g = span.entered(); + let mut timelines_to_load: HashMap = HashMap::new(); + let timelines_dir = conf.timelines_path(&tenant_id); + + for entry in + std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")? 
+ { + let entry = entry.context("read timeline dir entry")?; + let timeline_dir = entry.path(); + + if crate::is_temporary(&timeline_dir) { + info!( + "Found temporary timeline directory, removing: {}", + timeline_dir.display() ); - } - } else if is_uninit_mark(&timeline_dir) { - let timeline_uninit_mark_file = &timeline_dir; - info!( - "Found an uninit mark file {}, removing the timeline and its uninit mark", - timeline_uninit_mark_file.display() - ); - let timeline_id = timeline_uninit_mark_file - .file_stem() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { - format!( + if let Err(e) = std::fs::remove_dir_all(&timeline_dir) { + error!( + "Failed to remove temporary directory '{}': {:?}", + timeline_dir.display(), + e + ); + } + } else if is_uninit_mark(&timeline_dir) { + let timeline_uninit_mark_file = &timeline_dir; + info!( + "Found an uninit mark file {}, removing the timeline and its uninit mark", + timeline_uninit_mark_file.display() + ); + let timeline_id = timeline_uninit_mark_file + .file_stem() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( "Could not parse timeline id out of the timeline uninit mark name {}", timeline_uninit_mark_file.display() ) - })?; - let timeline_dir = self.conf.timeline_path(&timeline_id, &self.tenant_id); - if let Err(e) = - remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) - { - error!("Failed to clean up uninit marked timeline: {e:?}"); - } - } else { - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { - format!( - "Could not parse timeline id out of the timeline dir name {}", - timeline_dir.display() - ) - })?; - let timeline_uninit_mark_file = self - .conf - .timeline_uninit_mark_file_path(self.tenant_id, timeline_id); - if timeline_uninit_mark_file.exists() { - info!( - "Found an uninit mark file for timeline {}/{}, removing the timeline and its uninit mark", - self.tenant_id, timeline_id - ); + })?; + let timeline_dir = conf.timeline_path(&timeline_id, &tenant_id); if let Err(e) = - remove_timeline_and_uninit_mark(&timeline_dir, &timeline_uninit_mark_file) + remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file) { error!("Failed to clean up uninit marked timeline: {e:?}"); } - continue; - } - - let file_name = entry.file_name(); - if let Ok(timeline_id) = - file_name.to_str().unwrap_or_default().parse::() - { - let metadata = load_metadata(self.conf, timeline_id, self.tenant_id) - .context("failed to load metadata")?; - timelines_to_load.insert(timeline_id, metadata); } else { - // A file or directory that doesn't look like a timeline ID - warn!( - "unexpected file or directory in timelines directory: {}", - file_name.to_string_lossy() - ); + let timeline_id = timeline_dir + .file_name() + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| { + format!( + "Could not parse timeline id out of the timeline dir name {}", + timeline_dir.display() + ) + })?; + let timeline_uninit_mark_file = + conf.timeline_uninit_mark_file_path(tenant_id, timeline_id); + if timeline_uninit_mark_file.exists() { + info!( + %timeline_id, + "Found an uninit mark file, removing the timeline and its uninit mark", + ); + if let Err(e) = remove_timeline_and_uninit_mark( + &timeline_dir, + &timeline_uninit_mark_file, + ) { + error!("Failed to clean up uninit marked timeline: {e:?}"); + } + continue; + } + + let file_name = entry.file_name(); + if 
let Ok(timeline_id) = + file_name.to_str().unwrap_or_default().parse::() + { + let metadata = load_metadata(conf, timeline_id, tenant_id) + .context("failed to load metadata")?; + timelines_to_load.insert(timeline_id, metadata); + } else { + // A file or directory that doesn't look like a timeline ID + warn!( + "unexpected file or directory in timelines directory: {}", + file_name.to_string_lossy() + ); + } } } - } - // Sort the array of timeline IDs into tree-order, so that parent comes before - // all its children. - let sorted_timelines = tree_sort_timelines(timelines_to_load)?; + // Sort the array of timeline IDs into tree-order, so that parent comes before + // all its children. + tree_sort_timelines(timelines_to_load) + }) + .await + .context("load spawn_blocking") + .and_then(|res| res)?; + // FIXME original collect_timeline_files contained one more check: // 1. "Timeline has no ancestor and no layer files" @@ -1082,7 +1094,7 @@ impl Tenant { .with_context(|| format!("load local timeline {timeline_id}"))?; } - info!("Done"); + trace!("Done"); Ok(()) } @@ -1670,7 +1682,12 @@ impl Tenant { } /// Changes tenant status to active, unless shutdown was already requested. - fn activate(self: &Arc, broker_client: BrokerClientChannel, ctx: &RequestContext) { + fn activate( + self: &Arc, + broker_client: BrokerClientChannel, + init_done: Option<&completion::Barrier>, + ctx: &RequestContext, + ) { debug_assert_current_span_has_tenant_id(); let mut activating = false; @@ -1701,7 +1718,7 @@ impl Tenant { // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. - tasks::start_background_loops(self); + tasks::start_background_loops(self, init_done); let mut activated_timelines = 0; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index d74a025bbb11..d3cd91403708 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -25,6 +25,7 @@ use crate::tenant::{ }; use crate::IGNORED_TENANT_FILE_NAME; +use utils::completion; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; @@ -66,7 +67,7 @@ pub async fn init_tenant_mgr( conf: &'static PageServerConf, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, - init_done_tx: tokio::sync::mpsc::Sender<()>, + init_done: (completion::Completion, completion::Barrier), ) -> anyhow::Result<()> { // Scan local filesystem for attached tenants let tenants_dir = conf.tenants_path(); @@ -123,7 +124,7 @@ pub async fn init_tenant_mgr( &tenant_dir_path, broker_client.clone(), remote_storage.clone(), - Some(init_done_tx.clone()), + Some(init_done.clone()), &ctx, ) { Ok(tenant) => { @@ -159,7 +160,7 @@ pub fn schedule_local_tenant_processing( tenant_path: &Path, broker_client: storage_broker::BrokerClientChannel, remote_storage: Option, - init_done_tx: Option>, + init_done: Option<(completion::Completion, completion::Barrier)>, ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( @@ -218,7 +219,7 @@ pub fn schedule_local_tenant_processing( tenant_id, broker_client, remote_storage, - init_done_tx, + init_done, ctx, ) }; diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b3c8a4a3bbd3..02aed11114d2 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -12,8 +12,9 @@ use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::{Tenant, TenantState}; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::completion; -pub fn 
start_background_loops(tenant: &Arc) { +pub fn start_background_loops(tenant: &Arc, init_done: Option<&completion::Barrier>) { let tenant_id = tenant.tenant_id; task_mgr::spawn( BACKGROUND_RUNTIME.handle(), @@ -24,7 +25,9 @@ pub fn start_background_loops(tenant: &Arc) { false, { let tenant = Arc::clone(tenant); + let init_done = init_done.cloned(); async move { + completion::Barrier::maybe_wait(init_done).await; compaction_loop(tenant) .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) .await; @@ -41,7 +44,9 @@ pub fn start_background_loops(tenant: &Arc) { false, { let tenant = Arc::clone(tenant); + let init_done = init_done.cloned(); async move { + completion::Barrier::maybe_wait(init_done).await; gc_loop(tenant) .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) .await; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5c889e804ccc..ee7b002450e4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2728,7 +2728,7 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, @@ -2752,9 +2752,14 @@ impl Timeline { // normal case, write out a L0 delta layer file. let this = self.clone(); let frozen_layer = frozen_layer.clone(); - let (delta_path, metadata) = - tokio::task::spawn_blocking(move || this.create_delta_layer(&frozen_layer)) - .await??; + let span = tracing::info_span!("blocking"); + let (delta_path, metadata) = tokio::task::spawn_blocking(move || { + let _g = span.entered(); + this.create_delta_layer(&frozen_layer) + }) + .await + .context("create_delta_layer spawn_blocking") + .and_then(|res| res)?; HashMap::from([(delta_path, metadata)]) }; @@ -3523,14 +3528,18 @@ impl Timeline { let this = self.clone(); let ctx_inner = ctx.clone(); let layer_removal_cs_inner = layer_removal_cs.clone(); + let span = tracing::info_span!("blocking"); let CompactLevel0Phase1Result { new_layers, deltas_to_compact, } = tokio::task::spawn_blocking(move || { + let _g = span.entered(); this.compact_level0_phase1(layer_removal_cs_inner, target_file_size, &ctx_inner) }) .await - .unwrap()?; + .context("compact_level0_phase1 spawn_blocking") + .map_err(CompactionError::Other) + .and_then(|res| res)?; if new_layers.is_empty() && deltas_to_compact.is_empty() { // nothing to do diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 59b7b574cdf2..15712b9e5506 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -309,7 +309,7 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*marking .* as locally complete, while it doesnt exist in remote index.*" ) - env.pageserver.allowed_errors.append(".*load failed.*Failed to list timelines directory.*") + env.pageserver.allowed_errors.append(".*load failed.*list timelines directory.*") client = env.pageserver.http_client() From b190c3e6c3771005ac761e71a9635092c3addc93 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 30 May 2023 20:11:44 +0300 Subject: [PATCH 33/59] reduce flakiness by allowing Compaction failed, retrying in X queue is in state Stopped. 
(#4379) resolves https://github.com/neondatabase/neon/issues/4374 by adding the error to allowed_errors --- test_runner/fixtures/neon_fixtures.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6b97c33ae4e2..1007cb11b5ba 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1621,6 +1621,8 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant", # these can happen anytime we do compactions from background task and shutdown pageserver r".*ERROR.*ancestor timeline \S+ is being stopped", + # this is expected given our collaborative shutdown approach for the UploadQueue + ".*Compaction failed, retrying in .*: queue is in state Stopped.*", ] def start( From b6447462dc72b8634cf122c76d3e155c2f6b5d60 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 31 May 2023 12:23:00 -0400 Subject: [PATCH 34/59] Fix layer map correctness bug (#4342) --- .../layer_map/historic_layer_coverage.rs | 29 ++++++++++++++++ .../src/tenant/layer_map/layer_coverage.rs | 33 ++++++++++++------- 2 files changed, 50 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index b63c36131465..49dcbc63c2b7 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -204,6 +204,35 @@ fn test_off_by_one() { assert_eq!(version.image_coverage.query(5), None); } +/// White-box regression test, checking for incorrect removal of node at key.end +#[test] +fn test_regression() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 0..5, + lsn: 0..5, + is_image: false, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 0..5, + lsn: 1..2, + is_image: false, + }, + "Layer 2".to_string(), + ); + + // If an insertion operation improperly deletes the endpoint of a previous layer + // (which is more likely to happen with layers that collide on key.end), we will + // end up with an infinite layer, covering the entire keyspace. Here we assert + // that there's no layer at key 100 because we didn't insert any layer there. + let version = map.get_version(100).unwrap(); + assert_eq!(version.delta_coverage.query(100), None); +} + /// Cover edge cases where layers begin or end on the same key #[test] fn test_key_collision() { diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs index 4e3b4516dc3c..9d9d1d6ccf6c 100644 --- a/pageserver/src/tenant/layer_map/layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -10,19 +10,22 @@ use rpds::RedBlackTreeMapSync; /// - iterate the latest layers in a key range /// - insert layers in non-decreasing lsn.start order /// -/// The struct is parameterized over Value for easier -/// testing, but in practice it's some sort of layer. +/// For a detailed explanation and justification of this approach, see: +/// https://neon.tech/blog/persistent-structures-in-neons-wal-indexing +/// +/// NOTE The struct is parameterized over Value for easier +/// testing, but in practice it's some sort of layer. pub struct LayerCoverage { /// For every change in coverage (as we sweep the key space) /// we store (lsn.end, value). 
/// - /// We use an immutable/persistent tree so that we can keep historic - /// versions of this coverage without cloning the whole thing and - /// incurring quadratic memory cost. See HistoricLayerCoverage. + /// NOTE We use an immutable/persistent tree so that we can keep historic + /// versions of this coverage without cloning the whole thing and + /// incurring quadratic memory cost. See HistoricLayerCoverage. /// - /// We use the Sync version of the map because we want Self to - /// be Sync. Using nonsync might be faster, if we can work with - /// that. + /// NOTE We use the Sync version of the map because we want Self to + /// be Sync. Using nonsync might be faster, if we can work with + /// that. nodes: RedBlackTreeMapSync>, } @@ -41,6 +44,13 @@ impl LayerCoverage { /// Helper function to subdivide the key range without changing any values /// + /// This operation has no semantic effect by itself. It only helps us pin in + /// place the part of the coverage we don't want to change when inserting. + /// + /// As an analogy, think of a polygon. If you add a vertex along one of the + /// segments, the polygon is still the same, but it behaves differently when + /// we move or delete one of the other points. + /// /// Complexity: O(log N) fn add_node(&mut self, key: i128) { let value = match self.nodes.range(..=key).last() { @@ -74,7 +84,7 @@ impl LayerCoverage { let mut to_update = Vec::new(); let mut to_remove = Vec::new(); let mut prev_covered = false; - for (k, node) in self.nodes.range(key.clone()) { + for (k, node) in self.nodes.range(key) { let needs_cover = match node { None => true, Some((h, _)) => h < &lsn.end, @@ -87,9 +97,8 @@ impl LayerCoverage { } prev_covered = needs_cover; } - if !prev_covered { - to_remove.push(key.end); - } + // TODO check if the nodes inserted at key.start and key.end are safe + // to remove. It's fine to keep them but they could be redundant. for k in to_update { self.nodes.insert_mut(k, Some((lsn.end, value.clone()))); } From 952d6e43a21d3b1ff7e5c602007fe06725c2e1bc Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 31 May 2023 21:37:20 +0300 Subject: [PATCH 35/59] =?UTF-8?q?Add=20pageserver=20parameter=20forced=5Fi?= =?UTF-8?q?mage=5Fcreation=5Flimit=20which=20can=20be=20used=E2=80=A6=20(#?= =?UTF-8?q?4353)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This parameter can be use to restrict number of image layers generated because of GC request (wanted image layers). Been set to zero it completely eliminates creation of such image layers. So it allows to avoid extra storage consumption after merging #3673 ## Problem PR #3673 forces generation of missed image layers. So i short term is cause cause increase (in worst case up to two times) size of storage. It was intended (by me) that GC period is comparable with PiTR interval. But looks like it is not the case now - GC is performed much more frequently. It may cause the problem with space exhaustion: GC forces new image creation while large PiTR still prevent GC from collecting old layers. ## Summary of changes Add new pageserver parameter` forced_image_creation_limit` which restrict number of created image layers which are requested by GC. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? 
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- control_plane/src/pageserver.rs | 10 ++++++++++ libs/pageserver_api/src/models.rs | 2 ++ pageserver/src/config.rs | 10 +++++++++- pageserver/src/tenant.rs | 1 + pageserver/src/tenant/config.rs | 8 ++++++++ pageserver/src/tenant/timeline.rs | 10 +++++++++- test_runner/regress/test_attach_tenant_config.py | 1 + 7 files changed, 40 insertions(+), 2 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 149cfd00cba0..400df60f0e50 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -369,6 +369,11 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), + gc_feedback: settings + .remove("gc_feedback") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_feedback' as bool")?, }; // If tenant ID was not specified, generate one @@ -463,6 +468,11 @@ impl PageServerNode { evictions_low_residence_duration_metric_threshold: settings .remove("evictions_low_residence_duration_metric_threshold") .map(|x| x.to_string()), + gc_feedback: settings + .remove("gc_feedback") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_feedback' as bool")?, } }; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0b4457a9a5b9..162bf6b294fa 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -223,6 +223,7 @@ pub struct TenantConfig { pub eviction_policy: Option, pub min_resident_size_override: Option, pub evictions_low_residence_duration_metric_threshold: Option, + pub gc_feedback: Option, } #[serde_as] @@ -281,6 +282,7 @@ impl TenantConfigRequest { eviction_policy: None, min_resident_size_override: None, evictions_low_residence_duration_metric_threshold: None, + gc_feedback: None, }; TenantConfigRequest { tenant_id, config } } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 88a7f15b2142..02763c9b7d7b 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -108,7 +108,7 @@ pub mod defaults { #min_resident_size_override = .. # in bytes #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' - +#gc_feedback = false # [remote_storage] "### @@ -828,6 +828,14 @@ impl PageServerConf { )?); } + if let Some(gc_feedback) = item.get("gc_feedback") { + t_conf.gc_feedback = Some( + gc_feedback + .as_bool() + .with_context(|| "configure option gc_feedback is not a bool".to_string())?, + ); + } + Ok(t_conf) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ff975db6018d..af6a70c4f28b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3209,6 +3209,7 @@ pub mod harness { evictions_low_residence_duration_metric_threshold: Some( tenant_conf.evictions_low_residence_duration_metric_threshold, ), + gc_feedback: Some(tenant_conf.gc_feedback), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 50de316bc45e..80d153661a9c 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -99,6 +99,7 @@ pub struct TenantConf { // See the corresponding metric's help string. 
#[serde(with = "humantime_serde")] pub evictions_low_residence_duration_metric_threshold: Duration, + pub gc_feedback: bool, } /// Same as TenantConf, but this struct preserves the information about @@ -175,6 +176,10 @@ pub struct TenantConfOpt { #[serde(with = "humantime_serde")] #[serde(default)] pub evictions_low_residence_duration_metric_threshold: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub gc_feedback: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -242,6 +247,7 @@ impl TenantConfOpt { evictions_low_residence_duration_metric_threshold: self .evictions_low_residence_duration_metric_threshold .unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold), + gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback), } } } @@ -278,6 +284,7 @@ impl Default for TenantConf { DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, ) .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), + gc_feedback: false, } } } @@ -372,6 +379,7 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { ))?, ); } + tenant_conf.gc_feedback = request_data.gc_feedback; Ok(tenant_conf) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ee7b002450e4..36c4b0bcd4d0 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1291,6 +1291,13 @@ impl Timeline { .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold) } + fn get_gc_feedback(&self) -> bool { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .gc_feedback + .unwrap_or(self.conf.default_tenant_conf.gc_feedback) + } + pub(super) fn tenant_conf_updated(&self) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. @@ -3124,6 +3131,7 @@ impl Timeline { let mut layers = self.layers.write().unwrap(); let mut updates = layers.batch_update(); let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); + for l in image_layers { let path = l.filename(); let metadata = timeline_path @@ -3896,7 +3904,7 @@ impl Timeline { // delta layers. Image layers can form "stairs" preventing old image from been deleted. // But image layers are in any case less sparse than delta layers. Also we need some // protection from replacing recent image layers with new one after each GC iteration. 
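                    // Note (illustration, not part of the patch): `gc_feedback` defaults to
                    // `false`, so after this change GC records "wanted image layers" only when
                    // the operator opts in, e.g. with `gc_feedback = true` in the tenant config.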
- if l.is_incremental() && !LayerMap::is_l0(&*l) { + if self.get_gc_feedback() && l.is_incremental() && !LayerMap::is_l0(&*l) { wanted_image_layers.add_range(l.get_key_range()); } result.layers_not_updated += 1; diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index eb2ba3e9ed1e..6261ec28dbdf 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -158,6 +158,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "threshold": "23h", }, "evictions_low_residence_duration_metric_threshold": "2days", + "gc_feedback": True, "gc_horizon": 23 * (1024 * 1024), "gc_period": "2h 13m", "image_creation_threshold": 7, From 330083638fef3db1c2162d62b6bdbd47f890a18b Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 31 May 2023 22:04:46 -0400 Subject: [PATCH 36/59] Fix stale and misleading comment in LayerMap (#4297) --- pageserver/src/tenant/layer_map/layer_coverage.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs index 9d9d1d6ccf6c..47aace97a52a 100644 --- a/pageserver/src/tenant/layer_map/layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -1,8 +1,8 @@ use std::ops::Range; -// TODO the `im` crate has 20x more downloads and also has -// persistent/immutable BTree. It also runs a bit faster but -// results are not the same on some tests. +// NOTE the `im` crate has 20x more downloads and also has +// persistent/immutable BTree. But it's bugged so rpds is a +// better choice https://github.com/neondatabase/neon/issues/3395 use rpds::RedBlackTreeMapSync; /// Data structure that can efficiently: From 36fee50f4d376f0f665a2f005b0ab4b122bba323 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 1 Jun 2023 20:12:07 +0300 Subject: [PATCH 37/59] compute_ctl: enable tracing panic hook (#4375) compute_ctl can panic, but `tracing` is used for logging. panic stderr output can interleave with messages from normal logging. The fix is to use the established way (pageserver, safekeeper, storage_broker) of using `tracing` to report panics. --- compute_tools/src/logger.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 1b5cf647b0b8..f6fc88296830 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -33,5 +33,7 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { .init(); tracing::info!("logging and tracing started"); + utils::logging::replace_panic_hook_with_tracing_panic_hook().forget(); + Ok(()) } From 82484e82415c85da2dac32c924c514d95efb81ab Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 1 Jun 2023 14:46:04 -0400 Subject: [PATCH 38/59] pgserver: add more metrics for better observability (#4323) ## Problem This PR includes doc changes to the current metrics as well as adding new metrics. With the new set of metrics, we can quantitatively analyze the read amp., write amp. and space amp. in the system, when used together with https://github.com/neondatabase/neonbench close https://github.com/neondatabase/neon/issues/4312 ref https://github.com/neondatabase/neon/issues/4347 compaction metrics TBD, a novel idea is to print L0 file number and number of layers in the system, and we can do this in the future when we start working on compaction. 
## Summary of changes * Add `READ_NUM_FS_LAYERS` for computing read amp. * Add `MATERIALIZED_PAGE_CACHE_HIT_UPON_REQUEST`. * Add `GET_RECONSTRUCT_DATA_TIME`. GET_RECONSTRUCT_DATA_TIME + RECONSTRUCT_TIME + WAIT_LSN_TIME should be approximately total time of reads. * Add `5.0` and `10.0` to `STORAGE_IO_TIME_BUCKETS` given some fsync runs slow (i.e., > 1s) in some cases. * Some `WAL_REDO` metrics are only used when Postgres is involved in the redo process. --------- Signed-off-by: Alex Chi --- pageserver/src/metrics.rs | 55 +++++++++++++++++++++++++++++-- pageserver/src/tenant/timeline.rs | 15 ++++++++- test_runner/fixtures/metrics.py | 7 ++++ 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 75bea9dbab33..cc444c479a46 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -84,6 +84,16 @@ pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_read_num_fs_layers", + "Number of persistent layers accessed for processing a read request, including those in the cache", + &["tenant_id", "timeline_id"], + vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0], + ) + .expect("failed to define a metric") +}); + // Metrics collected on operations on the storage repository. static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { register_histogram_vec!( @@ -95,6 +105,25 @@ static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_materialized_cache_hits_direct_total", + "Number of cache hits from materialized page cache without redo", + &["tenant_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_getpage_get_reconstruct_data_seconds", + "Time spent in get_reconstruct_value_data", + &["tenant_id", "timeline_id"], + CRITICAL_OP_BUCKETS.into(), + ) + .expect("failed to define a metric") +}); + static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_materialized_cache_hits_total", @@ -354,6 +383,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[ 0.001000, // 1000 usec 0.030, // 30 ms 1.000, // 1000 ms + 30.000, // 30000 ms ]; const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[ @@ -622,7 +652,7 @@ pub static WAL_REDO_TIME: Lazy = Lazy::new(|| { pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_wait_seconds", - "Time spent waiting for access to the WAL redo process", + "Time spent waiting for access to the Postgres WAL redo process", redo_histogram_time_buckets!(), ) .expect("failed to define a metric") @@ -631,7 +661,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_records_histogram", - "Histogram of number of records replayed per redo", + "Histogram of number of records replayed per redo in the Postgres WAL redo process", redo_histogram_count_buckets!(), ) .expect("failed to define a metric") @@ -640,7 +670,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { pub static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_bytes_histogram", - "Histogram of number of records replayed per redo", + "Histogram 
of number of records replayed per redo sent to Postgres", redo_bytes_histogram_count_buckets!(), ) .expect("failed to define a metric") @@ -723,7 +753,9 @@ pub struct TimelineMetrics { tenant_id: String, timeline_id: String, pub reconstruct_time_histo: Histogram, + pub get_reconstruct_data_time_histo: Histogram, pub materialized_page_cache_hit_counter: GenericCounter, + pub materialized_page_cache_hit_upon_request_counter: GenericCounter, pub flush_time_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, pub create_images_time_histo: StorageTimeMetrics, @@ -734,6 +766,7 @@ pub struct TimelineMetrics { pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, pub resident_physical_size_gauge: UIntGauge, + pub read_num_fs_layers: Histogram, /// copy of LayeredTimeline.current_logical_size pub current_logical_size_gauge: UIntGauge, pub num_persistent_files_created: IntCounter, @@ -753,6 +786,9 @@ impl TimelineMetrics { let reconstruct_time_histo = RECONSTRUCT_TIME .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -794,6 +830,12 @@ impl TimelineMetrics { let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); + let read_num_fs_layers = READ_NUM_FS_LAYERS + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); + let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT + .get_metric_with_label_values(&[&tenant_id, &timeline_id]) + .unwrap(); let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); @@ -801,7 +843,9 @@ impl TimelineMetrics { tenant_id, timeline_id, reconstruct_time_histo, + get_reconstruct_data_time_histo, materialized_page_cache_hit_counter, + materialized_page_cache_hit_upon_request_counter, flush_time_histo, compact_time_histo, create_images_time_histo, @@ -819,6 +863,7 @@ impl TimelineMetrics { evictions_with_low_residence_duration: std::sync::RwLock::new( evictions_with_low_residence_duration, ), + read_num_fs_layers, } } } @@ -828,7 +873,9 @@ impl Drop for TimelineMetrics { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]); + let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]); + let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]); let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]); let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]); @@ -836,6 +883,8 @@ impl Drop for TimelineMetrics { let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]); let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]); + let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]); + self.evictions_with_low_residence_duration .write() .unwrap() diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 
36c4b0bcd4d0..8885e761a21d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -525,7 +525,12 @@ impl Timeline { Some((cached_lsn, cached_img)) => { match cached_lsn.cmp(&lsn) { Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Equal => { + self.metrics + .materialized_page_cache_hit_upon_request_counter + .inc(); + return Ok(cached_img); // exact LSN match, return the image + } Ordering::Greater => { unreachable!("the returned lsn should never be after the requested lsn") } @@ -540,8 +545,10 @@ impl Timeline { img: cached_page_img, }; + let timer = self.metrics.get_reconstruct_data_time_histo.start_timer(); self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; + timer.stop_and_record(); self.metrics .reconstruct_time_histo @@ -2261,6 +2268,9 @@ impl Timeline { let mut timeline_owned; let mut timeline = self; + let mut read_count = + scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64)); + // For debugging purposes, collect the path of layers that we traversed // through. It's included in the error message if we fail to find the key. let mut traversal_path = Vec::::new(); @@ -2395,6 +2405,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; + // metrics: open_layer does not count as fs access, so we are not updating `read_count` traversal_path.push(( result, cont_lsn, @@ -2421,6 +2432,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; + // metrics: open_layer does not count as fs access, so we are not updating `read_count` traversal_path.push(( result, cont_lsn, @@ -2455,6 +2467,7 @@ impl Timeline { Err(e) => return Err(PageReconstructError::from(e)), }; cont_lsn = lsn_floor; + *read_count += 1; traversal_path.push(( result, cont_lsn, diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 0e958ddd0624..b4c237cfa61c 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -65,12 +65,19 @@ def parse_metrics(text: str, name: str = "") -> Metrics: "pageserver_getpage_reconstruct_seconds_bucket", "pageserver_getpage_reconstruct_seconds_count", "pageserver_getpage_reconstruct_seconds_sum", + "pageserver_getpage_get_reconstruct_data_seconds_bucket", + "pageserver_getpage_get_reconstruct_data_seconds_count", + "pageserver_getpage_get_reconstruct_data_seconds_sum", "pageserver_io_operations_bytes_total", "pageserver_io_operations_seconds_bucket", "pageserver_io_operations_seconds_count", "pageserver_io_operations_seconds_sum", "pageserver_last_record_lsn", "pageserver_materialized_cache_hits_total", + "pageserver_materialized_cache_hits_direct_total", + "pageserver_read_num_fs_layers_bucket", + "pageserver_read_num_fs_layers_count", + "pageserver_read_num_fs_layers_sum", "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", From 66cdba990a364f7ca3d2a95a6c042bbfbb9ade87 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Thu, 1 Jun 2023 15:06:28 -0400 Subject: [PATCH 39/59] refactor: use PersistentLayerDesc for persistent layers (#4398) ## Problem Part of https://github.com/neondatabase/neon/issues/4373 ## Summary of changes This PR adds `PersistentLayerDesc`, which will be used in LayerMap mapping and probably layer cache. 
After this PR and after we change LayerMap to map to layer desc, we can safely drop RemoteLayerDesc. --------- Signed-off-by: Alex Chi Co-authored-by: bojanserafimov --- pageserver/src/tenant/storage_layer.rs | 17 ++- .../src/tenant/storage_layer/delta_layer.rs | 119 ++++++++------- .../src/tenant/storage_layer/filename.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 122 +++++++++------- .../src/tenant/storage_layer/layer_desc.rs | 109 ++++++++++++++ .../src/tenant/storage_layer/remote_layer.rs | 135 +++++++----------- pageserver/src/tenant/timeline.rs | 5 +- 7 files changed, 307 insertions(+), 204 deletions(-) create mode 100644 pageserver/src/tenant/storage_layer/layer_desc.rs diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 3ca8e28c1662..7c071463de60 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -4,6 +4,7 @@ pub mod delta_layer; mod filename; mod image_layer; mod inmemory_layer; +mod layer_desc; mod remote_layer; use crate::config::PageServerConf; @@ -37,6 +38,7 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter}; pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; +pub use layer_desc::PersistentLayerDesc; pub use remote_layer::RemoteLayer; use super::layer_map::BatchedUpdates; @@ -406,14 +408,23 @@ pub type LayerKeyIter<'i> = Box + 'i>; /// An image layer is a snapshot of all the data in a key-range, at a single /// LSN. pub trait PersistentLayer: Layer { - fn get_tenant_id(&self) -> TenantId; + /// Get the layer descriptor. + fn layer_desc(&self) -> &PersistentLayerDesc; + + fn get_tenant_id(&self) -> TenantId { + self.layer_desc().tenant_id + } /// Identify the timeline this layer belongs to - fn get_timeline_id(&self) -> TimelineId; + fn get_timeline_id(&self) -> TimelineId { + self.layer_desc().timeline_id + } /// File name used for this layer, both in the pageserver's local filesystem /// state as well as in the remote storage. - fn filename(&self) -> LayerFileName; + fn filename(&self) -> LayerFileName { + self.layer_desc().filename() + } // Path to the layer file in the local filesystem. // `None` for `RemoteLayer`. 
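// Illustrative sketch (not part of this patch): with the default methods above,
// code holding any persistent layer can derive its identity and file name purely
// from the shared descriptor. The helper and formatting below are hypothetical.
//
//     fn describe(layer: &dyn PersistentLayer) -> String {
//         format!(
//             "{}/{}/{}",
//             layer.get_tenant_id(),
//             layer.get_timeline_id(),
//             layer.filename().file_name(),
//         )
//     }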
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 63b8e57bb04d..5f2fb1ebea16 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -56,8 +56,8 @@ use utils::{ }; use super::{ - DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerFileName, LayerIter, - LayerKeyIter, PathOrConf, + DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter, + PathOrConf, PersistentLayerDesc, }; /// @@ -89,10 +89,10 @@ impl From<&DeltaLayer> for Summary { magic: DELTA_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenant_id: layer.tenant_id, - timeline_id: layer.timeline_id, - key_range: layer.key_range.clone(), - lsn_range: layer.lsn_range.clone(), + tenant_id: layer.desc.tenant_id, + timeline_id: layer.desc.timeline_id, + key_range: layer.desc.key_range.clone(), + lsn_range: layer.desc.lsn_range.clone(), index_start_blk: 0, index_root_blk: 0, @@ -180,10 +180,7 @@ impl DeltaKey { pub struct DeltaLayer { path_or_conf: PathOrConf, - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub key_range: Range, - pub lsn_range: Range, + pub desc: PersistentLayerDesc, pub file_size: u64, @@ -197,8 +194,8 @@ impl std::fmt::Debug for DeltaLayer { use super::RangeDisplayDebug; f.debug_struct("DeltaLayer") - .field("key_range", &RangeDisplayDebug(&self.key_range)) - .field("lsn_range", &self.lsn_range) + .field("key_range", &RangeDisplayDebug(&self.desc.key_range)) + .field("lsn_range", &self.desc.lsn_range) .field("file_size", &self.file_size) .field("inner", &self.inner) .finish() @@ -228,30 +225,16 @@ impl std::fmt::Debug for DeltaLayerInner { } impl Layer for DeltaLayer { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - self.lsn_range.clone() - } - fn is_incremental(&self) -> bool { - true - } - - fn short_id(&self) -> String { - self.filename().file_name() - } /// debugging function to print out the contents of the layer fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { println!( "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", - self.tenant_id, - self.timeline_id, - self.key_range.start, - self.key_range.end, - self.lsn_range.start, - self.lsn_range.end + self.desc.tenant_id, + self.desc.timeline_id, + self.desc.key_range.start, + self.desc.key_range.end, + self.desc.lsn_range.start, + self.desc.lsn_range.end ); if !verbose { @@ -324,10 +307,10 @@ impl Layer for DeltaLayer { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - ensure!(lsn_range.start >= self.lsn_range.start); + ensure!(lsn_range.start >= self.desc.lsn_range.start); let mut need_image = true; - ensure!(self.key_range.contains(&key)); + ensure!(self.desc.key_range.contains(&key)); { // Open the file and lock the metadata in memory @@ -402,19 +385,31 @@ impl Layer for DeltaLayer { Ok(ValueReconstructResult::Complete) } } -} -impl PersistentLayer for DeltaLayer { - fn get_tenant_id(&self) -> TenantId { - self.tenant_id + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_key_range(&self) -> Range { + self.layer_desc().key_range.clone() } - fn get_timeline_id(&self) -> TimelineId { - self.timeline_id + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. 
+ fn get_lsn_range(&self) -> Range { + self.layer_desc().lsn_range.clone() } - fn filename(&self) -> LayerFileName { - self.layer_name().into() + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn is_incremental(&self) -> bool { + self.layer_desc().is_incremental + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn short_id(&self) -> String { + self.layer_desc().short_id() + } +} + +impl PersistentLayer for DeltaLayer { + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.desc } fn local_path(&self) -> Option { @@ -602,10 +597,12 @@ impl DeltaLayer { ) -> DeltaLayer { DeltaLayer { path_or_conf: PathOrConf::Conf(conf), - timeline_id, - tenant_id, - key_range: filename.key_range.clone(), - lsn_range: filename.lsn_range.clone(), + desc: PersistentLayerDesc::new_delta( + tenant_id, + timeline_id, + filename.key_range.clone(), + filename.lsn_range.clone(), + ), file_size, access_stats, inner: RwLock::new(DeltaLayerInner { @@ -632,10 +629,12 @@ impl DeltaLayer { Ok(DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), - timeline_id: summary.timeline_id, - tenant_id: summary.tenant_id, - key_range: summary.key_range, - lsn_range: summary.lsn_range, + desc: PersistentLayerDesc::new_delta( + summary.tenant_id, + summary.timeline_id, + summary.key_range, + summary.lsn_range, + ), file_size: metadata.len(), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(DeltaLayerInner { @@ -648,18 +647,14 @@ impl DeltaLayer { } fn layer_name(&self) -> DeltaFileName { - DeltaFileName { - key_range: self.key_range.clone(), - lsn_range: self.lsn_range.clone(), - } + self.desc.delta_file_name() } - /// Path to the layer file in pageserver workdir. pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.timeline_id, - self.tenant_id, + self.desc.timeline_id, + self.desc.tenant_id, &self.layer_name(), ) } @@ -803,10 +798,12 @@ impl DeltaLayerWriterInner { // set inner.file here. The first read will have to re-open it. 
let layer = DeltaLayer { path_or_conf: PathOrConf::Conf(self.conf), - tenant_id: self.tenant_id, - timeline_id: self.timeline_id, - key_range: self.key_start..key_end, - lsn_range: self.lsn_range.clone(), + desc: PersistentLayerDesc::new_delta( + self.tenant_id, + self.timeline_id, + self.key_start..key_end, + self.lsn_range.clone(), + ), file_size: metadata.len(), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), inner: RwLock::new(DeltaLayerInner { diff --git a/pageserver/src/tenant/storage_layer/filename.rs b/pageserver/src/tenant/storage_layer/filename.rs index e2112fc388e1..5dcd54689ed1 100644 --- a/pageserver/src/tenant/storage_layer/filename.rs +++ b/pageserver/src/tenant/storage_layer/filename.rs @@ -9,6 +9,8 @@ use std::str::FromStr; use utils::lsn::Lsn; +use super::PersistentLayerDesc; + // Note: Timeline::load_layer_map() relies on this sort order #[derive(PartialEq, Eq, Clone, Hash)] pub struct DeltaFileName { @@ -153,7 +155,7 @@ impl Ord for ImageFileName { impl ImageFileName { pub fn lsn_as_range(&self) -> Range { // Saves from having to copypaste this all over - self.lsn..(self.lsn + 1) + PersistentLayerDesc::image_layer_lsn_range(self.lsn) } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index a5dd16fae225..b55dd08a6ded 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -52,8 +52,8 @@ use utils::{ lsn::Lsn, }; -use super::filename::{ImageFileName, LayerFileName}; -use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf}; +use super::filename::ImageFileName; +use super::{Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc}; /// /// Header stored in the beginning of the file @@ -84,9 +84,9 @@ impl From<&ImageLayer> for Summary { Self { magic: IMAGE_FILE_MAGIC, format_version: STORAGE_FORMAT_VERSION, - tenant_id: layer.tenant_id, - timeline_id: layer.timeline_id, - key_range: layer.key_range.clone(), + tenant_id: layer.desc.tenant_id, + timeline_id: layer.desc.timeline_id, + key_range: layer.desc.key_range.clone(), lsn: layer.lsn, index_start_blk: 0, @@ -104,14 +104,13 @@ impl From<&ImageLayer> for Summary { /// and it needs to be loaded before using it in queries. 
pub struct ImageLayer { path_or_conf: PathOrConf, - pub tenant_id: TenantId, - pub timeline_id: TimelineId, - pub key_range: Range, - pub file_size: u64, - // This entry contains an image of all pages as of this LSN + pub desc: PersistentLayerDesc, + // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn pub lsn: Lsn, + pub file_size: u64, + access_stats: LayerAccessStats, inner: RwLock, @@ -122,7 +121,7 @@ impl std::fmt::Debug for ImageLayer { use super::RangeDisplayDebug; f.debug_struct("ImageLayer") - .field("key_range", &RangeDisplayDebug(&self.key_range)) + .field("key_range", &RangeDisplayDebug(&self.desc.key_range)) .field("file_size", &self.file_size) .field("lsn", &self.lsn) .field("inner", &self.inner) @@ -153,27 +152,15 @@ impl std::fmt::Debug for ImageLayerInner { } impl Layer for ImageLayer { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - // End-bound is exclusive - self.lsn..(self.lsn + 1) - } - fn is_incremental(&self) -> bool { - false - } - - fn short_id(&self) -> String { - self.filename().file_name() - } - /// debugging function to print out the contents of the layer fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> { println!( "----- image layer for ten {} tli {} key {}-{} at {} ----", - self.tenant_id, self.timeline_id, self.key_range.start, self.key_range.end, self.lsn + self.desc.tenant_id, + self.desc.timeline_id, + self.desc.key_range.start, + self.desc.key_range.end, + self.lsn ); if !verbose { @@ -203,7 +190,7 @@ impl Layer for ImageLayer { reconstruct_state: &mut ValueReconstructState, ctx: &RequestContext, ) -> anyhow::Result { - assert!(self.key_range.contains(&key)); + assert!(self.desc.key_range.contains(&key)); assert!(lsn_range.start >= self.lsn); assert!(lsn_range.end >= self.lsn); @@ -230,24 +217,37 @@ impl Layer for ImageLayer { Ok(ValueReconstructResult::Missing) } } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_key_range(&self) -> Range { + self.layer_desc().key_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_lsn_range(&self) -> Range { + self.layer_desc().lsn_range.clone() + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn is_incremental(&self) -> bool { + self.layer_desc().is_incremental + } + + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn short_id(&self) -> String { + self.layer_desc().short_id() + } } impl PersistentLayer for ImageLayer { - fn filename(&self) -> LayerFileName { - self.layer_name().into() + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.desc } fn local_path(&self) -> Option { Some(self.path()) } - fn get_tenant_id(&self) -> TenantId { - self.tenant_id - } - - fn get_timeline_id(&self) -> TimelineId { - self.timeline_id - } fn iter(&self, _ctx: &RequestContext) -> Result> { unimplemented!(); } @@ -405,9 +405,13 @@ impl ImageLayer { ) -> ImageLayer { ImageLayer { path_or_conf: PathOrConf::Conf(conf), - timeline_id, - tenant_id, - key_range: filename.key_range.clone(), + desc: PersistentLayerDesc::new_img( + tenant_id, + timeline_id, + filename.key_range.clone(), + filename.lsn, + false, + ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. 
lsn: filename.lsn, file_size, access_stats, @@ -433,9 +437,13 @@ impl ImageLayer { .context("get file metadata to determine size")?; Ok(ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), - timeline_id: summary.timeline_id, - tenant_id: summary.tenant_id, - key_range: summary.key_range, + desc: PersistentLayerDesc::new_img( + summary.tenant_id, + summary.timeline_id, + summary.key_range, + summary.lsn, + false, + ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: summary.lsn, file_size: metadata.len(), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), @@ -449,18 +457,15 @@ impl ImageLayer { } fn layer_name(&self) -> ImageFileName { - ImageFileName { - key_range: self.key_range.clone(), - lsn: self.lsn, - } + self.desc.image_file_name() } /// Path to the layer file in pageserver workdir. pub fn path(&self) -> PathBuf { Self::path_for( &self.path_or_conf, - self.timeline_id, - self.tenant_id, + self.desc.timeline_id, + self.desc.tenant_id, &self.layer_name(), ) } @@ -484,6 +489,7 @@ struct ImageLayerWriterInner { tenant_id: TenantId, key_range: Range, lsn: Lsn, + is_incremental: bool, blob_writer: WriteBlobWriter, tree: DiskBtreeBuilder, @@ -499,6 +505,7 @@ impl ImageLayerWriterInner { tenant_id: TenantId, key_range: &Range, lsn: Lsn, + is_incremental: bool, ) -> anyhow::Result { // Create the file initially with a temporary filename. // We'll atomically rename it to the final name when we're done. @@ -533,6 +540,7 @@ impl ImageLayerWriterInner { lsn, tree: tree_builder, blob_writer, + is_incremental, }; Ok(writer) @@ -570,6 +578,14 @@ impl ImageLayerWriterInner { file.write_all(buf.as_ref())?; } + let desc = PersistentLayerDesc::new_img( + self.tenant_id, + self.timeline_id, + self.key_range.clone(), + self.lsn, + self.is_incremental, // for now, image layer ALWAYS covers the full range + ); + // Fill in the summary on blk 0 let summary = Summary { magic: IMAGE_FILE_MAGIC, @@ -593,9 +609,7 @@ impl ImageLayerWriterInner { // set inner.file here. The first read will have to re-open it. let layer = ImageLayer { path_or_conf: PathOrConf::Conf(self.conf), - timeline_id: self.timeline_id, - tenant_id: self.tenant_id, - key_range: self.key_range.clone(), + desc, lsn: self.lsn, file_size: metadata.len(), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), @@ -667,6 +681,7 @@ impl ImageLayerWriter { tenant_id: TenantId, key_range: &Range, lsn: Lsn, + is_incremental: bool, ) -> anyhow::Result { Ok(Self { inner: Some(ImageLayerWriterInner::new( @@ -675,6 +690,7 @@ impl ImageLayerWriter { tenant_id, key_range, lsn, + is_incremental, )?), }) } diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs new file mode 100644 index 000000000000..a9859681d320 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -0,0 +1,109 @@ +use std::ops::Range; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +use crate::repository::Key; + +use super::{DeltaFileName, ImageFileName, LayerFileName}; + +/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the +/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides +/// a unified way to generate layer information like file name. 
+#[derive(Debug, PartialEq, Eq, Clone)] +pub struct PersistentLayerDesc { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub key_range: Range, + /// For image layer, this is `[lsn, lsn+1)`. + pub lsn_range: Range, + /// Whether this is a delta layer. + pub is_delta: bool, + /// Whether this layer only contains page images for part of the keys in the range. In the current implementation, this should + /// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be + /// incremental. + pub is_incremental: bool, +} + +impl PersistentLayerDesc { + pub fn short_id(&self) -> String { + self.filename().file_name() + } + + pub fn new_img( + tenant_id: TenantId, + timeline_id: TimelineId, + key_range: Range, + lsn: Lsn, + is_incremental: bool, + ) -> Self { + Self { + tenant_id, + timeline_id, + key_range, + lsn_range: Self::image_layer_lsn_range(lsn), + is_delta: false, + is_incremental, + } + } + + pub fn new_delta( + tenant_id: TenantId, + timeline_id: TimelineId, + key_range: Range, + lsn_range: Range, + ) -> Self { + Self { + tenant_id, + timeline_id, + key_range, + lsn_range, + is_delta: true, + is_incremental: true, + } + } + + /// Get the LSN that the image layer covers. + pub fn image_layer_lsn(&self) -> Lsn { + assert!(!self.is_delta); + assert!(self.lsn_range.start + 1 == self.lsn_range.end); + self.lsn_range.start + } + + /// Get the LSN range corresponding to a single image layer LSN. + pub fn image_layer_lsn_range(lsn: Lsn) -> Range { + lsn..(lsn + 1) + } + + /// Get a delta file name for this layer. + /// + /// Panic: if this is not a delta layer. + pub fn delta_file_name(&self) -> DeltaFileName { + assert!(self.is_delta); + DeltaFileName { + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), + } + } + + /// Get an image file name for this layer.
+ /// + /// Panic: if this is not an image layer, or the lsn range is invalid + pub fn image_file_name(&self) -> ImageFileName { + assert!(!self.is_delta); + assert!(self.lsn_range.start + 1 == self.lsn_range.end); + ImageFileName { + key_range: self.key_range.clone(), + lsn: self.lsn_range.start, + } + } + + pub fn filename(&self) -> LayerFileName { + if self.is_delta { + self.delta_file_name().into() + } else { + self.image_file_name().into() + } + } +} diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index 2106587ab20b..ff0f44da929c 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -18,11 +18,10 @@ use utils::{ lsn::Lsn, }; -use super::filename::{DeltaFileName, ImageFileName, LayerFileName}; -use super::image_layer::ImageLayer; +use super::filename::{DeltaFileName, ImageFileName}; use super::{ - DeltaLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter, - LayerResidenceStatus, PersistentLayer, + DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter, + LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, }; /// RemoteLayer is a not yet downloaded [`ImageLayer`] or @@ -34,19 +33,10 @@ use super::{ /// /// See: [`crate::context::RequestContext`] for authorization to download pub struct RemoteLayer { - tenantid: TenantId, - timelineid: TimelineId, - key_range: Range, - lsn_range: Range, - - pub file_name: LayerFileName, + pub desc: PersistentLayerDesc, pub layer_metadata: LayerFileMetadata, - is_delta: bool, - - is_incremental: bool, - access_stats: LayerAccessStats, pub(crate) ongoing_download: Arc, @@ -66,22 +56,14 @@ pub struct RemoteLayer { impl std::fmt::Debug for RemoteLayer { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("RemoteLayer") - .field("file_name", &self.file_name) + .field("file_name", &self.desc.filename()) .field("layer_metadata", &self.layer_metadata) - .field("is_incremental", &self.is_incremental) + .field("is_incremental", &self.desc.is_incremental) .finish() } } impl Layer for RemoteLayer { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - self.lsn_range.clone() - } - fn get_value_reconstruct_data( &self, _key: Key, @@ -95,53 +77,45 @@ impl Layer for RemoteLayer { ); } - fn is_incremental(&self) -> bool { - self.is_incremental - } - /// debugging function to print out the contents of the layer fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> { println!( "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----", - self.tenantid, - self.timelineid, - self.key_range.start, - self.key_range.end, - self.lsn_range.start, - self.lsn_range.end + self.desc.tenant_id, + self.desc.timeline_id, + self.desc.key_range.start, + self.desc.key_range.end, + self.desc.lsn_range.start, + self.desc.lsn_range.end ); Ok(()) } - fn short_id(&self) -> String { - self.filename().file_name() + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn get_key_range(&self) -> Range { + self.layer_desc().key_range.clone() } -} -impl PersistentLayer for RemoteLayer { - fn get_tenant_id(&self) -> TenantId { - self.tenantid + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. 
+ fn get_lsn_range(&self) -> Range { + self.layer_desc().lsn_range.clone() } - fn get_timeline_id(&self) -> TimelineId { - self.timelineid + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn is_incremental(&self) -> bool { + self.layer_desc().is_incremental } - fn filename(&self) -> LayerFileName { - if self.is_delta { - DeltaFileName { - key_range: self.key_range.clone(), - lsn_range: self.lsn_range.clone(), - } - .into() - } else { - ImageFileName { - key_range: self.key_range.clone(), - lsn: self.lsn_range.start, - } - .into() - } + /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers. + fn short_id(&self) -> String { + self.layer_desc().short_id() + } +} + +impl PersistentLayer for RemoteLayer { + fn layer_desc(&self) -> &PersistentLayerDesc { + &self.desc } fn local_path(&self) -> Option { @@ -176,7 +150,7 @@ impl PersistentLayer for RemoteLayer { let layer_file_name = self.filename().file_name(); let lsn_range = self.get_lsn_range(); - if self.is_delta { + if self.desc.is_delta { HistoricLayerInfo::Delta { layer_file_name, layer_file_size: self.layer_metadata.file_size(), @@ -210,13 +184,13 @@ impl RemoteLayer { access_stats: LayerAccessStats, ) -> RemoteLayer { RemoteLayer { - tenantid, - timelineid, - key_range: fname.key_range.clone(), - lsn_range: fname.lsn_as_range(), - is_delta: false, - is_incremental: false, - file_name: fname.to_owned().into(), + desc: PersistentLayerDesc::new_img( + tenantid, + timelineid, + fname.key_range.clone(), + fname.lsn, + false, + ), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), download_replacement_failure: std::sync::atomic::AtomicBool::default(), @@ -232,13 +206,12 @@ impl RemoteLayer { access_stats: LayerAccessStats, ) -> RemoteLayer { RemoteLayer { - tenantid, - timelineid, - key_range: fname.key_range.clone(), - lsn_range: fname.lsn_range.clone(), - is_delta: true, - is_incremental: true, - file_name: fname.to_owned().into(), + desc: PersistentLayerDesc::new_delta( + tenantid, + timelineid, + fname.key_range.clone(), + fname.lsn_range.clone(), + ), layer_metadata: layer_metadata.clone(), ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)), download_replacement_failure: std::sync::atomic::AtomicBool::default(), @@ -256,15 +229,12 @@ impl RemoteLayer { where L: ?Sized + Layer, { - if self.is_delta { - let fname = DeltaFileName { - key_range: self.key_range.clone(), - lsn_range: self.lsn_range.clone(), - }; + if self.desc.is_delta { + let fname = self.desc.delta_file_name(); Arc::new(DeltaLayer::new( conf, - self.timelineid, - self.tenantid, + self.desc.timeline_id, + self.desc.tenant_id, &fname, file_size, self.access_stats.clone_for_residence_change( @@ -273,14 +243,11 @@ impl RemoteLayer { ), )) } else { - let fname = ImageFileName { - key_range: self.key_range.clone(), - lsn: self.lsn_range.start, - }; + let fname = self.desc.image_file_name(); Arc::new(ImageLayer::new( conf, - self.timelineid, - self.tenantid, + self.desc.timeline_id, + self.desc.tenant_id, &fname, file_size, self.access_stats.clone_for_residence_change( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8885e761a21d..b45dcc4e42a9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2533,7 +2533,7 @@ impl Timeline { (DownloadBehavior::Error, false) => { return Err(PageReconstructError::NeedsDownload( TenantTimelineId::new(self.tenant_id, self.timeline_id), - 
remote_layer.file_name.clone(), + remote_layer.filename(), )) } } @@ -3066,6 +3066,7 @@ impl Timeline { self.tenant_id, &img_range, lsn, + false, // image layer always covers the full range )?; fail_point!("image-layer-writer-fail-before-finish", |_| { @@ -4126,7 +4127,7 @@ impl Timeline { // Does retries + exponential back-off internally. // When this fails, don't layer further retry attempts here. let result = remote_client - .download_layer_file(&remote_layer.file_name, &remote_layer.layer_metadata) + .download_layer_file(&remote_layer.filename(), &remote_layer.layer_metadata) .await; if let Ok(size) = &result { From ef80a902c88e6357fcca71c2e282edd876adbf28 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 2 Jun 2023 14:59:10 +0300 Subject: [PATCH 40/59] pg_sni_router: add session_id to more messages (#4403) See superceded #4390. - capture log in test - expand the span to cover init and error reporting - remove obvious logging by logging only unexpected --- proxy/src/bin/pg_sni_router.rs | 22 ++++++++++++---------- test_runner/regress/test_sni_router.py | 9 ++++++++- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index bba2d51caf4b..a5f50cc7c1b1 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -17,7 +17,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tokio_util::sync::CancellationToken; use utils::{project_git_version, sentry_init::init_sentry}; -use tracing::{error, info, warn}; +use tracing::{error, info, warn, Instrument}; project_git_version!(GIT_VERSION); @@ -141,7 +141,6 @@ async fn task_main( tokio::select! { accept_result = listener.accept() => { let (socket, peer_addr) = accept_result?; - info!("accepted postgres client connection from {peer_addr}"); let session_id = uuid::Uuid::new_v4(); let tls_config = Arc::clone(&tls_config); @@ -149,18 +148,18 @@ async fn task_main( connections.spawn( async move { - info!("spawned a task for {peer_addr}"); - socket .set_nodelay(true) .context("failed to set socket option")?; - handle_client(dest_suffix, tls_config, session_id, socket).await + info!(%peer_addr, "serving"); + handle_client(dest_suffix, tls_config, socket).await } .unwrap_or_else(|e| { // Acknowledge that the task has finished with an error. error!("per-client task finished with an error: {e:#}"); - }), + }) + .instrument(tracing::info_span!("handle_client", ?session_id)) ); } _ = cancellation_token.cancelled() => { @@ -192,7 +191,6 @@ async fn ssl_handshake( let mut stream = PqStream::new(Stream::from_raw(raw_stream)); let msg = stream.read_startup_packet().await?; - info!("received {msg:?}"); use pq_proto::FeStartupPacket::*; match msg { @@ -215,15 +213,19 @@ async fn ssl_handshake( } Ok(raw.upgrade(tls_config).await?) } - _ => stream.throw_error_str(ERR_INSECURE_CONNECTION).await?, + unexpected => { + info!( + ?unexpected, + "unexpected startup packet, rejecting connection" + ); + stream.throw_error_str(ERR_INSECURE_CONNECTION).await? 
+ } } } -#[tracing::instrument(fields(session_id = ?session_id), skip_all)] async fn handle_client( dest_suffix: Arc, tls_config: Arc, - session_id: uuid::Uuid, stream: impl AsyncRead + AsyncWrite + Unpin, ) -> anyhow::Result<()> { let tls_stream = ssl_handshake(stream, tls_config).await?; diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index f3aa429c49ea..9b78e8287ecf 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -37,6 +37,7 @@ def __init__( destination: str, tls_cert: Path, tls_key: Path, + test_output_dir: Path, ): # Must use a hostname rather than IP here, for SNI to work host = "localhost" @@ -49,6 +50,7 @@ def __init__( self.tls_cert = tls_cert self.tls_key = tls_key self._popen: Optional[subprocess.Popen[bytes]] = None + self.test_output_dir = test_output_dir def start(self) -> "PgSniRouter": assert self._popen is None @@ -60,8 +62,12 @@ def start(self) -> "PgSniRouter": *["--destination", self.destination], ] - self._popen = subprocess.Popen(args) + router_log_path = self.test_output_dir / "pg_sni_router.log" + router_log = open(router_log_path, "w") + + self._popen = subprocess.Popen(args, stderr=router_log) self._wait_until_ready() + log.info(f"pg_sni_router started, log file: {router_log_path}") return self @backoff.on_exception(backoff.expo, OSError, max_time=10) @@ -121,6 +127,7 @@ def test_pg_sni_router( destination="localtest.me", tls_cert=test_output_dir / "router.crt", tls_key=test_output_dir / "router.key", + test_output_dir=test_output_dir, ) as router: router.start() From 9787227c35d6e79b8b8328dc39b1a5592441fcc4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 2 Jun 2023 08:28:13 -0400 Subject: [PATCH 41/59] Shield HTTP request handlers from async cancellations. (#4314) We now spawn a new task for every HTTP request, and wait on the JoinHandle. If Hyper drops the Future, the spawned task will keep running. This protects the rest of the pageserver code from unexpected async cancellations. This creates a CancellationToken for each request and passes it to the handler function. If the HTTP request is dropped by the client, the CancellationToken is signaled. None of the handler functions make use for the CancellationToken currently, but they now they could. The CancellationToken arguments also work like documentation. When you're looking at a function signature and you see that it takes a CancellationToken as argument, it's a nice hint that the function might run for a long time, and won't be async cancelled. The default assumption in the pageserver is now that async functions are not cancellation-safe anyway, unless explictly marked as such, but this is a nice extra reminder. Spawning a task for each request is OK from a performance point of view because spawning is very cheap in Tokio, and none of our HTTP requests are very performance critical anyway. Fixes issue #3478 --- docs/pageserver-thread-mgmt.md | 4 +- libs/utils/src/http/endpoint.rs | 6 + pageserver/src/http/routes.rs | 319 ++++++++++++++------ pageserver/src/tenant/mgr.rs | 1 - test_runner/regress/test_timeline_delete.py | 24 +- 5 files changed, 250 insertions(+), 104 deletions(-) diff --git a/docs/pageserver-thread-mgmt.md b/docs/pageserver-thread-mgmt.md index 0cc897f1542f..b91193352892 100644 --- a/docs/pageserver-thread-mgmt.md +++ b/docs/pageserver-thread-mgmt.md @@ -52,9 +52,7 @@ completion, or shield the rest of the code from surprise cancellations by spawning a separate task. 
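Boiled down, the spawn-plus-drop-guard arrangement described in this patch looks roughly like the sketch below. This is a simplified stand-in rather than the pageserver code itself (the `handle` and `serve_one_request` names are invented for the example): the outer future only awaits the JoinHandle, so dropping it leaves the spawned task running, while the drop guard cancels the task's CancellationToken so the task can notice that the client went away.

```rust
use tokio_util::sync::CancellationToken;

// Placeholder for the real handler body; it keeps running even if the
// caller's future is dropped mid-way, and may watch `cancel` if it wants.
async fn handle(cancel: CancellationToken) {
    let _ = cancel; // real code could select on cancel.cancelled()
}

async fn serve_one_request() -> Result<(), tokio::task::JoinError> {
    let token = CancellationToken::new();
    // If this future is dropped before disarm(), the guard cancels `token`.
    let cancel_guard = token.clone().drop_guard();
    let join = tokio::spawn(handle(token));
    let result = join.await;
    cancel_guard.disarm();
    result
}
```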
The code that handles incoming HTTP requests, for example, spawns a separate task for each request, because Hyper will drop the request-handling Future if the HTTP -connection is lost. (FIXME: our HTTP handlers do not do that -currently, but we should fix that. See [issue -3478](https://github.com/neondatabase/neon/issues/3478)). +connection is lost. #### How to cancel, then? diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index db3642b507e7..7cb96d909458 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -40,6 +40,12 @@ struct RequestId(String); /// /// This also handles errors, logging them and converting them to an HTTP error response. /// +/// NB: If the client disconnects, Hyper will drop the Future, without polling it to +/// completion. In other words, the handler must be async cancellation safe! request_span +/// prints a warning to the log when that happens, so that you have some trace of it in +/// the log. +/// +/// /// There could be other ways to implement similar functionality: /// /// * procmacros placed on top of all handler methods diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 61028e23fea0..22dedbe5b29a 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1,3 +1,6 @@ +//! +//! Management HTTP API +//! use std::collections::HashMap; use std::sync::Arc; @@ -46,7 +49,6 @@ use utils::{ }; // Imports only used for testing APIs -#[cfg(feature = "testing")] use super::models::ConfigureFailpointsRequest; struct State { @@ -290,13 +292,19 @@ fn build_timeline_info_common( } // healthcheck handler -async fn status_handler(request: Request) -> Result, ApiError> { +async fn status_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { check_permission(&request, None)?; let config = get_config(&request); json_response(StatusCode::OK, StatusResponse { id: config.id }) } -async fn timeline_create_handler(mut request: Request) -> Result, ApiError> { +async fn timeline_create_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let request_data: TimelineCreateRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_id))?; @@ -332,7 +340,10 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result, ApiError> { +async fn timeline_list_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; @@ -366,7 +377,10 @@ async fn timeline_list_handler(request: Request) -> Result, json_response(StatusCode::OK, response_data) } -async fn timeline_detail_handler(request: Request) -> Result, ApiError> { +async fn timeline_detail_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size: Option = @@ -400,7 +414,10 @@ async fn timeline_detail_handler(request: Request) -> Result) -> Result, ApiError> { +async fn get_lsn_by_timestamp_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, 
"tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -424,7 +441,10 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result) -> Result, ApiError> { +async fn tenant_attach_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -460,7 +480,10 @@ async fn tenant_attach_handler(mut request: Request) -> Result) -> Result, ApiError> { +async fn timeline_delete_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -474,7 +497,10 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, ApiError> { +async fn tenant_detach_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let detach_ignored: Option = parse_query_param(&request, "detach_ignored")?; @@ -488,7 +514,10 @@ async fn tenant_detach_handler(request: Request) -> Result, json_response(StatusCode::OK, ()) } -async fn tenant_load_handler(request: Request) -> Result, ApiError> { +async fn tenant_load_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -508,7 +537,10 @@ async fn tenant_load_handler(request: Request) -> Result, A json_response(StatusCode::ACCEPTED, ()) } -async fn tenant_ignore_handler(request: Request) -> Result, ApiError> { +async fn tenant_ignore_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -521,7 +553,10 @@ async fn tenant_ignore_handler(request: Request) -> Result, json_response(StatusCode::OK, ()) } -async fn tenant_list_handler(request: Request) -> Result, ApiError> { +async fn tenant_list_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { check_permission(&request, None)?; let response_data = mgr::list_tenants() @@ -541,7 +576,10 @@ async fn tenant_list_handler(request: Request) -> Result, A json_response(StatusCode::OK, response_data) } -async fn tenant_status(request: Request) -> Result, ApiError> { +async fn tenant_status( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -581,7 +619,10 @@ async fn tenant_status(request: Request) -> Result, ApiErro /// Note: we don't update the cached size and prometheus metric here. /// The retention period might be different, and it's nice to have a method to just calculate it /// without modifying anything anyway. 
-async fn tenant_size_handler(request: Request) -> Result, ApiError> { +async fn tenant_size_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let inputs_only: Option = parse_query_param(&request, "inputs_only")?; @@ -646,7 +687,10 @@ async fn tenant_size_handler(request: Request) -> Result, A ) } -async fn layer_map_info_handler(request: Request) -> Result, ApiError> { +async fn layer_map_info_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let reset: LayerAccessStatsReset = @@ -660,7 +704,10 @@ async fn layer_map_info_handler(request: Request) -> Result json_response(StatusCode::OK, layer_map_info) } -async fn layer_download_handler(request: Request) -> Result, ApiError> { +async fn layer_download_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -683,7 +730,10 @@ async fn layer_download_handler(request: Request) -> Result } } -async fn evict_timeline_layer_handler(request: Request) -> Result, ApiError> { +async fn evict_timeline_layer_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -761,7 +811,10 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } -async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { +async fn tenant_create_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let request_data: TenantCreateRequest = json_request(&mut request).await?; let target_tenant_id = request_data.new_tenant_id; check_permission(&request, None)?; @@ -808,7 +861,10 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result, ApiError> { +async fn get_tenant_config_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -834,6 +890,7 @@ async fn get_tenant_config_handler(request: Request) -> Result, + _cancel: CancellationToken, ) -> Result, ApiError> { let request_data: TenantConfigRequest = json_request(&mut request).await?; let tenant_id = request_data.tenant_id; @@ -851,8 +908,10 @@ async fn update_tenant_config_handler( } /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`]. 
-#[cfg(feature = "testing")] -async fn handle_tenant_break(r: Request) -> Result, ApiError> { +async fn handle_tenant_break( + r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) @@ -864,8 +923,10 @@ async fn handle_tenant_break(r: Request) -> Result, ApiErro json_response(StatusCode::OK, ()) } -#[cfg(feature = "testing")] -async fn failpoints_handler(mut request: Request) -> Result, ApiError> { +async fn failpoints_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { if !fail::has_failpoints() { return Err(ApiError::BadRequest(anyhow!( "Cannot manage failpoints because pageserver was compiled without failpoints support" @@ -898,7 +959,10 @@ async fn failpoints_handler(mut request: Request) -> Result } // Run GC immediately on given timeline. -async fn timeline_gc_handler(mut request: Request) -> Result, ApiError> { +async fn timeline_gc_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -917,8 +981,10 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result, ApiError> { +async fn timeline_compact_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -939,8 +1005,10 @@ async fn timeline_compact_handler(request: Request) -> Result) -> Result, ApiError> { +async fn timeline_checkpoint_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_id))?; @@ -964,6 +1032,7 @@ async fn timeline_checkpoint_handler(request: Request) -> Result, + _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -979,6 +1048,7 @@ async fn timeline_download_remote_layers_handler_post( async fn timeline_download_remote_layers_handler_get( request: Request, + _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; @@ -1002,7 +1072,10 @@ async fn active_timeline_of_active_tenant( .map_err(ApiError::NotFound) } -async fn always_panic_handler(req: Request) -> Result, ApiError> { +async fn always_panic_handler( + req: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook(). // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it. // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic. 
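As the commit message notes, none of these handlers use their new CancellationToken argument yet, but with this signature one now could. The following is a hypothetical sketch, not code from the patch: the `do_slow_work` helper and the exact error returned on cancellation are invented for illustration, while `cancelled()`, `check_permission`, `json_response` and `ApiError` come from the surrounding code.

```rust
async fn example_handler(
    request: Request<Body>,
    cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
    check_permission(&request, None)?;
    tokio::select! {
        // Signalled by api_handler's drop guard when the client disconnects.
        _ = cancel.cancelled() => {
            Err(ApiError::InternalServerError(anyhow!("request was cancelled")))
        }
        // Hypothetical long-running work returning something serializable.
        result = do_slow_work() => json_response(StatusCode::OK, result?),
    }
}
```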
@@ -1013,7 +1086,10 @@ async fn always_panic_handler(req: Request) -> Result, ApiE json_response(StatusCode::NO_CONTENT, ()) } -async fn disk_usage_eviction_run(mut r: Request) -> Result, ApiError> { +async fn disk_usage_eviction_run( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { check_permission(&r, None)?; #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] @@ -1103,8 +1179,10 @@ async fn handler_404(_: Request) -> Result, ApiError> { ) } -#[cfg(feature = "testing")] -async fn post_tracing_event_handler(mut r: Request) -> Result, ApiError> { +async fn post_tracing_event_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { #[derive(Debug, serde::Deserialize)] #[serde(rename_all = "lowercase")] enum Level { @@ -1134,6 +1212,85 @@ async fn post_tracing_event_handler(mut r: Request) -> Result(request: Request, handler: H) -> Result, ApiError> +where + R: std::future::Future, ApiError>> + Send + 'static, + H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, +{ + // Spawn a new task to handle the request, to protect the handler from unexpected + // async cancellations. Most pageserver functions are not async cancellation safe. + // We arm a drop-guard, so that if Hyper drops the Future, we signal the task + // with the cancellation token. + let token = CancellationToken::new(); + let cancel_guard = token.clone().drop_guard(); + let result = request_span(request, move |r| async { + let handle = tokio::spawn( + async { + let token_cloned = token.clone(); + let result = handler(r, token).await; + if token_cloned.is_cancelled() { + info!("Cancelled request finished"); + } + result + } + .in_current_span(), + ); + + match handle.await { + Ok(result) => result, + Err(e) => { + // The handler task panicked. We have a global panic handler that logs the + // panic with its backtrace, so no need to log that here. Only log a brief + // message to make it clear that we returned the error to the client. + error!("HTTP request handler task panicked: {e:#}"); + + // Don't return an Error here, because then fallback error handler that was + // installed in make_router() will print the error. Instead, construct the + // HTTP error response and return that. + Ok( + ApiError::InternalServerError(anyhow!("HTTP request handler task panicked")) + .into_response(), + ) + } + } + }) + .await; + + cancel_guard.disarm(); + + result +} + +/// Like api_handler, but returns an error response if the server is built without +/// the 'testing' feature. +async fn testing_api_handler( + desc: &str, + request: Request, + handler: H, +) -> Result, ApiError> +where + R: std::future::Future, ApiError>> + Send + 'static, + H: FnOnce(Request, CancellationToken) -> R + Send + Sync + 'static, +{ + if cfg!(feature = "testing") { + api_handler(request, handler).await + } else { + std::future::ready(Err(ApiError::BadRequest(anyhow!( + "Cannot {desc} because pageserver was compiled without testing APIs", + )))) + .await + } +} + pub fn make_router( conf: &'static PageServerConf, launch_ts: &'static LaunchTimestamp, @@ -1163,26 +1320,6 @@ pub fn make_router( .expect("construct launch timestamp header middleware"), ); - macro_rules! testing_api { - ($handler_desc:literal, $handler:path $(,)?) 
=> {{ - #[cfg(not(feature = "testing"))] - async fn cfg_disabled(_req: Request) -> Result, ApiError> { - Err(ApiError::BadRequest(anyhow!(concat!( - "Cannot ", - $handler_desc, - " because pageserver was compiled without testing APIs", - )))) - } - - #[cfg(feature = "testing")] - let handler = $handler; - #[cfg(not(feature = "testing"))] - let handler = cfg_disabled; - - move |r| request_span(r, handler) - }}; - } - Ok(router .data(Arc::new( State::new( @@ -1194,92 +1331,88 @@ pub fn make_router( ) .context("Failed to initialize router state")?, )) - .get("/v1/status", |r| request_span(r, status_handler)) - .put( - "/v1/failpoints", - testing_api!("manage failpoints", failpoints_handler), - ) - .get("/v1/tenant", |r| request_span(r, tenant_list_handler)) - .post("/v1/tenant", |r| request_span(r, tenant_create_handler)) - .get("/v1/tenant/:tenant_id", |r| request_span(r, tenant_status)) + .get("/v1/status", |r| api_handler(r, status_handler)) + .put("/v1/failpoints", |r| { + testing_api_handler("manage failpoints", r, failpoints_handler) + }) + .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) + .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) + .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) .get("/v1/tenant/:tenant_id/synthetic_size", |r| { - request_span(r, tenant_size_handler) + api_handler(r, tenant_size_handler) }) .put("/v1/tenant/config", |r| { - request_span(r, update_tenant_config_handler) + api_handler(r, update_tenant_config_handler) }) .get("/v1/tenant/:tenant_id/config", |r| { - request_span(r, get_tenant_config_handler) + api_handler(r, get_tenant_config_handler) }) .get("/v1/tenant/:tenant_id/timeline", |r| { - request_span(r, timeline_list_handler) + api_handler(r, timeline_list_handler) }) .post("/v1/tenant/:tenant_id/timeline", |r| { - request_span(r, timeline_create_handler) + api_handler(r, timeline_create_handler) }) .post("/v1/tenant/:tenant_id/attach", |r| { - request_span(r, tenant_attach_handler) + api_handler(r, tenant_attach_handler) }) .post("/v1/tenant/:tenant_id/detach", |r| { - request_span(r, tenant_detach_handler) + api_handler(r, tenant_detach_handler) }) .post("/v1/tenant/:tenant_id/load", |r| { - request_span(r, tenant_load_handler) + api_handler(r, tenant_load_handler) }) .post("/v1/tenant/:tenant_id/ignore", |r| { - request_span(r, tenant_ignore_handler) + api_handler(r, tenant_ignore_handler) }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - request_span(r, timeline_detail_handler) + api_handler(r, timeline_detail_handler) }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", - |r| request_span(r, get_lsn_by_timestamp_handler), + |r| api_handler(r, get_lsn_by_timestamp_handler), ) .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| { - request_span(r, timeline_gc_handler) + api_handler(r, timeline_gc_handler) + }) + .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| { + testing_api_handler("run timeline compaction", r, timeline_compact_handler) }) - .put( - "/v1/tenant/:tenant_id/timeline/:timeline_id/compact", - testing_api!("run timeline compaction", timeline_compact_handler), - ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", - testing_api!("run timeline checkpoint", timeline_checkpoint_handler), + |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), ) .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - |r| request_span(r, timeline_download_remote_layers_handler_post), + |r| 
api_handler(r, timeline_download_remote_layers_handler_post), ) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", - |r| request_span(r, timeline_download_remote_layers_handler_get), + |r| api_handler(r, timeline_download_remote_layers_handler_get), ) .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { - request_span(r, timeline_delete_handler) + api_handler(r, timeline_delete_handler) }) .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| { - request_span(r, layer_map_info_handler) + api_handler(r, layer_map_info_handler) }) .get( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - |r| request_span(r, layer_download_handler), + |r| api_handler(r, layer_download_handler), ) .delete( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", - |r| request_span(r, evict_timeline_layer_handler), + |r| api_handler(r, evict_timeline_layer_handler), ) .put("/v1/disk_usage_eviction/run", |r| { - request_span(r, disk_usage_eviction_run) + api_handler(r, disk_usage_eviction_run) + }) + .put("/v1/tenant/:tenant_id/break", |r| { + testing_api_handler("set tenant state to broken", r, handle_tenant_break) + }) + .get("/v1/panic", |r| api_handler(r, always_panic_handler)) + .post("/v1/tracing/event", |r| { + testing_api_handler("emit a tracing event", r, post_tracing_event_handler) }) - .put( - "/v1/tenant/:tenant_id/break", - testing_api!("set tenant state to broken", handle_tenant_break), - ) - .get("/v1/panic", |r| request_span(r, always_panic_handler)) - .post( - "/v1/tracing/event", - testing_api!("emit a tracing event", post_tracing_event_handler), - ) .any(handler_404)) } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index d3cd91403708..4318749777f7 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -779,7 +779,6 @@ pub async fn immediate_gc( Ok(wait_task_done) } -#[cfg(feature = "testing")] pub async fn immediate_compact( tenant_id: TenantId, timeline_id: TimelineId, diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 99bf4002079f..1e15a8e7cb23 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -437,12 +437,22 @@ def got_hangup_log_message(): wait_until(50, 0.1, got_hangup_log_message) - # ok, retry without failpoint, it should succeed + # check that the timeline is still present + ps_http.timeline_detail(env.initial_tenant, child_timeline_id) + + # ok, disable the failpoint to let the deletion finish ps_http.configure_failpoints((failpoint_name, "off")) - # this should succeed - ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2) - # the second call will try to transition the timeline into Stopping state, but it's already in that state - env.pageserver.allowed_errors.append( - f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping" - ) + def first_request_finished(): + message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" + assert env.pageserver.log_contains(message) + + wait_until(50, 0.1, first_request_finished) + + # check that the timeline is gone + notfound_message = f"Timeline {env.initial_tenant}/{child_timeline_id} was not found" + env.pageserver.allowed_errors.append(".*" + notfound_message) + with pytest.raises(PageserverApiException, match=notfound_message) as exc: + ps_http.timeline_detail(env.initial_tenant, child_timeline_id) + + assert exc.value.status_code == 404 From 
a55c6638489ae545245c00253f94fae3c256ab12 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 2 Jun 2023 21:03:12 +0300 Subject: [PATCH 42/59] chore: comment marker fixes (#4406) Upgrading to rust 1.70 will require these. --- proxy/src/console.rs | 4 ++-- proxy/src/proxy/tests.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/proxy/src/console.rs b/proxy/src/console.rs index 1f3ef995555c..0e5eaaf845c7 100644 --- a/proxy/src/console.rs +++ b/proxy/src/console.rs @@ -1,5 +1,5 @@ -///! Various stuff for dealing with the Neon Console. -///! Later we might move some API wrappers here. +//! Various stuff for dealing with the Neon Console. +//! Later we might move some API wrappers here. /// Payloads used in the console's APIs. pub mod messages; diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 60acb588dc24..3373c4967611 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,4 +1,4 @@ -///! A group of high-level tests for connection establishing logic and auth. +//! A group of high-level tests for connection establishing logic and auth. use super::*; use crate::{auth, sasl, scram}; use async_trait::async_trait; From 4ba950a35a9840ca1043e5e10d1ed62fed2d1877 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Fri, 2 Jun 2023 18:07:45 -0400 Subject: [PATCH 43/59] Add libcurl as dependency to readme (#4405) --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8e6f2cda81a7..efa714e5be23 100644 --- a/README.md +++ b/README.md @@ -28,18 +28,19 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati * On Ubuntu or Debian, this set of packages should be sufficient to build the code: ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ -libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler +libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ +libcurl4-openssl-dev ``` * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ - protobuf-devel + protobuf-devel libcurl-devel ``` * On Arch based systems, these packages are needed: ```bash pacman -S base-devel readline zlib libseccomp openssl clang \ -postgresql-libs cmake postgresql protobuf +postgresql-libs cmake postgresql protobuf curl ``` Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases). From 04542826be2c71a955a3b9fd7d1bd19ccab1745c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 4 Jun 2023 11:41:38 +0300 Subject: [PATCH 44/59] Add HNSW extension (#4227) ## Describe your changes Port HNSW implementation for ANN search top Postgres ## Issue ticket number and link https://www.pinecone.io/learn/hnsw ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- Dockerfile.compute-node | 4 + Makefile | 8 + pgxn/hnsw/Makefile | 26 ++ pgxn/hnsw/README.md | 25 ++ pgxn/hnsw/hnsw--0.1.0.sql | 29 ++ pgxn/hnsw/hnsw.c | 551 ++++++++++++++++++++++++++++++++ pgxn/hnsw/hnsw.control | 5 + pgxn/hnsw/hnsw.h | 15 + pgxn/hnsw/hnswalg.cpp | 368 +++++++++++++++++++++ pgxn/hnsw/hnswalg.h | 67 ++++ pgxn/hnsw/test/expected/knn.out | 28 ++ pgxn/hnsw/test/sql/knn.sql | 13 + 12 files changed, 1139 insertions(+) create mode 100644 pgxn/hnsw/Makefile create mode 100644 pgxn/hnsw/README.md create mode 100644 pgxn/hnsw/hnsw--0.1.0.sql create mode 100644 pgxn/hnsw/hnsw.c create mode 100644 pgxn/hnsw/hnsw.control create mode 100644 pgxn/hnsw/hnsw.h create mode 100644 pgxn/hnsw/hnswalg.cpp create mode 100644 pgxn/hnsw/hnswalg.h create mode 100644 pgxn/hnsw/test/expected/knn.out create mode 100644 pgxn/hnsw/test/sql/knn.sql diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index de8a904c02ea..8446ef9fa0d0 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -556,6 +556,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ make -j $(getconf _NPROCESSORS_ONLN) \ PG_CONFIG=/usr/local/pgsql/bin/pg_config \ -C pgxn/neon_utils \ + -s install && \ + make -j $(getconf _NPROCESSORS_ONLN) \ + PG_CONFIG=/usr/local/pgsql/bin/pg_config \ + -C pgxn/hnsw \ -s install ######################################################################################### diff --git a/Makefile b/Makefile index 9d78c5d0fca9..ae979b8b4cff 100644 --- a/Makefile +++ b/Makefile @@ -138,6 +138,11 @@ neon-pg-ext-%: postgres-% $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install + +@echo "Compiling hnsw $*" + mkdir -p $(POSTGRES_INSTALL_DIR)/build/hnsw-$* + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \ + -C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile install .PHONY: neon-pg-ext-clean-% neon-pg-ext-clean-%: @@ -153,6 +158,9 @@ neon-pg-ext-clean-%: $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ -C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \ -f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean + $(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \ + -C $(POSTGRES_INSTALL_DIR)/build/hnsw-$* \ + -f $(ROOT_PROJECT_DIR)/pgxn/hnsw/Makefile clean .PHONY: neon-pg-ext neon-pg-ext: \ diff --git a/pgxn/hnsw/Makefile b/pgxn/hnsw/Makefile new file mode 100644 index 000000000000..9bdd87430c3b --- /dev/null +++ b/pgxn/hnsw/Makefile @@ -0,0 +1,26 @@ +EXTENSION = hnsw +EXTVERSION = 0.1.0 + +MODULE_big = hnsw +DATA = $(wildcard *--*.sql) +OBJS = hnsw.o hnswalg.o + +TESTS = $(wildcard test/sql/*.sql) +REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) +REGRESS_OPTS = --inputdir=test --load-extension=hnsw + +# For auto-vectorization: +# - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html +PG_CFLAGS += -O3 +PG_CPPFLAGS += -msse4.1 -O3 -march=native -ftree-vectorize -ftree-vectorizer-verbose=0 +PG_LDFLAGS += -lstdc++ + +all: $(EXTENSION)--$(EXTVERSION).sql + +PG_CONFIG ?= pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) + +dist: + mkdir -p dist + git archive --format zip --prefix=$(EXTENSION)-$(EXTVERSION)/ --output dist/$(EXTENSION)-$(EXTVERSION).zip master diff --git a/pgxn/hnsw/README.md 
b/pgxn/hnsw/README.md new file mode 100644 index 000000000000..bc9c8d571c11 --- /dev/null +++ b/pgxn/hnsw/README.md @@ -0,0 +1,25 @@ +# Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors + +This ANN extension of Postgres is based +on the [ivf-hnsw](https://github.com/dbaranchuk/ivf-hnsw.git) implementation of [HNSW](https://www.pinecone.io/learn/hnsw), +the code for the current state-of-the-art billion-scale nearest neighbor search system presented in the paper: + +[Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors](http://openaccess.thecvf.com/content_ECCV_2018/html/Dmitry_Baranchuk_Revisiting_the_Inverted_ECCV_2018_paper.html),
+Dmitry Baranchuk, Artem Babenko, Yury Malkov + +# Postgres extension + +The HNSW index is held in memory (built on demand) and its maximal size is limited +by the `maxelements` index parameter. Another required parameter is the number of dimensions (if it is not specified in the column type). +The optional parameter `ef` specifies the number of neighbors which are considered during index construction and search (corresponding to the `efConstruction` and `efSearch` parameters +described in the article). + +# Example of usage: + +``` +create extension hnsw; +create table embeddings(id integer primary key, payload real[]); +create index on embeddings using hnsw(payload) with (maxelements=1000000, dims=100, m=32); +select id from embeddings order by payload <-> array[1.0, 2.0,...] limit 100; +``` \ No newline at end of file diff --git a/pgxn/hnsw/hnsw--0.1.0.sql b/pgxn/hnsw/hnsw--0.1.0.sql new file mode 100644 index 000000000000..ebf424326db8 --- /dev/null +++ b/pgxn/hnsw/hnsw--0.1.0.sql @@ -0,0 +1,29 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION hnsw" to load this file. \quit + +-- functions + +CREATE FUNCTION l2_distance(real[], real[]) RETURNS real + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- operators + +CREATE OPERATOR <-> ( + LEFTARG = real[], RIGHTARG = real[], PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +-- access method + +CREATE FUNCTION hnsw_handler(internal) RETURNS index_am_handler + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE ACCESS METHOD hnsw TYPE INDEX HANDLER hnsw_handler; + +COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method'; + +-- opclasses + +CREATE OPERATOR CLASS knn_ops + DEFAULT FOR TYPE real[] USING hnsw AS + OPERATOR 1 <-> (real[], real[]) FOR ORDER BY float_ops; diff --git a/pgxn/hnsw/hnsw.c b/pgxn/hnsw/hnsw.c new file mode 100644 index 000000000000..434f4986f8c4 --- /dev/null +++ b/pgxn/hnsw/hnsw.c @@ -0,0 +1,551 @@ +#include "postgres.h" + +#include "access/amapi.h" +#include "access/generic_xlog.h" +#include "access/relation.h" +#include "access/reloptions.h" +#include "access/tableam.h" +#include "catalog/index.h" +#include "commands/vacuum.h" +#include "nodes/execnodes.h" +#include "storage/bufmgr.h" +#include "utils/guc.h" +#include "utils/selfuncs.h" + +#include +#include + +#include "hnsw.h" + +PG_MODULE_MAGIC; + +typedef struct { + int32 vl_len_; /* varlena header (do not touch directly!)
*/ + int dims; + int maxelements; + int efConstruction; + int efSearch; + int M; +} HnswOptions; + +static relopt_kind hnsw_relopt_kind; + +typedef struct { + HierarchicalNSW* hnsw; + size_t curr; + size_t n_results; + ItemPointer results; +} HnswScanOpaqueData; + +typedef HnswScanOpaqueData* HnswScanOpaque; + +typedef struct { + Oid relid; + uint32 status; + HierarchicalNSW* hnsw; +} HnswHashEntry; + + +#define SH_PREFIX hnsw_index +#define SH_ELEMENT_TYPE HnswHashEntry +#define SH_KEY_TYPE Oid +#define SH_KEY relid +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) ((a)->relid) +#define SH_HASH_KEY(tb, key) (key) +#define SH_EQUAL(tb, a, b) ((a) == (b)) +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + +#define INDEX_HASH_SIZE 11 + +#define DEFAULT_EF_SEARCH 64 + +PGDLLEXPORT void _PG_init(void); + +static hnsw_index_hash *hnsw_indexes; + +/* + * Initialize index options and variables + */ +void +_PG_init(void) +{ + hnsw_relopt_kind = add_reloption_kind(); + add_int_reloption(hnsw_relopt_kind, "dims", "Number of dimensions", + 0, 0, INT_MAX, AccessExclusiveLock); + add_int_reloption(hnsw_relopt_kind, "maxelements", "Maximal number of elements", + 0, 0, INT_MAX, AccessExclusiveLock); + add_int_reloption(hnsw_relopt_kind, "m", "Number of neighbors of each vertex", + 100, 0, INT_MAX, AccessExclusiveLock); + add_int_reloption(hnsw_relopt_kind, "efconstruction", "Number of inspected neighbors during index construction", + 16, 1, INT_MAX, AccessExclusiveLock); + add_int_reloption(hnsw_relopt_kind, "efsearch", "Number of inspected neighbors during index search", + 64, 1, INT_MAX, AccessExclusiveLock); + hnsw_indexes = hnsw_index_create(TopMemoryContext, INDEX_HASH_SIZE, NULL); +} + + +static void +hnsw_build_callback(Relation index, ItemPointer tid, Datum *values, + bool *isnull, bool tupleIsAlive, void *state) +{ + HierarchicalNSW* hnsw = (HierarchicalNSW*) state; + ArrayType* array; + int n_items; + label_t label = 0; + + /* Skip nulls */ + if (isnull[0]) + return; + + array = DatumGetArrayTypeP(values[0]); + n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + if (n_items != hnsw_dimensions(hnsw)) + { + elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", + n_items, hnsw_dimensions(hnsw)); + } + + memcpy(&label, tid, sizeof(*tid)); + hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label); +} + +static void +hnsw_populate(HierarchicalNSW* hnsw, Relation indexRel, Relation heapRel) +{ + IndexInfo* indexInfo = BuildIndexInfo(indexRel); + Assert(indexInfo->ii_NumIndexAttrs == 1); + table_index_build_scan(heapRel, indexRel, indexInfo, + true, true, hnsw_build_callback, (void *) hnsw, NULL); +} + +static HierarchicalNSW* +hnsw_get_index(Relation indexRel, Relation heapRel) +{ + HierarchicalNSW* hnsw; + Oid indexoid = RelationGetRelid(indexRel); + HnswHashEntry* entry = hnsw_index_lookup(hnsw_indexes, indexoid); + if (entry == NULL) + { + size_t dims, maxelements; + size_t M; + size_t maxM; + size_t size_links_level0; + size_t size_data_per_element; + size_t data_size; + dsm_handle handle = indexoid << 1; /* make it even */ + void* impl_private = NULL; + void* mapped_address = NULL; + Size mapped_size = 0; + Size shmem_size; + bool exists = true; + bool found; + HnswOptions *opts = (HnswOptions *) indexRel->rd_options; + if (opts == NULL || opts->maxelements == 0 || opts->dims == 0) { + elog(ERROR, "HNSW index requires 'maxelements' and 'dims' to be specified"); + } + dims = opts->dims; + maxelements = 
opts->maxelements; + M = opts->M; + maxM = M * 2; + data_size = dims * sizeof(coord_t); + size_links_level0 = (maxM + 1) * sizeof(idx_t); + size_data_per_element = size_links_level0 + data_size + sizeof(label_t); + shmem_size = hnsw_sizeof() + maxelements * size_data_per_element; + + /* first try to attach to existed index */ + if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, + &mapped_address, &mapped_size, DEBUG1)) + { + /* index doesn't exists: try to create it */ + if (!dsm_impl_op(DSM_OP_CREATE, handle, shmem_size, &impl_private, + &mapped_address, &mapped_size, DEBUG1)) + { + /* We can do it under shared lock, so some other backend may + * try to initialize index. If create is failed because index already + * created by somebody else, then try to attach to it once again + */ + if (!dsm_impl_op(DSM_OP_ATTACH, handle, 0, &impl_private, + &mapped_address, &mapped_size, ERROR)) + { + return NULL; + } + } + else + { + exists = false; + } + } + Assert(mapped_size == shmem_size); + hnsw = (HierarchicalNSW*)mapped_address; + + if (!exists) + { + hnsw_init(hnsw, dims, maxelements, M, maxM, opts->efConstruction); + hnsw_populate(hnsw, indexRel, heapRel); + } + entry = hnsw_index_insert(hnsw_indexes, indexoid, &found); + Assert(!found); + entry->hnsw = hnsw; + } + else + { + hnsw = entry->hnsw; + } + return hnsw; +} + +/* + * Start or restart an index scan + */ +static IndexScanDesc +hnsw_beginscan(Relation index, int nkeys, int norderbys) +{ + IndexScanDesc scan = RelationGetIndexScan(index, nkeys, norderbys); + HnswScanOpaque so = (HnswScanOpaque) palloc(sizeof(HnswScanOpaqueData)); + Relation heap = relation_open(index->rd_index->indrelid, NoLock); + so->hnsw = hnsw_get_index(index, heap); + relation_close(heap, NoLock); + so->curr = 0; + so->n_results = 0; + so->results = NULL; + scan->opaque = so; + return scan; +} + +/* + * Start or restart an index scan + */ +static void +hnsw_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys) +{ + HnswScanOpaque so = (HnswScanOpaque) scan->opaque; + if (so->results) + { + pfree(so->results); + so->results = NULL; + } + so->curr = 0; + if (orderbys && scan->numberOfOrderBys > 0) + memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); +} + +/* + * Fetch the next tuple in the given scan + */ +static bool +hnsw_gettuple(IndexScanDesc scan, ScanDirection dir) +{ + HnswScanOpaque so = (HnswScanOpaque) scan->opaque; + + /* + * Index can be used to scan backward, but Postgres doesn't support + * backward scan on operators + */ + Assert(ScanDirectionIsForward(dir)); + + if (so->curr == 0) + { + Datum value; + ArrayType* array; + int n_items; + size_t n_results; + label_t* results; + HnswOptions *opts = (HnswOptions *) scan->indexRelation->rd_options; + size_t efSearch = opts ? 
opts->efSearch : DEFAULT_EF_SEARCH; + + /* Safety check */ + if (scan->orderByData == NULL) + elog(ERROR, "cannot scan HNSW index without order"); + + /* No items will match if null */ + if (scan->orderByData->sk_flags & SK_ISNULL) + return false; + + value = scan->orderByData->sk_argument; + array = DatumGetArrayTypeP(value); + n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + if (n_items != hnsw_dimensions(so->hnsw)) + { + elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", + n_items, hnsw_dimensions(so->hnsw)); + } + + if (!hnsw_search(so->hnsw, (coord_t*)ARR_DATA_PTR(array), efSearch, &n_results, &results)) + elog(ERROR, "HNSW index search failed"); + so->results = (ItemPointer)palloc(n_results*sizeof(ItemPointerData)); + so->n_results = n_results; + for (size_t i = 0; i < n_results; i++) + { + memcpy(&so->results[i], &results[i], sizeof(so->results[i])); + } + free(results); + } + if (so->curr >= so->n_results) + { + return false; + } + else + { + scan->xs_heaptid = so->results[so->curr++]; + scan->xs_recheckorderby = false; + return true; + } +} + +/* + * End a scan and release resources + */ +static void +hnsw_endscan(IndexScanDesc scan) +{ + HnswScanOpaque so = (HnswScanOpaque) scan->opaque; + if (so->results) + pfree(so->results); + pfree(so); + scan->opaque = NULL; +} + + +/* + * Estimate the cost of an index scan + */ +static void +hnsw_costestimate(PlannerInfo *root, IndexPath *path, double loop_count, + Cost *indexStartupCost, Cost *indexTotalCost, + Selectivity *indexSelectivity, double *indexCorrelation + ,double *indexPages +) +{ + GenericCosts costs; + + /* Never use index without order */ + if (path->indexorderbys == NULL) + { + *indexStartupCost = DBL_MAX; + *indexTotalCost = DBL_MAX; + *indexSelectivity = 0; + *indexCorrelation = 0; + *indexPages = 0; + return; + } + + MemSet(&costs, 0, sizeof(costs)); + + genericcostestimate(root, path, loop_count, &costs); + + /* Startup cost and total cost are same */ + *indexStartupCost = costs.indexTotalCost; + *indexTotalCost = costs.indexTotalCost; + *indexSelectivity = costs.indexSelectivity; + *indexCorrelation = costs.indexCorrelation; + *indexPages = costs.numIndexPages; +} + +/* + * Parse and validate the reloptions + */ +static bytea * +hnsw_options(Datum reloptions, bool validate) +{ + static const relopt_parse_elt tab[] = { + {"dims", RELOPT_TYPE_INT, offsetof(HnswOptions, dims)}, + {"maxelements", RELOPT_TYPE_INT, offsetof(HnswOptions, maxelements)}, + {"efconstruction", RELOPT_TYPE_INT, offsetof(HnswOptions, efConstruction)}, + {"efsearch", RELOPT_TYPE_INT, offsetof(HnswOptions, efSearch)}, + {"m", RELOPT_TYPE_INT, offsetof(HnswOptions, M)} + }; + + return (bytea *) build_reloptions(reloptions, validate, + hnsw_relopt_kind, + sizeof(HnswOptions), + tab, lengthof(tab)); +} + +/* + * Validate catalog entries for the specified operator class + */ +static bool +hnsw_validate(Oid opclassoid) +{ + return true; +} + +/* + * Build the index for a logged table + */ +static IndexBuildResult * +hnsw_build(Relation heap, Relation index, IndexInfo *indexInfo) +{ + HierarchicalNSW* hnsw = hnsw_get_index(index, heap); + IndexBuildResult* result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); + result->heap_tuples = result->index_tuples = hnsw_count(hnsw); + + return result; +} + +/* + * Insert a tuple into the index + */ +static bool +hnsw_insert(Relation index, Datum *values, bool *isnull, ItemPointer heap_tid, + Relation heap, IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo 
*indexInfo) +{ + HierarchicalNSW* hnsw = hnsw_get_index(index, heap); + Datum value; + ArrayType* array; + int n_items; + label_t label = 0; + + /* Skip nulls */ + if (isnull[0]) + return false; + + /* Detoast value */ + value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); + array = DatumGetArrayTypeP(value); + n_items = ArrayGetNItems(ARR_NDIM(array), ARR_DIMS(array)); + if (n_items != hnsw_dimensions(hnsw)) + { + elog(ERROR, "Wrong number of dimensions: %d instead of %d expected", + n_items, hnsw_dimensions(hnsw)); + } + memcpy(&label, heap_tid, sizeof(*heap_tid)); + if (!hnsw_add_point(hnsw, (coord_t*)ARR_DATA_PTR(array), label)) + elog(ERROR, "HNSW index insert failed"); + return true; +} + +/* + * Build the index for an unlogged table + */ +static void +hnsw_buildempty(Relation index) +{ + /* index will be constructed on dema nd when accessed */ +} + +/* + * Clean up after a VACUUM operation + */ +static IndexBulkDeleteResult * +hnsw_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + Relation rel = info->index; + + if (stats == NULL) + return NULL; + + stats->num_pages = RelationGetNumberOfBlocks(rel); + + return stats; +} + +/* + * Bulk delete tuples from the index + */ +static IndexBulkDeleteResult * +hnsw_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + return stats; +} + +/* + * Define index handler + * + * See https://www.postgresql.org/docs/current/index-api.html + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(hnsw_handler); +Datum +hnsw_handler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 0; + amroutine->amsupport = 0; + amroutine->amoptsprocnum = 0; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = true; + amroutine->amcanbackward = false; /* can change direction mid-scan */ + amroutine->amcanunique = false; + amroutine->amcanmulticol = false; + amroutine->amoptionalkey = true; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = false; + amroutine->amstorage = false; + amroutine->amclusterable = false; + amroutine->ampredlocks = false; + amroutine->amcanparallel = false; + amroutine->amcaninclude = false; + amroutine->amusemaintenanceworkmem = false; /* not used during VACUUM */ + amroutine->amparallelvacuumoptions = VACUUM_OPTION_PARALLEL_BULKDEL; + amroutine->amkeytype = InvalidOid; + + /* Interface functions */ + amroutine->ambuild = hnsw_build; + amroutine->ambuildempty = hnsw_buildempty; + amroutine->aminsert = hnsw_insert; + amroutine->ambulkdelete = hnsw_bulkdelete; + amroutine->amvacuumcleanup = hnsw_vacuumcleanup; + amroutine->amcanreturn = NULL; /* tuple not included in heapsort */ + amroutine->amcostestimate = hnsw_costestimate; + amroutine->amoptions = hnsw_options; + amroutine->amproperty = NULL; /* TODO AMPROP_DISTANCE_ORDERABLE */ + amroutine->ambuildphasename = NULL; + amroutine->amvalidate = hnsw_validate; + amroutine->amadjustmembers = NULL; + amroutine->ambeginscan = hnsw_beginscan; + amroutine->amrescan = hnsw_rescan; + amroutine->amgettuple = hnsw_gettuple; + amroutine->amgetbitmap = NULL; + amroutine->amendscan = hnsw_endscan; + amroutine->ammarkpos = NULL; + amroutine->amrestrpos = NULL; + + /* Interface functions to support parallel index scans */ + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + 
PG_RETURN_POINTER(amroutine); +} + +/* + * Get the L2 distance between vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(l2_distance); +Datum +l2_distance(PG_FUNCTION_ARGS) +{ + ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); + ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); + int a_dim = ArrayGetNItems(ARR_NDIM(a), ARR_DIMS(a)); + int b_dim = ArrayGetNItems(ARR_NDIM(b), ARR_DIMS(b)); + dist_t distance = 0.0; + dist_t diff; + coord_t *ax = (coord_t*)ARR_DATA_PTR(a); + coord_t *bx = (coord_t*)ARR_DATA_PTR(b); + + if (a_dim != b_dim) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("different array dimensions %d and %d", a_dim, b_dim))); + } + + for (int i = 0; i < a_dim; i++) + { + diff = ax[i] - bx[i]; + distance += diff * diff; + } + + PG_RETURN_FLOAT4((dist_t)sqrt(distance)); +} diff --git a/pgxn/hnsw/hnsw.control b/pgxn/hnsw/hnsw.control new file mode 100644 index 000000000000..b292b960260c --- /dev/null +++ b/pgxn/hnsw/hnsw.control @@ -0,0 +1,5 @@ +comment = 'hNsw index' +default_version = '0.1.0' +module_pathname = '$libdir/hnsw' +relocatable = true +trusted = true diff --git a/pgxn/hnsw/hnsw.h b/pgxn/hnsw/hnsw.h new file mode 100644 index 000000000000..d4065ab8fe52 --- /dev/null +++ b/pgxn/hnsw/hnsw.h @@ -0,0 +1,15 @@ +#pragma once + +typedef float coord_t; +typedef float dist_t; +typedef uint32_t idx_t; +typedef uint64_t label_t; + +typedef struct HierarchicalNSW HierarchicalNSW; + +bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results); +bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label); +void hnsw_init(HierarchicalNSW* hnsw, size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); +int hnsw_dimensions(HierarchicalNSW* hnsw); +size_t hnsw_count(HierarchicalNSW* hnsw); +size_t hnsw_sizeof(void); diff --git a/pgxn/hnsw/hnswalg.cpp b/pgxn/hnsw/hnswalg.cpp new file mode 100644 index 000000000000..226dcbd53d7f --- /dev/null +++ b/pgxn/hnsw/hnswalg.cpp @@ -0,0 +1,368 @@ +#include "hnswalg.h" + + +#if defined(__x86_64__) + +#include +#define USE_AVX +#if defined(__GNUC__) +#define PORTABLE_ALIGN32 __attribute__((aligned(32))) +#else +#define PORTABLE_ALIGN32 __declspec(align(32)) +#endif + +#define PREFETCH(addr,hint) _mm_prefetch(addr, hint) + +#else + +#define PREFETCH(addr,hint) + +#endif + +HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_) +{ + dim = dim_; + data_size = dim * sizeof(coord_t); + + efConstruction = efConstruction_; + + maxelements = maxelements_; + M = M_; + maxM = maxM_; + size_links_level0 = (maxM + 1) * sizeof(idx_t); + size_data_per_element = size_links_level0 + data_size + sizeof(label_t); + offset_data = size_links_level0; + offset_label = offset_data + data_size; + + enterpoint_node = 0; + cur_element_count = 0; + dist_calc = 0; +} + +std::priority_queue> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef) +{ + std::vector visited; + visited.resize((cur_element_count + 31) >> 5); + + std::priority_queue> topResults; + std::priority_queue> candidateSet; + + dist_t dist = fstdistfunc(point, getDataByInternalId(enterpoint_node)); + + topResults.emplace(dist, enterpoint_node); + candidateSet.emplace(-dist, enterpoint_node); + visited[enterpoint_node >> 5] = 1 << (enterpoint_node & 31); + dist_t lowerBound = dist; + + while (!candidateSet.empty()) + { + std::pair curr_el_pair = candidateSet.top(); + if (-curr_el_pair.first > lowerBound) + break; + + candidateSet.pop(); + idx_t 
curNodeNum = curr_el_pair.second; + + idx_t* data = get_linklist0(curNodeNum); + size_t size = *data++; + + PREFETCH(getDataByInternalId(*data), _MM_HINT_T0); + + for (size_t j = 0; j < size; ++j) { + size_t tnum = *(data + j); + + PREFETCH(getDataByInternalId(*(data + j + 1)), _MM_HINT_T0); + + if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) { + visited[tnum >> 5] |= 1 << (tnum & 31); + + dist = fstdistfunc(point, getDataByInternalId(tnum)); + + if (topResults.top().first > dist || topResults.size() < ef) { + candidateSet.emplace(-dist, tnum); + + PREFETCH(get_linklist0(candidateSet.top().second), _MM_HINT_T0); + topResults.emplace(dist, tnum); + + if (topResults.size() > ef) + topResults.pop(); + + lowerBound = topResults.top().first; + } + } + } + } + return topResults; +} + + +void HierarchicalNSW::getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN) +{ + if (topResults.size() < NN) + return; + + std::priority_queue> resultSet; + std::vector> returnlist; + + while (topResults.size() > 0) { + resultSet.emplace(-topResults.top().first, topResults.top().second); + topResults.pop(); + } + + while (resultSet.size()) { + if (returnlist.size() >= NN) + break; + std::pair curen = resultSet.top(); + dist_t dist_to_query = -curen.first; + resultSet.pop(); + bool good = true; + for (std::pair curen2 : returnlist) { + dist_t curdist = fstdistfunc(getDataByInternalId(curen2.second), + getDataByInternalId(curen.second)); + if (curdist < dist_to_query) { + good = false; + break; + } + } + if (good) returnlist.push_back(curen); + } + for (std::pair elem : returnlist) + topResults.emplace(-elem.first, elem.second); +} + +void HierarchicalNSW::mutuallyConnectNewElement(const coord_t *point, idx_t cur_c, + std::priority_queue> topResults) +{ + getNeighborsByHeuristic(topResults, M); + + std::vector res; + res.reserve(M); + while (topResults.size() > 0) { + res.push_back(topResults.top().second); + topResults.pop(); + } + { + idx_t* data = get_linklist0(cur_c); + if (*data) + throw std::runtime_error("Should be blank"); + + *data++ = res.size(); + + for (size_t idx = 0; idx < res.size(); idx++) { + if (data[idx]) + throw std::runtime_error("Should be blank"); + data[idx] = res[idx]; + } + } + for (size_t idx = 0; idx < res.size(); idx++) { + if (res[idx] == cur_c) + throw std::runtime_error("Connection to the same element"); + + size_t resMmax = maxM; + idx_t *ll_other = get_linklist0(res[idx]); + idx_t sz_link_list_other = *ll_other; + + if (sz_link_list_other > resMmax || sz_link_list_other < 0) + throw std::runtime_error("Bad sz_link_list_other"); + + if (sz_link_list_other < resMmax) { + idx_t *data = ll_other + 1; + data[sz_link_list_other] = cur_c; + *ll_other = sz_link_list_other + 1; + } else { + // finding the "weakest" element to replace it with the new one + idx_t *data = ll_other + 1; + dist_t d_max = fstdistfunc(getDataByInternalId(cur_c), getDataByInternalId(res[idx])); + // Heuristic: + std::priority_queue> candidates; + candidates.emplace(d_max, cur_c); + + for (size_t j = 0; j < sz_link_list_other; j++) + candidates.emplace(fstdistfunc(getDataByInternalId(data[j]), getDataByInternalId(res[idx])), data[j]); + + getNeighborsByHeuristic(candidates, resMmax); + + size_t indx = 0; + while (!candidates.empty()) { + data[indx] = candidates.top().second; + candidates.pop(); + indx++; + } + *ll_other = indx; + } + } +} + +void HierarchicalNSW::addPoint(const coord_t *point, label_t label) +{ + if (cur_element_count >= maxelements) { + throw std::runtime_error("The number of 
elements exceeds the specified limit"); + } + idx_t cur_c = cur_element_count++; + memset((char *) get_linklist0(cur_c), 0, size_data_per_element); + memcpy(getDataByInternalId(cur_c), point, data_size); + memcpy(getExternalLabel(cur_c), &label, sizeof label); + + // Do nothing for the first element + if (cur_c != 0) { + std::priority_queue > topResults = searchBaseLayer(point, efConstruction); + mutuallyConnectNewElement(point, cur_c, topResults); + } +}; + +std::priority_queue> HierarchicalNSW::searchKnn(const coord_t *query, size_t k) +{ + std::priority_queue> topResults; + auto topCandidates = searchBaseLayer(query, k); + while (topCandidates.size() > k) { + topCandidates.pop(); + } + while (!topCandidates.empty()) { + std::pair rez = topCandidates.top(); + label_t label; + memcpy(&label, getExternalLabel(rez.second), sizeof(label)); + topResults.push(std::pair(rez.first, label)); + topCandidates.pop(); + } + + return topResults; +}; + +dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y) +{ +#if defined(__x86_64__) + float PORTABLE_ALIGN32 TmpRes[8]; + size_t qty16 = dim >> 4; + const float *pEnd1 = x + (qty16 << 4); +#ifdef USE_AVX + __m256 diff, v1, v2; + __m256 sum = _mm256_set1_ps(0); + + while (x < pEnd1) { + v1 = _mm256_loadu_ps(x); + x += 8; + v2 = _mm256_loadu_ps(y); + y += 8; + diff = _mm256_sub_ps(v1, v2); + sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); + + v1 = _mm256_loadu_ps(x); + x += 8; + v2 = _mm256_loadu_ps(y); + y += 8; + diff = _mm256_sub_ps(v1, v2); + sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); + } + + _mm256_store_ps(TmpRes, sum); + float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; + + return (res); +#else + __m128 diff, v1, v2; + __m128 sum = _mm_set1_ps(0); + + while (x < pEnd1) { + v1 = _mm_loadu_ps(x); + x += 4; + v2 = _mm_loadu_ps(y); + y += 4; + diff = _mm_sub_ps(v1, v2); + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); + + v1 = _mm_loadu_ps(x); + x += 4; + v2 = _mm_loadu_ps(y); + y += 4; + diff = _mm_sub_ps(v1, v2); + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); + + v1 = _mm_loadu_ps(x); + x += 4; + v2 = _mm_loadu_ps(y); + y += 4; + diff = _mm_sub_ps(v1, v2); + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); + + v1 = _mm_loadu_ps(x); + x += 4; + v2 = _mm_loadu_ps(y); + y += 4; + diff = _mm_sub_ps(v1, v2); + sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); + } + _mm_store_ps(TmpRes, sum); + float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + + return (res); +#endif +#else // portable implementation + dist_t distance = 0.0; + size_t n = dim; + + dist_calc++; + + for (size_t i = 0; i < n; i++) + { + dist_t diff = x[i] - y[i]; + distance += diff * diff; + } + return distance; +#endif +} + +bool hnsw_search(HierarchicalNSW* hnsw, const coord_t *point, size_t efSearch, size_t* n_results, label_t** results) +{ + try + { + auto result = hnsw->searchKnn(point, efSearch); + size_t nResults = result.size(); + *results = (label_t*)malloc(nResults*sizeof(label_t)); + for (size_t i = nResults; i-- != 0;) + { + (*results)[i] = result.top().second; + result.pop(); + } + *n_results = nResults; + return true; + } + catch (std::exception& x) + { + return false; + } +} + +bool hnsw_add_point(HierarchicalNSW* hnsw, const coord_t *point, label_t label) +{ + try + { + hnsw->addPoint(point, label); + return true; + } + catch (std::exception& x) + { + fprintf(stderr, "Catch %s\n", x.what()); + return false; + } +} + +void hnsw_init(HierarchicalNSW* hnsw, size_t dims, size_t 
maxelements, size_t M, size_t maxM, size_t efConstruction) +{ + new ((void*)hnsw) HierarchicalNSW(dims, maxelements, M, maxM, efConstruction); +} + + +int hnsw_dimensions(HierarchicalNSW* hnsw) +{ + return (int)hnsw->dim; +} + +size_t hnsw_count(HierarchicalNSW* hnsw) +{ + return hnsw->cur_element_count; +} + +size_t hnsw_sizeof(void) +{ + return sizeof(HierarchicalNSW); +} diff --git a/pgxn/hnsw/hnswalg.h b/pgxn/hnsw/hnswalg.h new file mode 100644 index 000000000000..b845ad2743e3 --- /dev/null +++ b/pgxn/hnsw/hnswalg.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern "C" { +#include "hnsw.h" +} + +struct HierarchicalNSW +{ + size_t maxelements; + size_t cur_element_count; + + idx_t enterpoint_node; + + size_t dist_calc; + + size_t dim; + size_t data_size; + size_t offset_data; + size_t offset_label; + size_t size_data_per_element; + size_t M; + size_t maxM; + size_t size_links_level0; + size_t efConstruction; + + char data_level0_memory[0]; // varying size + + public: + HierarchicalNSW(size_t dim, size_t maxelements, size_t M, size_t maxM, size_t efConstruction); + ~HierarchicalNSW(); + + + inline coord_t *getDataByInternalId(idx_t internal_id) const { + return (coord_t *)&data_level0_memory[internal_id * size_data_per_element + offset_data]; + } + + inline idx_t *get_linklist0(idx_t internal_id) const { + return (idx_t*)&data_level0_memory[internal_id * size_data_per_element]; + } + + inline label_t *getExternalLabel(idx_t internal_id) const { + return (label_t *)&data_level0_memory[internal_id * size_data_per_element + offset_label]; + } + + std::priority_queue> searchBaseLayer(const coord_t *x, size_t ef); + + void getNeighborsByHeuristic(std::priority_queue> &topResults, size_t NN); + + void mutuallyConnectNewElement(const coord_t *x, idx_t id, std::priority_queue> topResults); + + void addPoint(const coord_t *point, label_t label); + + std::priority_queue> searchKnn(const coord_t *query_data, size_t k); + + dist_t fstdistfunc(const coord_t *x, const coord_t *y); +}; diff --git a/pgxn/hnsw/test/expected/knn.out b/pgxn/hnsw/test/expected/knn.out new file mode 100644 index 000000000000..a1cee4525e12 --- /dev/null +++ b/pgxn/hnsw/test/expected/knn.out @@ -0,0 +1,28 @@ +SET enable_seqscan = off; +CREATE TABLE t (val real[]); +INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); +CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); +INSERT INTO t (val) VALUES (array[1,2,4]); +explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; + QUERY PLAN +-------------------------------------------------------------------- + Index Scan using t_val_idx on t (cost=4.02..8.06 rows=3 width=36) + Order By: (val <-> '{3,3,3}'::real[]) +(2 rows) + +SELECT * FROM t ORDER BY val <-> array[3,3,3]; + val +--------- + {1,2,3} + {1,2,4} + {1,1,1} + {0,0,0} +(4 rows) + +SELECT COUNT(*) FROM t; + count +------- + 5 +(1 row) + +DROP TABLE t; diff --git a/pgxn/hnsw/test/sql/knn.sql b/pgxn/hnsw/test/sql/knn.sql new file mode 100644 index 000000000000..0635bda4a222 --- /dev/null +++ b/pgxn/hnsw/test/sql/knn.sql @@ -0,0 +1,13 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val real[]); +INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); +CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); + +INSERT INTO t (val) VALUES (array[1,2,4]); + +explain SELECT * FROM t ORDER BY val <-> array[3,3,3]; +SELECT * FROM t ORDER BY val <-> array[3,3,3]; 
+SELECT COUNT(*) FROM t; + +DROP TABLE t; From 8caef2c0c566416db1b60a663799e69ef3e8dd3e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 5 Jun 2023 09:37:53 +0300 Subject: [PATCH 45/59] fix: delay `eviction_task` as well (#4397) As seen on deployment of 2023-06-01 release, times were improving but there were some outliers caused by: - timelines `eviction_task` starting while activating and running imitation - timelines `initial logical size` calculation This PR fixes it so that `eviction_task` is delayed like other background tasks fixing an oversight from earlier #4372. After this PR activation will be two phases: 1. load and activate tenants AND calculate some initial logical sizes 2. rest of initial logical sizes AND background tasks - compaction, gc, disk usage based eviction, timelines `eviction_task`, consumption metrics --- pageserver/src/bin/pageserver.rs | 10 +++++++--- pageserver/src/tenant.rs | 12 ++++++++---- pageserver/src/tenant/mgr.rs | 2 ++ pageserver/src/tenant/tasks.rs | 4 ++++ pageserver/src/tenant/timeline.rs | 10 ++++++++-- pageserver/src/tenant/timeline/eviction_task.rs | 6 +++++- 6 files changed, 34 insertions(+), 10 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index a2cebffc83e2..e0731ba79b2b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -335,9 +335,13 @@ fn start_pageserver( // Set up remote storage client let remote_storage = create_remote_storage_client(conf)?; - // All tenant load operations carry this while they are ongoing; it will be dropped once those - // operations finish either successfully or in some other manner. However, the initial load - // will be then done, and we can start the global background tasks. + // Startup staging or optimizing: + // + // (init_done_tx, init_done_rx) are used to control when do background loops start. This is to + // avoid starving out the BACKGROUND_RUNTIME async worker threads doing heavy work, like + // initial repartitioning while we still have Loading tenants. + // + // init_done_rx is a barrier which stops waiting once all init_done_tx clones are dropped. let (init_done_tx, init_done_rx) = utils::completion::channel(); // Scan the local 'tenants/' directory and start loading the tenants diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index af6a70c4f28b..a895b5709219 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -267,7 +267,7 @@ impl UninitializedTimeline<'_> { // updated it for the layers that we created during the import. let mut timelines = self.owning_tenant.timelines.lock().unwrap(); let tl = self.initialize_with_lock(ctx, &mut timelines, false)?; - tl.activate(broker_client, ctx); + tl.activate(broker_client, None, ctx); Ok(tl) } @@ -879,7 +879,6 @@ impl Tenant { )) } - /// /// Load a tenant that's available on local disk /// /// This is used at pageserver startup, to rebuild the in-memory @@ -890,6 +889,8 @@ impl Tenant { /// If the loading fails for some reason, the Tenant will go into Broken /// state. /// + /// `init_done` is an optional channel used during initial load to delay background task + /// start. It is not used later. 
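The `init_done` channel referred to above is the mechanism behind the two-phase startup described in this patch: every tenant load operation keeps a completion handle alive while it runs, and the delayed background loops wait on the barrier side until the last handle is dropped. Below is a minimal, self-contained sketch of that pattern using plain tokio primitives; the `Completion`/`Barrier`/`channel` names mirror the usage in the diff, but the implementation is only an illustration and is assumed, not the actual `utils::completion` code.

```rust
use std::sync::Arc;
use tokio::sync::{mpsc, Mutex};

// Held by each startup operation; nothing is ever sent, only the drop matters.
#[derive(Clone)]
struct Completion(mpsc::Sender<()>);

// Held by background tasks; resolves once all `Completion` clones are dropped.
#[derive(Clone)]
struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);

fn channel() -> (Completion, Barrier) {
    let (tx, rx) = mpsc::channel(1);
    (Completion(tx), Barrier(Arc::new(Mutex::new(rx))))
}

impl Barrier {
    // `recv()` yields `None` only after the last sender clone is gone.
    async fn wait(self) {
        while self.0.lock().await.recv().await.is_some() {}
    }
}

#[tokio::main]
async fn main() {
    let (init_done, barrier) = channel();

    // A background loop delays its first iteration until startup finishes.
    let bg = tokio::spawn({
        let barrier = barrier.clone();
        async move {
            barrier.wait().await;
            println!("initial load done, background loops may start");
        }
    });

    // "Initial tenant load": keeps the completion handle alive while it runs.
    tokio::time::sleep(std::time::Duration::from_millis(50)).await;
    drop(init_done); // last handle dropped -> barrier released

    bg.await.unwrap();
}
```

In the actual diff, `completion::Barrier::maybe_wait(init_done).await` plays the role of `barrier.wait()` here, and dropping the last `init_done_tx` clone at the end of initial tenant loading is what lets compaction, GC, and eviction loops begin.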
#[instrument(skip_all, fields(tenant_id=%tenant_id))] pub fn spawn_load( conf: &'static PageServerConf, @@ -1358,7 +1359,7 @@ impl Tenant { } }; - loaded_timeline.activate(broker_client, ctx); + loaded_timeline.activate(broker_client, None, ctx); if let Some(remote_client) = loaded_timeline.remote_client.as_ref() { // Wait for the upload of the 'index_part.json` file to finish, so that when we return @@ -1682,6 +1683,9 @@ impl Tenant { } /// Changes tenant status to active, unless shutdown was already requested. + /// + /// `init_done` is an optional channel used during initial load to delay background task + /// start. It is not used later. fn activate( self: &Arc, broker_client: BrokerClientChannel, @@ -1723,7 +1727,7 @@ impl Tenant { let mut activated_timelines = 0; for timeline in not_broken_timelines { - timeline.activate(broker_client.clone(), ctx); + timeline.activate(broker_client.clone(), init_done, ctx); activated_timelines += 1; } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 4318749777f7..05874bdd72cf 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -155,6 +155,8 @@ pub async fn init_tenant_mgr( Ok(()) } +/// `init_done` is an optional channel used during initial load to delay background task +/// start. It is not used later. pub fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 02aed11114d2..1bbc1b1c084a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -14,6 +14,10 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::completion; +/// Start per tenant background loops: compaction and gc. +/// +/// `init_done` is an optional channel used during initial load to delay background task +/// start. It is not used later. 
pub fn start_background_loops(tenant: &Arc, init_done: Option<&completion::Barrier>) { let tenant_id = tenant.tenant_id; task_mgr::spawn( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b45dcc4e42a9..3db78401f6bb 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -57,6 +57,7 @@ use pageserver_api::reltag::RelTag; use postgres_connection::PgConnectionConfig; use postgres_ffi::to_pg_timestamp; use utils::{ + completion, id::{TenantId, TimelineId}, lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, @@ -928,10 +929,15 @@ impl Timeline { Ok(()) } - pub fn activate(self: &Arc, broker_client: BrokerClientChannel, ctx: &RequestContext) { + pub fn activate( + self: &Arc, + broker_client: BrokerClientChannel, + init_done: Option<&completion::Barrier>, + ctx: &RequestContext, + ) { self.launch_wal_receiver(ctx, broker_client); self.set_state(TimelineState::Active); - self.launch_eviction_task(); + self.launch_eviction_task(init_done); } pub fn set_state(&self, new_state: TimelineState) { diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 558600692ec7..7029d75d63b8 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -34,6 +34,8 @@ use crate::{ }, }; +use utils::completion; + use super::Timeline; #[derive(Default)] @@ -47,8 +49,9 @@ pub struct EvictionTaskTenantState { } impl Timeline { - pub(super) fn launch_eviction_task(self: &Arc) { + pub(super) fn launch_eviction_task(self: &Arc, init_done: Option<&completion::Barrier>) { let self_clone = Arc::clone(self); + let init_done = init_done.cloned(); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, @@ -57,6 +60,7 @@ impl Timeline { &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id), false, async move { + completion::Barrier::maybe_wait(init_done).await; self_clone.eviction_task(task_mgr::shutdown_token()).await; info!("eviction task finishing"); Ok(()) From b9871158ba4cd2e08a8b5a4cd001944fa461ceac Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Mon, 5 Jun 2023 11:52:13 +0200 Subject: [PATCH 46/59] Compile PGX ULID extension (#4413) Create pgx_ulid extension ``` postgres=# create extension ulid; CREATE EXTENSION postgres=# CREATE TABLE users ( id ulid NOT NULL DEFAULT gen_ulid() PRIMARY KEY, name text NOT NULL ); CREATE TABLE postgres=# insert into users (name) values ('vadim'); INSERT 0 1 postgres=# select * from users; id | name ----------------------------+------- 01H25DDG3KYMYZTNR41X38E256 | vadim ``` --- Dockerfile.compute-node | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 8446ef9fa0d0..f8429e72b868 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -517,6 +517,22 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405 cargo pgx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control +######################################################################################### +# +# Layer "pg-pgx-ulid-build" +# Compile "pgx_ulid" extension +# +######################################################################################### + +FROM rust-extensions-build AS pg-pgx-ulid-build + +RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \ + echo 
"908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \ + mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ + sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgx install --release && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/pgx_ulid.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -547,6 +563,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 8142edda0166318ad8f868584657905ddcfa17be Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 5 Jun 2023 15:43:52 +0300 Subject: [PATCH 47/59] test: Less flaky gc (#4416) Solves a flaky test error in the wild[^1] by: - Make the gc shutdown signal reading an `allowed_error` - Note the gc shutdown signal readings as being in `allowed_error`s - Allow passing tenant conf to init_start to avoid unncessary tenants [^1]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4399/5176432780/index.html#suites/b97efae3a617afb71cb8142f5afa5224/2cd76021ea011f93 --- pageserver/src/tenant.rs | 1 + pageserver/src/tenant/timeline.rs | 1 + test_runner/fixtures/neon_fixtures.py | 7 +++++-- test_runner/regress/test_ondemand_download.py | 8 ++------ .../regress/test_pageserver_restarts_under_workload.py | 6 ------ 5 files changed, 9 insertions(+), 14 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a895b5709219..bcf4495ac2e4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1395,6 +1395,7 @@ impl Tenant { pitr: Duration, ctx: &RequestContext, ) -> anyhow::Result { + // there is a global allowed_error for this anyhow::ensure!( self.is_active(), "Cannot run GC iteration on inactive tenant" diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 3db78401f6bb..fdaad58e1602 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3749,6 +3749,7 @@ impl Timeline { // Is the timeline being deleted? 
let state = *self.state.borrow(); if state == TimelineState::Stopping { + // there's a global allowed_error for this anyhow::bail!("timeline is Stopping"); } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1007cb11b5ba..5017c8dcd36e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -629,7 +629,7 @@ def start(self): assert self.env is not None, "environment is not already initialized, call init() first" self.env.start() - def init_start(self) -> NeonEnv: + def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> NeonEnv: env = self.init_configs() self.start() @@ -638,7 +638,9 @@ def init_start(self) -> NeonEnv: log.info( f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline" ) - initial_tenant, initial_timeline = env.neon_cli.create_tenant(tenant_id=env.initial_tenant) + initial_tenant, initial_timeline = env.neon_cli.create_tenant( + tenant_id=env.initial_tenant, conf=initial_tenant_conf + ) env.initial_timeline = initial_timeline log.info(f"Initial timeline {initial_tenant}/{initial_timeline} created successfully") @@ -1613,6 +1615,7 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional ".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed ".*wait for layer upload ops to complete.*", # .*Caused by:.*wait_completion aborted because upload queue was stopped ".*gc_loop.*Gc failed, retrying in.*timeline is Stopping", # When gc checks timeline state after acquiring layer_removal_cs + ".*gc_loop.*Gc failed, retrying in.*: Cannot run GC iteration on inactive tenant", # Tenant::gc precondition ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping", # When compaction checks timeline state after acquiring layer_removal_cs ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index 1414b4ed8e65..c26ec76172bc 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -58,11 +58,8 @@ def test_ondemand_download_large_rel( ) ##### First start, insert secret data and upload it to the remote storage - env = neon_env_builder.init_start() - - # Override defaults, to create more layers - tenant, _ = env.neon_cli.create_tenant( - conf={ + env = neon_env_builder.init_start( + initial_tenant_conf={ # disable background GC "gc_period": "0s", "gc_horizon": f"{10 * 1024 ** 3}", # 10 GB @@ -75,7 +72,6 @@ def test_ondemand_download_large_rel( "compaction_period": "0s", } ) - env.initial_tenant = tenant endpoint = env.endpoints.create_start("main") diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index bc3f3f2be4de..fc93dcffbb7e 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -17,12 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB n_restarts = 10 scale = 10 - # the background task may complete the init task delay after finding an - # 
active tenant, but shutdown starts right before Tenant::gc_iteration - env.pageserver.allowed_errors.append( - r".*Gc failed, retrying in \S+: Cannot run GC iteration on inactive tenant" - ) - def run_pgbench(connstr: str): log.info(f"Start a pgbench workload on pg {connstr}") pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr]) From 77598f5d0ae3ad89debcc5d23143373332771632 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 5 Jun 2023 17:35:23 +0300 Subject: [PATCH 48/59] Better walreceiver logging (#4402) walreceiver logs are a bit hard to understand because of partial span usage, extra messages, ignored errors popping up as huge stacktraces. Fixes #3330 (by spans, also demote info -> debug). - arrange walreceivers spans into a hiearchy: - `wal_connection_manager{tenant_id, timeline_id}` -> `connection{node_id}` -> `poller` - unifies the error reporting inside `wal_receiver`: - All ok errors are now `walreceiver connection handling ended: {e:#}` - All unknown errors are still stacktraceful task_mgr reported errors with context `walreceiver connection handling failure` - Remove `connect` special casing, was: `DB connection stream finished` for ok errors - Remove `done replicating` special casing, was `Replication stream finished` for ok errors - lowered log levels for (non-exhaustive list): - `WAL receiver manager started, connecting to broker` (at startup) - `WAL receiver shutdown requested, shutting down` (at shutdown) - `Connection manager loop ended, shutting down` (at shutdown) - `sender is dropped while join handle is still alive` (at lucky shutdown, see #2885) - `timeline entered terminal state {:?}, stopping wal connection manager loop` (at shutdown) - `connected!` (at startup) - `Walreceiver db connection closed` (at disconnects?, was without span) - `Connection cancelled` (at shutdown, was without span) - `observed timeline state change, new state is {new_state:?}` (never after Timeline::activate was made infallible) - changed: - `Timeline dropped state updates sender, stopping wal connection manager loop` - was out of date; sender is not dropped but `Broken | Stopping` state transition - also made `debug!` - `Timeline dropped state updates sender before becoming active, stopping wal connection manager loop` - was out of date: sender is again not dropped but `Broken | Stopping` state transition - also made `debug!` - log fixes: - stop double reporting panics via JoinError --- pageserver/src/tenant/timeline/walreceiver.rs | 54 ++++---- .../walreceiver/connection_manager.rs | 48 ++++--- .../walreceiver/walreceiver_connection.rs | 119 ++++++++++-------- test_runner/fixtures/neon_fixtures.py | 9 +- 4 files changed, 121 insertions(+), 109 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 7ebf3cf17245..ccff735c3c6b 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -25,6 +25,7 @@ mod walreceiver_connection; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; @@ -85,7 +86,8 @@ impl WalReceiver { &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), false, async move { - info!("WAL receiver manager started, connecting to broker"); + 
debug_assert_current_span_has_tenant_and_timeline_id(); + debug!("WAL receiver manager started, connecting to broker"); let mut connection_manager_state = ConnectionManagerState::new( timeline, conf, @@ -93,7 +95,7 @@ impl WalReceiver { loop { select! { _ = task_mgr::shutdown_watcher() => { - info!("WAL receiver shutdown requested, shutting down"); + trace!("WAL receiver shutdown requested, shutting down"); break; }, loop_step_result = connection_manager_loop_step( @@ -104,7 +106,7 @@ impl WalReceiver { ) => match loop_step_result { ControlFlow::Continue(()) => continue, ControlFlow::Break(()) => { - info!("Connection manager loop ended, shutting down"); + trace!("Connection manager loop ended, shutting down"); break; } }, @@ -115,7 +117,7 @@ impl WalReceiver { *loop_status.write().unwrap() = None; Ok(()) } - .instrument(info_span!(parent: None, "wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id)) + .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) ); Self { @@ -198,29 +200,19 @@ impl TaskHandle { TaskEvent::End(match self.join_handle.as_mut() { Some(jh) => { if !jh.is_finished() { - // Barring any implementation errors in this module, we can - // only arrive here while the task that executes the future - // passed to `Self::spawn()` is still execution. Cf the comment - // in Self::spawn(). - // - // This was logging at warning level in earlier versions, presumably - // to leave some breadcrumbs in case we had an implementation - // error that would would make us get stuck in `jh.await`. - // - // There hasn't been such a bug so far. - // But in a busy system, e.g., during pageserver restart, - // we arrive here often enough that the warning-level logs - // became a distraction. - // So, tone them down to info-level. - // - // XXX: rewrite this module to eliminate the race condition. - info!("sender is dropped while join handle is still alive"); + // See: https://github.com/neondatabase/neon/issues/2885 + trace!("sender is dropped while join handle is still alive"); } - let res = jh - .await - .map_err(|e| anyhow::anyhow!("Failed to join task: {e}")) - .and_then(|x| x); + let res = match jh.await { + Ok(res) => res, + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // already logged + Ok(()) + } + Err(je) => Err(anyhow::Error::new(je).context("join walreceiver task")), + }; // For cancellation-safety, drop join_handle only after successful .await. 
self.join_handle = None; @@ -243,12 +235,12 @@ impl TaskHandle { match jh.await { Ok(Ok(())) => debug!("Shutdown success"), Ok(Err(e)) => error!("Shutdown task error: {e:?}"), - Err(join_error) => { - if join_error.is_cancelled() { - error!("Shutdown task was cancelled"); - } else { - error!("Shutdown task join error: {join_error}") - } + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // already logged + } + Err(je) => { + error!("Shutdown task join error: {je}") } } } diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 6b65e1fd429c..e235fab425b5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -18,7 +18,7 @@ use crate::metrics::{ WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES, }; use crate::task_mgr::TaskKind; -use crate::tenant::Timeline; +use crate::tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; @@ -55,8 +55,11 @@ pub(super) async fn connection_manager_loop_step( .await { Ok(()) => {} - Err(_) => { - info!("Timeline dropped state updates sender before becoming active, stopping wal connection manager loop"); + Err(new_state) => { + debug!( + ?new_state, + "state changed, stopping wal connection manager loop" + ); return ControlFlow::Break(()); } } @@ -79,7 +82,7 @@ pub(super) async fn connection_manager_loop_step( // with other streams on this client (other connection managers). When // object goes out of scope, stream finishes in drop() automatically. let mut broker_subscription = subscribe_for_timeline_updates(broker_client, id).await; - info!("Subscribed for broker timeline updates"); + debug!("Subscribed for broker timeline updates"); loop { let time_until_next_retry = connection_manager_state.time_until_next_retry(); @@ -151,12 +154,12 @@ pub(super) async fn connection_manager_loop_step( // we're already active as walreceiver, no need to reactivate TimelineState::Active => continue, TimelineState::Broken | TimelineState::Stopping => { - info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop"); + debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop"); return ControlFlow::Break(()); } TimelineState::Loading => { warn!("timeline transitioned back to Loading state, that should not happen"); - return ControlFlow::Continue(new_state); + return ControlFlow::Continue(()); } } } @@ -164,12 +167,11 @@ pub(super) async fn connection_manager_loop_step( } } } => match new_event { - ControlFlow::Continue(new_state) => { - info!("observed timeline state change, new state is {new_state:?}"); + ControlFlow::Continue(()) => { return ControlFlow::Continue(()); } ControlFlow::Break(()) => { - info!("Timeline dropped state updates sender, stopping wal connection manager loop"); + debug!("Timeline is no longer active, stopping wal connection manager loop"); return ControlFlow::Break(()); } }, @@ -390,7 +392,6 @@ impl ConnectionManagerState { self.drop_old_connection(true).await; - let id = self.id; let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); @@ -398,9 +399,13 @@ impl ConnectionManagerState { TaskKind::WalReceiverConnectionHandler, DownloadBehavior::Download, ); + + let span 
= info_span!("connection", %node_id); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { - super::walreceiver_connection::handle_walreceiver_connection( + debug_assert_current_span_has_tenant_and_timeline_id(); + + let res = super::walreceiver_connection::handle_walreceiver_connection( timeline, new_sk.wal_source_connconf, events_sender, @@ -409,12 +414,23 @@ impl ConnectionManagerState { ctx, node_id, ) - .await - .context("walreceiver connection handling failure") + .await; + + match res { + Ok(()) => Ok(()), + Err(e) => { + use super::walreceiver_connection::ExpectedError; + if e.is_expected() { + info!("walreceiver connection handling ended: {e:#}"); + Ok(()) + } else { + // give out an error to have task_mgr give it a really verbose logging + Err(e).context("walreceiver connection handling failure") + } + } + } } - .instrument( - info_span!("walreceiver_connection", tenant_id = %id.tenant_id, timeline_id = %id.timeline_id, %node_id), - ) + .instrument(span) }); let now = Utc::now().naive_utc(); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 1cbed3416c23..41f6c63d40a0 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -21,16 +21,16 @@ use postgres_types::PgLsn; use tokio::{select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, trace, warn}; +use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; -use crate::metrics::LIVE_CONNECTIONS_COUNT; -use crate::{context::RequestContext, metrics::WALRECEIVER_STARTED_CONNECTIONS}; use crate::{ + context::RequestContext, + metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS}, task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, - tenant::{Timeline, WalReceiverInfo}, + tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, walrecord::DecodedWALRecord, }; @@ -81,13 +81,8 @@ pub(super) async fn handle_walreceiver_connection( config.application_name("pageserver"); config.replication_mode(tokio_postgres::config::ReplicationMode::Physical); match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await { - Ok(Ok(client_and_conn)) => client_and_conn, - Ok(Err(conn_err)) => { - let expected_error = ignore_expected_errors(conn_err)?; - info!("DB connection stream finished: {expected_error}"); - return Ok(()); - } - Err(_) => { + Ok(client_and_conn) => client_and_conn?, + Err(_elapsed) => { // Timing out to connect to a safekeeper node could happen long time, due to // many reasons that pageserver cannot control. // Do not produce an error, but make it visible, that timeouts happen by logging the `event. @@ -97,7 +92,7 @@ pub(super) async fn handle_walreceiver_connection( } }; - info!("connected!"); + debug!("connected!"); let mut connection_status = WalConnectionStatus { is_connected: true, has_processed_wal: false, @@ -127,20 +122,24 @@ pub(super) async fn handle_walreceiver_connection( "walreceiver connection", false, async move { + debug_assert_current_span_has_tenant_and_timeline_id(); + select! 
{ connection_result = connection => match connection_result { - Ok(()) => info!("Walreceiver db connection closed"), + Ok(()) => debug!("Walreceiver db connection closed"), Err(connection_error) => { - if let Err(e) = ignore_expected_errors(connection_error) { - warn!("Connection aborted: {e:#}") + if connection_error.is_expected() { + // silence + } else { + warn!("Connection aborted: {connection_error:#}") } } }, - // Future: replace connection_cancellation with connection_ctx cancellation - _ = connection_cancellation.cancelled() => info!("Connection cancelled"), + _ = connection_cancellation.cancelled() => debug!("Connection cancelled"), } Ok(()) - }, + } + .instrument(tracing::info_span!("poller")), ); // Immediately increment the gauge, then create a job to decrement it on task exit. @@ -203,20 +202,13 @@ pub(super) async fn handle_walreceiver_connection( while let Some(replication_message) = { select! { _ = cancellation.cancelled() => { - info!("walreceiver interrupted"); + debug!("walreceiver interrupted"); None } replication_message = physical_stream.next() => replication_message, } } { - let replication_message = match replication_message { - Ok(message) => message, - Err(replication_error) => { - let expected_error = ignore_expected_errors(replication_error)?; - info!("Replication stream finished: {expected_error}"); - return Ok(()); - } - }; + let replication_message = replication_message?; let now = Utc::now().naive_utc(); let last_rec_lsn_before_msg = last_rec_lsn; @@ -261,8 +253,6 @@ pub(super) async fn handle_walreceiver_connection( let mut decoded = DecodedWALRecord::default(); let mut modification = timeline.begin_modification(endlsn); while let Some((lsn, recdata)) = waldecoder.poll_decode()? { - // let _enter = info_span!("processing record", lsn = %lsn).entered(); - // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are // at risk of hitting a deadlock. @@ -421,31 +411,50 @@ async fn identify_system(client: &mut Client) -> anyhow::Result } } -/// We don't want to report connectivity problems as real errors towards connection manager because -/// 1. they happen frequently enough to make server logs hard to read and -/// 2. the connection manager can retry other safekeeper. -/// -/// If this function returns `Ok(pg_error)`, it's such an error. -/// The caller should log it at info level and then report to connection manager that we're done handling this connection. -/// Connection manager will then handle reconnections. -/// -/// If this function returns an `Err()`, the caller can bubble it up using `?`. -/// The connection manager will log the error at ERROR level. -fn ignore_expected_errors(pg_error: postgres::Error) -> anyhow::Result { - if pg_error.is_closed() - || pg_error - .source() - .and_then(|source| source.downcast_ref::()) - .map(is_expected_io_error) - .unwrap_or(false) - { - return Ok(pg_error); - } else if let Some(db_error) = pg_error.as_db_error() { - if db_error.code() == &SqlState::SUCCESSFUL_COMPLETION - && db_error.message().contains("ending streaming") - { - return Ok(pg_error); - } +/// Trait for avoid reporting walreceiver specific expected or "normal" or "ok" errors. +pub(super) trait ExpectedError { + /// Test if this error is an ok error. + /// + /// We don't want to report connectivity problems as real errors towards connection manager because + /// 1. they happen frequently enough to make server logs hard to read and + /// 2. 
the connection manager can retry other safekeeper. + /// + /// If this function returns `true`, it's such an error. + /// The caller should log it at info level and then report to connection manager that we're done handling this connection. + /// Connection manager will then handle reconnections. + /// + /// If this function returns an `false` the error should be propagated and the connection manager + /// will log the error at ERROR level. + fn is_expected(&self) -> bool; +} + +impl ExpectedError for postgres::Error { + fn is_expected(&self) -> bool { + self.is_closed() + || self + .source() + .and_then(|source| source.downcast_ref::()) + .map(is_expected_io_error) + .unwrap_or(false) + || self + .as_db_error() + .filter(|db_error| { + db_error.code() == &SqlState::SUCCESSFUL_COMPLETION + && db_error.message().contains("ending streaming") + }) + .is_some() + } +} + +impl ExpectedError for anyhow::Error { + fn is_expected(&self) -> bool { + let head = self.downcast_ref::(); + + let tail = self + .chain() + .filter_map(|e| e.downcast_ref::()); + + // check if self or any of the chained/sourced errors are expected + head.into_iter().chain(tail).any(|e| e.is_expected()) } - Err(pg_error).context("connection error") } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5017c8dcd36e..e23ed1287846 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1585,13 +1585,10 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional ".*serving compute connection task.*exited with error: Postgres connection error.*", ".*serving compute connection task.*exited with error: Connection reset by peer.*", ".*serving compute connection task.*exited with error: Postgres query error.*", - ".*Connection aborted: connection error: error communicating with the server: Broken pipe.*", - ".*Connection aborted: connection error: error communicating with the server: Transport endpoint is not connected.*", - ".*Connection aborted: connection error: error communicating with the server: Connection reset by peer.*", # FIXME: replication patch for tokio_postgres regards any but CopyDone/CopyData message in CopyBoth stream as unexpected - ".*Connection aborted: connection error: unexpected message from server*", + ".*Connection aborted: unexpected message from server*", ".*kill_and_wait_impl.*: wait successful.*", - ".*Replication stream finished: db error:.*ending streaming to Some*", + ".*: db error:.*ending streaming to Some.*", ".*query handler for 'pagestream.*failed: Broken pipe.*", # pageserver notices compute shut down ".*query handler for 'pagestream.*failed: Connection reset by peer.*", # pageserver notices compute shut down # safekeeper connection can fail with this, in the window between timeline creation @@ -1608,8 +1605,6 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", ".*Removing intermediate uninit mark file.*", - # FIXME: known race condition in TaskHandle: https://github.com/neondatabase/neon/issues/2885 - ".*sender is dropped while join handle is still alive.*", # Tenant::delete_timeline() can cause any of the four following errors. 
# FIXME: we shouldn't be considering it an error: https://github.com/neondatabase/neon/issues/2946 ".*could not flush frozen layer.*queue is in state Stopped", # when schedule layer upload fails because queued got closed before compaction got killed From e0bd81ce1f9204a50a74c4f71baacee8220be5a6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 5 Jun 2023 18:12:58 +0300 Subject: [PATCH 49/59] test: fix flaky warning on attach (#4415) added the `allowed_error` to the `positive_env` so any tests completing the attach are allowed have this print out. they are allowed to do so, because the `random_init_delay` can produce close to zero and thus the first run will be near attach. Though... Unsure if we ever really need the eviction task to run **before** it can evict something, as in after 20min or 24h. in the failed test case however period is 20s so interesting that we didn't run into this sooner. evidence of flaky: https://github.com/neondatabase/neon/actions/runs/5175677035/jobs/9323705929?pr=4399#step:4:38536 --- test_runner/regress/test_attach_tenant_config.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 6261ec28dbdf..4df5ae18d6f0 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -20,6 +20,11 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: test_name="test_attach_tenant_config", ) env = neon_env_builder.init_start() + + # eviction might be the first one after an attach to access the layers + env.pageserver.allowed_errors.append( + ".*unexpectedly on-demand downloading remote layer remote.* for task kind Eviction" + ) assert isinstance(env.remote_storage, LocalFsStorage) return env From 8e1b5e12245ff386c6d0252d6da3b1a1c2d817ea Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 5 Jun 2023 20:10:19 +0300 Subject: [PATCH 50/59] =?UTF-8?q?Remove=20=20-ftree-vectorizer-verbose=3D0?= =?UTF-8?q?=20option=20notrecognized=20by=20MaxOS/X=20c=E2=80=A6=20(#4412)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ompiler ## Problem ## Summary of changes ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- pgxn/hnsw/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/hnsw/Makefile b/pgxn/hnsw/Makefile index 9bdd87430c3b..79a4d907614b 100644 --- a/pgxn/hnsw/Makefile +++ b/pgxn/hnsw/Makefile @@ -12,7 +12,7 @@ REGRESS_OPTS = --inputdir=test --load-extension=hnsw # For auto-vectorization: # - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html PG_CFLAGS += -O3 -PG_CPPFLAGS += -msse4.1 -O3 -march=native -ftree-vectorize -ftree-vectorizer-verbose=0 +PG_CXXFLAGS += -msse4 -mavx2 -O3 -std=c++11 PG_LDFLAGS += -lstdc++ all: $(EXTENSION)--$(EXTVERSION).sql From ac11e7c32dacf5090efd95bd9c425150d1ceca33 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 5 Jun 2023 22:04:15 -0800 Subject: [PATCH 51/59] Remove arch-specific stuff from HNSW extension (#4423) --- pgxn/hnsw/Makefile | 2 +- pgxn/hnsw/hnswalg.cpp | 123 +++++++++++++++++++++++------------------- pgxn/hnsw/hnswalg.h | 6 ++- 3 files changed, 72 insertions(+), 59 deletions(-) diff --git a/pgxn/hnsw/Makefile b/pgxn/hnsw/Makefile index 79a4d907614b..66436b592090 100644 --- a/pgxn/hnsw/Makefile +++ b/pgxn/hnsw/Makefile @@ -12,7 +12,7 @@ REGRESS_OPTS = --inputdir=test --load-extension=hnsw # For auto-vectorization: # - GCC (needs -ftree-vectorize OR -O3) - https://gcc.gnu.org/projects/tree-ssa/vectorization.html PG_CFLAGS += -O3 -PG_CXXFLAGS += -msse4 -mavx2 -O3 -std=c++11 +PG_CXXFLAGS += -O3 -std=c++11 PG_LDFLAGS += -lstdc++ all: $(EXTENSION)--$(EXTVERSION).sql diff --git a/pgxn/hnsw/hnswalg.cpp b/pgxn/hnsw/hnswalg.cpp index 226dcbd53d7f..f6de3b8314f0 100644 --- a/pgxn/hnsw/hnswalg.cpp +++ b/pgxn/hnsw/hnswalg.cpp @@ -1,22 +1,11 @@ #include "hnswalg.h" - -#if defined(__x86_64__) - -#include -#define USE_AVX #if defined(__GNUC__) #define PORTABLE_ALIGN32 __attribute__((aligned(32))) +#define PREFETCH(addr,hint) __builtin_prefetch(addr, 0, hint) #else #define PORTABLE_ALIGN32 __declspec(align(32)) -#endif - -#define PREFETCH(addr,hint) _mm_prefetch(addr, hint) - -#else - #define PREFETCH(addr,hint) - #endif HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, size_t maxM_, size_t efConstruction_) @@ -36,7 +25,9 @@ HierarchicalNSW::HierarchicalNSW(size_t dim_, size_t maxelements_, size_t M_, si enterpoint_node = 0; cur_element_count = 0; - dist_calc = 0; +#ifdef __x86_64__ + use_avx2 = __builtin_cpu_supports("avx2"); +#endif } std::priority_queue> HierarchicalNSW::searchBaseLayer(const coord_t *point, size_t ef) @@ -66,12 +57,12 @@ std::priority_queue> HierarchicalNSW::searchBaseLayer(c idx_t* data = get_linklist0(curNodeNum); size_t size = *data++; - PREFETCH(getDataByInternalId(*data), _MM_HINT_T0); + PREFETCH(getDataByInternalId(*data), 0); for (size_t j = 0; j < size; ++j) { size_t tnum = *(data + j); - PREFETCH(getDataByInternalId(*(data + j + 1)), _MM_HINT_T0); + PREFETCH(getDataByInternalId(*(data + j + 1)), 0); if (!(visited[tnum >> 5] & (1 << (tnum & 31)))) { visited[tnum >> 5] |= 1 << (tnum & 31); @@ -81,7 +72,7 @@ std::priority_queue> HierarchicalNSW::searchBaseLayer(c if (topResults.top().first > dist || topResults.size() < ef) { candidateSet.emplace(-dist, tnum); - PREFETCH(get_linklist0(candidateSet.top().second), _MM_HINT_T0); + PREFETCH(get_linklist0(candidateSet.top().second), 0); topResults.emplace(dist, tnum); if (topResults.size() > ef) @@ -228,37 +219,59 @@ std::priority_queue> HierarchicalNSW::searchKnn(const return 
topResults; }; -dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y) +dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n) { -#if defined(__x86_64__) - float PORTABLE_ALIGN32 TmpRes[8]; - size_t qty16 = dim >> 4; - const float *pEnd1 = x + (qty16 << 4); -#ifdef USE_AVX - __m256 diff, v1, v2; - __m256 sum = _mm256_set1_ps(0); - - while (x < pEnd1) { - v1 = _mm256_loadu_ps(x); - x += 8; - v2 = _mm256_loadu_ps(y); - y += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - - v1 = _mm256_loadu_ps(x); - x += 8; - v2 = _mm256_loadu_ps(y); - y += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - } + dist_t distance = 0.0; + + for (size_t i = 0; i < n; i++) + { + dist_t diff = x[i] - y[i]; + distance += diff * diff; + } + return distance; + +} - _mm256_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; +#ifdef __x86_64__ +#include + +__attribute__((target("avx2"))) +dist_t fstdistfunc_avx2(const coord_t *x, const coord_t *y, size_t n) +{ + const size_t TmpResSz = sizeof(__m256) / sizeof(float); + float PORTABLE_ALIGN32 TmpRes[TmpResSz]; + size_t qty16 = n / 16; + const float *pEnd1 = x + (qty16 * 16); + __m256 diff, v1, v2; + __m256 sum = _mm256_set1_ps(0); + + while (x < pEnd1) { + v1 = _mm256_loadu_ps(x); + x += 8; + v2 = _mm256_loadu_ps(y); + y += 8; + diff = _mm256_sub_ps(v1, v2); + sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); + + v1 = _mm256_loadu_ps(x); + x += 8; + v2 = _mm256_loadu_ps(y); + y += 8; + diff = _mm256_sub_ps(v1, v2); + sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); + } + _mm256_store_ps(TmpRes, sum); + float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; + return (res); +} + +dist_t fstdistfunc_sse(const coord_t *x, const coord_t *y, size_t n) +{ + const size_t TmpResSz = sizeof(__m128) / sizeof(float); + float PORTABLE_ALIGN32 TmpRes[TmpResSz]; + size_t qty16 = n / 16; + const float *pEnd1 = x + (qty16 * 16); - return (res); -#else __m128 diff, v1, v2; __m128 sum = _mm_set1_ps(0); @@ -293,21 +306,19 @@ dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y) } _mm_store_ps(TmpRes, sum); float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - - return (res); + return res; +} #endif -#else // portable implementation - dist_t distance = 0.0; - size_t n = dim; - dist_calc++; +dist_t HierarchicalNSW::fstdistfunc(const coord_t *x, const coord_t *y) +{ +#ifndef __x86_64__ + return fstdistfunc_scalar(x, y, dim); +#else + if(use_avx2) + return fstdistfunc_avx2(x, y, dim); - for (size_t i = 0; i < n; i++) - { - dist_t diff = x[i] - y[i]; - distance += diff * diff; - } - return distance; + return fstdistfunc_sse(x, y, dim); #endif } diff --git a/pgxn/hnsw/hnswalg.h b/pgxn/hnsw/hnswalg.h index b845ad2743e3..f38aeac36285 100644 --- a/pgxn/hnsw/hnswalg.h +++ b/pgxn/hnsw/hnswalg.h @@ -22,8 +22,6 @@ struct HierarchicalNSW idx_t enterpoint_node; - size_t dist_calc; - size_t dim; size_t data_size; size_t offset_data; @@ -34,6 +32,10 @@ struct HierarchicalNSW size_t size_links_level0; size_t efConstruction; +#ifdef __x86_64__ + bool use_avx2; +#endif + char data_level0_memory[0]; // varying size public: From 18a9d47f8e8f0662596c6085dfcbd98a0fb1d914 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 6 Jun 2023 13:51:39 +0300 Subject: [PATCH 52/59] test: restore NotConnected being allowed globally (#4426) 
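The HNSW change above replaces the compile-time `-msse4 -mavx2` flags with a runtime `__builtin_cpu_supports("avx2")` check and then dispatches to an AVX2, SSE, or scalar distance function. For illustration only, a minimal sketch of the same runtime-dispatch idea written in Rust (hypothetical names, not code from this repository; the extension itself is C++):

```rust
// Hypothetical sketch: squared L2 distance with runtime AVX2 detection.
pub fn squared_l2(x: &[f32], y: &[f32]) -> f32 {
    assert_eq!(x.len(), y.len());
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            // SAFETY: only called after the runtime feature check above.
            return unsafe { squared_l2_avx2(x, y) };
        }
    }
    squared_l2_scalar(x, y)
}

fn squared_l2_scalar(x: &[f32], y: &[f32]) -> f32 {
    x.iter().zip(y).map(|(a, b)| (a - b) * (a - b)).sum()
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn squared_l2_avx2(x: &[f32], y: &[f32]) -> f32 {
    use std::arch::x86_64::*;
    let chunks = x.len() / 8;
    let mut sum = _mm256_setzero_ps();
    for i in 0..chunks {
        let v1 = _mm256_loadu_ps(x.as_ptr().add(i * 8));
        let v2 = _mm256_loadu_ps(y.as_ptr().add(i * 8));
        let d = _mm256_sub_ps(v1, v2);
        sum = _mm256_add_ps(sum, _mm256_mul_ps(d, d));
    }
    let mut lanes = [0.0f32; 8];
    _mm256_storeu_ps(lanes.as_mut_ptr(), sum);
    let mut res: f32 = lanes.iter().sum();
    // scalar tail for lengths that are not a multiple of 8
    for i in chunks * 8..x.len() {
        let d = x[i] - y[i];
        res += d * d;
    }
    res
}
```

The point is the same as in the C++ patch: one binary works on any x86-64 machine, and the vectorized path is only taken when the CPU actually supports it.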
Flakyness introduced by #4402 evidence [^1]. I had assumed the NotConnected would had been an expected io error, but it's not. Restore the global `allowed_error`. [^1]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4407/5185897757/index.html#suites/82004ab4e3720b47bf78f312dabe7c55/14f636d0ecd3939d/ --- .../src/tenant/timeline/walreceiver/walreceiver_connection.rs | 3 ++- test_runner/fixtures/neon_fixtures.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 41f6c63d40a0..a16afe2b3c17 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -129,7 +129,8 @@ pub(super) async fn handle_walreceiver_connection( Ok(()) => debug!("Walreceiver db connection closed"), Err(connection_error) => { if connection_error.is_expected() { - // silence + // silence, because most likely we've already exited the outer call + // with a similar error. } else { warn!("Connection aborted: {connection_error:#}") } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e23ed1287846..0c63fd126217 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1585,6 +1585,7 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional ".*serving compute connection task.*exited with error: Postgres connection error.*", ".*serving compute connection task.*exited with error: Connection reset by peer.*", ".*serving compute connection task.*exited with error: Postgres query error.*", + ".*Connection aborted: error communicating with the server: Transport endpoint is not connected.*", # FIXME: replication patch for tokio_postgres regards any but CopyDone/CopyData message in CopyBoth stream as unexpected ".*Connection aborted: unexpected message from server*", ".*kill_and_wait_impl.*: wait successful.*", From 0cef7e977dc076b8da24cebf988b529dfef3405c Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 6 Jun 2023 15:30:55 +0300 Subject: [PATCH 53/59] refactor: just one way to shutdown a tenant (#4407) We have 2 ways of tenant shutdown, we should have just one. Changes are mostly mechanical simple refactorings. Added `warn!` on the "shutdown all remaining tasks" should trigger test failures in the between time of not having solved the "tenant/timeline owns all spawned tasks" issue. Cc: #4327. --- pageserver/src/lib.rs | 6 - pageserver/src/task_mgr.rs | 21 +++- pageserver/src/tenant.rs | 127 +++++++++++++++++--- pageserver/src/tenant/mgr.rs | 123 +++++-------------- test_runner/fixtures/neon_fixtures.py | 3 + test_runner/regress/test_remote_storage.py | 6 +- test_runner/regress/test_timeline_delete.py | 3 +- 7 files changed, 167 insertions(+), 122 deletions(-) diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 776cf0dac1b4..40a672bee3fb 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -58,12 +58,6 @@ pub async fn shutdown_pageserver(exit_code: i32) { // the checkpoint and GC tasks. tenant::mgr::shutdown_all_tenants().await; - // Stop syncing with remote storage. - // - // FIXME: Does this wait for the sync tasks to finish syncing what's queued up? - // Should it? 
- task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), None, None).await; - // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 82aebc6c07a1..4df0e4e6f22c 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -476,18 +476,35 @@ pub async fn shutdown_tasks( && (timeline_id.is_none() || task_mut.timeline_id == timeline_id) { task.cancel.cancel(); - victim_tasks.push(Arc::clone(task)); + victim_tasks.push(( + Arc::clone(task), + task.kind, + task_mut.tenant_id, + task_mut.timeline_id, + )); } } } - for task in victim_tasks { + let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none(); + + for (task, task_kind, tenant_id, timeline_id) in victim_tasks { let join_handle = { let mut task_mut = task.mutable.lock().unwrap(); task_mut.join_handle.take() }; if let Some(mut join_handle) = join_handle { + if log_all { + if tenant_id.is_none() { + // there are quite few of these + info!(name = task.name, kind = ?task_kind, "stopping global task"); + } else { + // warn to catch these in tests; there shouldn't be any + warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); + } + } let completed = tokio::select! { + biased; _ = &mut join_handle => { true }, _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => { // allow some time to elapse before logging to cut down the number of log diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index bcf4495ac2e4..7ce0ed81bc31 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -486,6 +486,10 @@ impl std::fmt::Display for WaitToBecomeActiveError { } } +pub(crate) enum ShutdownError { + AlreadyStopping, +} + impl Tenant { /// Yet another helper for timeline initialization. /// Contains the common part of `load_local_timeline` and `load_remote_timeline`. @@ -1439,28 +1443,63 @@ impl Tenant { Ok(()) } - /// Flush all in-memory data to disk. + /// Flush all in-memory data to disk and remote storage, if any. /// /// Used at graceful shutdown. - /// - pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { - // Scan through the hashmap and collect a list of all the timelines, - // while holding the lock. Then drop the lock and actually perform the - // flushing. We don't want to block everything else while the - // flushing is performed. - let timelines_to_flush = { + async fn freeze_and_flush_on_shutdown(&self) { + let mut js = tokio::task::JoinSet::new(); + + // execute on each timeline on the JoinSet, join after. + let per_timeline = |timeline_id: TimelineId, timeline: Arc| { + async move { + debug_assert_current_span_has_tenant_and_timeline_id(); + + match timeline.freeze_and_flush().await { + Ok(()) => {} + Err(e) => { + warn!("failed to freeze and flush: {e:#}"); + return; + } + } + + let res = if let Some(client) = timeline.remote_client.as_ref() { + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? 
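The `shutdown_tasks` change earlier in this patch also wraps each task's join handle in a "log if this takes too long" guard. A stripped-down sketch of that pattern (tokio runtime assumed; names are illustrative, not the repository's API):

```rust
// Illustrative: wait on a join handle, but log after one second so a stuck
// task shows up in the shutdown logs instead of hanging silently.
async fn join_logging_slow(mut handle: tokio::task::JoinHandle<()>, name: &str) {
    let completed = tokio::select! {
        biased;
        _ = &mut handle => true,
        _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => false,
    };
    if !completed {
        eprintln!("still waiting for task {name} to shut down");
        // keep waiting; the select above only bounded how long we stay quiet
        let _ = handle.await;
    }
}
```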
+ client.wait_completion().await + } else { + Ok(()) + }; + + if let Err(e) = res { + warn!("failed to await for frozen and flushed uploads: {e:#}"); + } + } + .instrument(tracing::info_span!("freeze_and_flush_on_shutdown", %timeline_id)) + }; + + { let timelines = self.timelines.lock().unwrap(); timelines .iter() - .map(|(_id, timeline)| Arc::clone(timeline)) - .collect::>() + .map(|(id, tl)| (*id, Arc::clone(tl))) + .for_each(|(timeline_id, timeline)| { + js.spawn(per_timeline(timeline_id, timeline)); + }) }; - for timeline in &timelines_to_flush { - timeline.freeze_and_flush().await?; + while let Some(res) = js.join_next().await { + match res { + Ok(()) => {} + Err(je) if je.is_cancelled() => unreachable!("no cancelling used"), + Err(je) if je.is_panic() => { /* logged already */ } + Err(je) => warn!("unexpected JoinError: {je:?}"), + } } - - Ok(()) } /// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its @@ -1756,12 +1795,70 @@ impl Tenant { } } + /// Shutdown the tenant and join all of the spawned tasks. + /// + /// The method caters for all use-cases: + /// - pageserver shutdown (freeze_and_flush == true) + /// - detach + ignore (freeze_and_flush == false) + /// + /// This will attempt to shutdown even if tenant is broken. + pub(crate) async fn shutdown(&self, freeze_and_flush: bool) -> Result<(), ShutdownError> { + debug_assert_current_span_has_tenant_id(); + // Set tenant (and its timlines) to Stoppping state. + // + // Since we can only transition into Stopping state after activation is complete, + // run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed. + // + // Transitioning tenants to Stopping state has a couple of non-obvious side effects: + // 1. Lock out any new requests to the tenants. + // 2. Signal cancellation to WAL receivers (we wait on it below). + // 3. Signal cancellation for other tenant background loops. + // 4. ??? + // + // The waiting for the cancellation is not done uniformly. + // We certainly wait for WAL receivers to shut down. + // That is necessary so that no new data comes in before the freeze_and_flush. + // But the tenant background loops are joined-on in our caller. + // It's mesed up. + // we just ignore the failure to stop + match self.set_stopping().await { + Ok(()) => {} + Err(SetStoppingError::Broken) => { + // assume that this is acceptable + } + Err(SetStoppingError::AlreadyStopping) => return Err(ShutdownError::AlreadyStopping), + }; + + if freeze_and_flush { + // walreceiver has already began to shutdown with TenantState::Stopping, but we need to + // await for them to stop. + task_mgr::shutdown_tasks( + Some(TaskKind::WalReceiverManager), + Some(self.tenant_id), + None, + ) + .await; + + // this will wait for uploads to complete; in the past, it was done outside tenant + // shutdown in pageserver::shutdown_pageserver. + self.freeze_and_flush_on_shutdown().await; + } + + // shutdown all tenant and timeline tasks: gc, compaction, page service + // No new tasks will be started for this tenant because it's in `Stopping` state. + // + // this will additionally shutdown and await all timeline tasks. + task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await; + + Ok(()) + } + /// Change tenant status to Stopping, to mark that it is being shut down. /// /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state. /// /// This function is not cancel-safe! 
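`freeze_and_flush_on_shutdown` fans the per-timeline work out onto a `tokio::task::JoinSet` and then drains it, treating panics as already logged. A generic, hedged sketch of that spawn-then-drain shape:

```rust
// Illustrative: run every future on a JoinSet, then join them all, mapping
// JoinErrors the same way the patch does.
async fn run_all<Fut>(futures: Vec<Fut>)
where
    Fut: std::future::Future<Output = ()> + Send + 'static,
{
    let mut js = tokio::task::JoinSet::new();
    for fut in futures {
        js.spawn(fut);
    }
    while let Some(res) = js.join_next().await {
        match res {
            Ok(()) => {}
            Err(je) if je.is_cancelled() => unreachable!("no cancellation used"),
            Err(je) if je.is_panic() => { /* panic was already logged by the task */ }
            Err(je) => eprintln!("unexpected JoinError: {je:?}"),
        }
    }
}
```

This keeps the shutdown path concurrent across timelines while still surfacing a panic or an unexpected join failure per timeline rather than aborting the whole loop.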
- pub async fn set_stopping(&self) -> Result<(), SetStoppingError> { + async fn set_stopping(&self) -> Result<(), SetStoppingError> { let mut rx = self.state.subscribe(); // cannot stop before we're done activating, so wait out until we're done activating diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 05874bdd72cf..740f9621b695 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -20,9 +20,7 @@ use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; -use crate::tenant::{ - create_tenant_files, CreateTenantFilesMode, SetStoppingError, Tenant, TenantState, -}; +use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState}; use crate::IGNORED_TENANT_FILE_NAME; use utils::completion; @@ -255,46 +253,28 @@ pub async fn shutdown_all_tenants() { tenants_clone } TenantsMap::ShuttingDown(_) => { + // TODO: it is possible that detach and shutdown happen at the same time. as a + // result, during shutdown we do not wait for detach. error!("already shutting down, this function isn't supposed to be called more than once"); return; } } }; - // Set tenant (and its timlines) to Stoppping state. - // - // Since we can only transition into Stopping state after activation is complete, - // run it in a JoinSet so all tenants have a chance to stop before we get SIGKILLed. - // - // Transitioning tenants to Stopping state has a couple of non-obvious side effects: - // 1. Lock out any new requests to the tenants. - // 2. Signal cancellation to WAL receivers (we wait on it below). - // 3. Signal cancellation for other tenant background loops. - // 4. ??? - // - // The waiting for the cancellation is not done uniformly. - // We certainly wait for WAL receivers to shut down. - // That is necessary so that no new data comes in before the freeze_and_flush. - // But the tenant background loops are joined-on in our caller. - // It's mesed up. let mut join_set = JoinSet::new(); - let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len()); for (tenant_id, tenant) in tenants_to_shut_down { join_set.spawn( async move { - match tenant.set_stopping().await { + let freeze_and_flush = true; + + match tenant.shutdown(freeze_and_flush).await { Ok(()) => debug!("tenant successfully stopped"), - Err(SetStoppingError::Broken) => { - info!("tenant is broken, so stopping failed, freeze_and_flush is likely going to make noise as well"); - }, - Err(SetStoppingError::AlreadyStopping) => { - // our task_mgr::shutdown_tasks are going to coalesce on that just fine + Err(super::ShutdownError::AlreadyStopping) => { + warn!("tenant was already shutting down") } } - - tenant } - .instrument(info_span!("set_stopping", %tenant_id)), + .instrument(info_span!("shutdown", %tenant_id)), ); } @@ -302,6 +282,7 @@ pub async fn shutdown_all_tenants() { while let Some(res) = join_set.join_next().await { match res { + Ok(()) => {} Err(join_error) if join_error.is_cancelled() => { unreachable!("we are not cancelling any of the futures"); } @@ -312,50 +293,11 @@ pub async fn shutdown_all_tenants() { Err(join_error) => { warn!("unknown kind of JoinError: {join_error}"); } - Ok(tenant) => tenants_to_freeze_and_flush.push(tenant), } } if panicked > 0 { - warn!(panicked, "observed panicks while stopping tenants"); - } - - // Shut down all existing walreceiver connections and stop accepting the new ones. 
- task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; - - // Ok, no background tasks running anymore. Flush any remaining data in - // memory to disk. - // - // We assume that any incoming connections that might request pages from - // the tenant have already been terminated by the caller, so there - // should be no more activity in any of the repositories. - // - // On error, log it but continue with the shutdown for other tenants. - - let mut join_set = tokio::task::JoinSet::new(); - - for tenant in tenants_to_freeze_and_flush { - let tenant_id = tenant.tenant_id(); - - join_set.spawn( - async move { - if let Err(err) = tenant.freeze_and_flush().await { - warn!("Could not checkpoint tenant during shutdown: {err:?}"); - } - } - .instrument(info_span!("freeze_and_flush", %tenant_id)), - ); - } - - while let Some(next) = join_set.join_next().await { - match next { - Ok(()) => {} - Err(join_error) if join_error.is_cancelled() => { - unreachable!("no cancelling") - } - Err(join_error) if join_error.is_panic() => { /* reported already */ } - Err(join_error) => warn!("unknown kind of JoinError: {join_error}"), - } + warn!(panicked, "observed panicks while shutting down tenants"); } } @@ -671,35 +613,26 @@ where // The exclusive lock here ensures we don't miss the tenant state updates before trying another removal. // tenant-wde cleanup operations may take some time (removing the entire tenant directory), we want to // avoid holding the lock for the entire process. - { - let tenants_accessor = TENANTS.write().await; - match tenants_accessor.get(&tenant_id) { - Some(tenant) => { - let tenant = Arc::clone(tenant); - // don't hold TENANTS lock while set_stopping waits for activation to finish - drop(tenants_accessor); - match tenant.set_stopping().await { - Ok(()) => { - // we won, continue stopping procedure - } - Err(SetStoppingError::Broken) => { - // continue the procedure, let's hope the closure can deal with broken tenants - } - Err(SetStoppingError::AlreadyStopping) => { - // the tenant is already stopping or broken, don't do anything - return Err(TenantStateError::IsStopping(tenant_id)); - } - } - } - None => return Err(TenantStateError::NotFound(tenant_id)), + let tenant = { + TENANTS + .write() + .await + .get(&tenant_id) + .cloned() + .ok_or(TenantStateError::NotFound(tenant_id))? + }; + + let freeze_and_flush = false; + + // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so + // that we can continue safely to cleanup. + match tenant.shutdown(freeze_and_flush).await { + Ok(()) => {} + Err(super::ShutdownError::AlreadyStopping) => { + return Err(TenantStateError::IsStopping(tenant_id)) } } - // shutdown all tenant and timeline tasks: gc, compaction, page service) - // No new tasks will be started for this tenant because it's in `Stopping` state. - // Hence, once we're done here, the `tenant_cleanup` callback can mutate tenant on-disk state freely. 
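On the spawning side of `shutdown_all_tenants` above, each tenant's shutdown future is wrapped in its own tracing span before being spawned, so interleaved log lines stay attributable to a tenant. A small sketch of that wrapping (tracing and tokio assumed; names illustrative):

```rust
use tracing::{info_span, Instrument};

// Illustrative: spawn one shutdown future per tenant, each under a span
// carrying the tenant id.
fn spawn_with_span(join_set: &mut tokio::task::JoinSet<()>, tenant_id: String) {
    join_set.spawn(
        async move {
            // ... tenant.shutdown(freeze_and_flush).await would go here ...
            tracing::debug!("tenant successfully stopped");
        }
        .instrument(info_span!("shutdown", %tenant_id)),
    );
}
```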
- task_mgr::shutdown_tasks(None, Some(tenant_id), None).await; - match tenant_cleanup .await .with_context(|| format!("Failed to run cleanup for tenant {tenant_id}")) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0c63fd126217..a810c367d808 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1689,6 +1689,9 @@ def assert_no_errors(self): else: errors.append(line) + for error in errors: + log.info(f"not allowed error: {error.strip()}") + assert not errors def log_contains(self, pattern: str) -> Optional[str]: diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index aefc8befeb4d..baef8ecacc34 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -693,15 +693,15 @@ def test_empty_branch_remote_storage_upload_on_restart( f".*POST.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing" ) - # index upload is now hitting the failpoint, should not block the shutdown - env.pageserver.stop() + # index upload is now hitting the failpoint, it should block the shutdown + env.pageserver.stop(immediate=True) timeline_path = ( Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id) ) local_metadata = env.repo_dir / timeline_path / "metadata" - assert local_metadata.is_file(), "timeout cancelled timeline branching, not the upload" + assert local_metadata.is_file() assert isinstance(env.remote_storage, LocalFsStorage) new_branch_on_remote_storage = env.remote_storage.root / timeline_path diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 1e15a8e7cb23..be79538843c0 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -271,8 +271,9 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild env.pageserver.allowed_errors.append( ".*Ignoring new state, equal to the existing one: Stopping" ) + # this happens, because the stuck timeline is visible to shutdown env.pageserver.allowed_errors.append( - ".*during shutdown: cannot flush frozen layers when flush_loop is not running, state is Exited" + ".*freeze_and_flush_on_shutdown.+: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited" ) ps_http = env.pageserver.http_client() From df3bae2ce362f285f83b88fff96cf98094b40a9a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 6 Jun 2023 09:59:36 -0400 Subject: [PATCH 54/59] Use `compute_ctl` to manage Postgres in tests. (#3886) This adds test coverage for 'compute_ctl', as it is now used by all the python tests. There are a few differences in how 'compute_ctl' is called in the tests, compared to the real web console: - In the tests, the postgresql.conf file is included as one large string in the spec file, and it is written out as it is to the data directory. I added a new field for that to the spec file. The real web console, however, sets all the necessary settings in the 'settings' field, and 'compute_ctl' creates the postgresql.conf from those settings. - In the tests, the information needed to connect to the storage, i.e. tenant_id, timeline_id, connection strings to pageserver and safekeepers, are now passed as new fields in the spec file. The real web console includes them as the GUCs in the 'settings' field. 
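On the second point, `compute_ctl` prefers the new top-level spec fields and only falls back to the legacy GUCs in `cluster.settings`. A hedged sketch of that fallback rule for one field (not the exact code):

```rust
// Illustrative: use the top-level spec value if present, otherwise dig the
// legacy GUC out of cluster.settings.
fn resolve_tenant_id(
    spec_tenant_id: Option<String>,
    settings_guc: impl Fn(&str) -> Option<String>,
) -> Result<String, String> {
    spec_tenant_id
        .or_else(|| settings_guc("neon.tenant_id"))
        .ok_or_else(|| "tenant id should be provided".to_string())
}
```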
(Both of these are different from what the test control plane used to do: It used to write the GUCs directly in the postgresql.conf file). The plan is to change the control plane to use the new method, and remove the old method, but for now, support both. Some tests that were sensitive to the amount of WAL generated needed small changes, to accommodate that compute_ctl runs the background health monitor which makes a few small updates. Also some tests shut down the pageserver, and now that the background health check can run some queries while the pageserver is down, that can produce a few extra errors in the logs, which needed to be allowlisted. Other changes: - remove obsolete comments about PostgresNode; - create standby.signal file for Static compute node; - log output of `compute_ctl` and `postgres` is merged into `endpoints/compute.log`. --------- Co-authored-by: Anastasia Lubennikova --- compute_tools/src/bin/compute_ctl.rs | 14 +- compute_tools/src/compute.rs | 70 ++- compute_tools/src/config.rs | 44 +- compute_tools/src/http/api.rs | 8 +- compute_tools/src/pg_helpers.rs | 2 +- control_plane/src/bin/neon_local.rs | 70 ++- control_plane/src/broker.rs | 6 + control_plane/src/endpoint.rs | 468 ++++++++++-------- control_plane/src/local_env.rs | 2 +- control_plane/src/pageserver.rs | 6 + control_plane/src/safekeeper.rs | 6 + libs/compute_api/src/responses.rs | 8 +- libs/compute_api/src/spec.rs | 46 +- test_runner/fixtures/neon_fixtures.py | 78 +-- test_runner/regress/test_compatibility.py | 6 +- test_runner/regress/test_compute_ctl.py | 253 ---------- test_runner/regress/test_neon_local_cli.py | 11 +- test_runner/regress/test_tenant_detach.py | 78 ++- test_runner/regress/test_tenant_size.py | 9 +- test_runner/regress/test_tenants.py | 1 + test_runner/regress/test_wal_acceptor.py | 16 +- .../regress/test_wal_acceptor_async.py | 36 +- test_runner/regress/test_wal_receiver.py | 3 +- .../test_walredo_not_left_behind_on_detach.py | 3 + 24 files changed, 634 insertions(+), 610 deletions(-) delete mode 100644 test_runner/regress/test_compute_ctl.py diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 2f515c9bf1d4..c6cfde1d1a4d 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -59,6 +59,9 @@ fn main() -> Result<()> { let matches = cli().get_matches(); + let http_port = *matches + .get_one::("http-port") + .expect("http-port is required"); let pgdata = matches .get_one::("pgdata") .expect("PGDATA path is required"); @@ -178,7 +181,8 @@ fn main() -> Result<()> { // Launch http service first, so we were able to serve control-plane // requests, while configuration is still in progress. - let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread"); + let _http_handle = + launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); if !spec_set { // No spec provided, hang waiting for it. 
@@ -286,6 +290,14 @@ fn cli() -> clap::Command { let version = option_env!("CARGO_PKG_VERSION").unwrap_or("unknown"); clap::Command::new("compute_ctl") .version(version) + .arg( + Arg::new("http-port") + .long("http-port") + .value_name("HTTP_PORT") + .default_value("3080") + .value_parser(clap::value_parser!(u16)) + .required(false), + ) .arg( Arg::new("connstr") .short('C') diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a7746629a858..617b330704d7 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,19 +1,3 @@ -// -// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`, -// but there are several things that makes `PostgresNode` usage inconvenient in the -// cloud: -// - it inherits from `LocalEnv`, which contains **all-all** the information about -// a complete service running -// - it uses `PageServerNode` with information about http endpoint, which we do not -// need in the cloud again -// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud -// -// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required -// attributes (not required for the cloud). Yet, it is still tempting to unify these -// `PostgresNode` and `ComputeNode` and use one in both places. -// -// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`. -// use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; @@ -106,26 +90,38 @@ pub struct ParsedSpec { impl TryFrom for ParsedSpec { type Error = String; fn try_from(spec: ComputeSpec) -> Result { + // Extract the options from the spec file that are needed to connect to + // the storage system. + // + // For backwards-compatibility, the top-level fields in the spec file + // may be empty. In that case, we need to dig them from the GUCs in the + // cluster.settings field. let pageserver_connstr = spec - .cluster - .settings - .find("neon.pageserver_connstring") + .pageserver_connstring + .clone() + .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring")) .ok_or("pageserver connstr should be provided")?; let storage_auth_token = spec.storage_auth_token.clone(); - let tenant_id: TenantId = spec - .cluster - .settings - .find("neon.tenant_id") - .ok_or("tenant id should be provided") - .map(|s| TenantId::from_str(&s))? - .or(Err("invalid tenant id"))?; - let timeline_id: TimelineId = spec - .cluster - .settings - .find("neon.timeline_id") - .ok_or("timeline id should be provided") - .map(|s| TimelineId::from_str(&s))? - .or(Err("invalid timeline id"))?; + let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id { + tenant_id + } else { + spec.cluster + .settings + .find("neon.tenant_id") + .ok_or("tenant id should be provided") + .map(|s| TenantId::from_str(&s))? + .or(Err("invalid tenant id"))? + }; + let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id { + timeline_id + } else { + spec.cluster + .settings + .find("neon.timeline_id") + .ok_or("timeline id should be provided") + .map(|s| TimelineId::from_str(&s))? + .or(Err("invalid timeline id"))? + }; Ok(ParsedSpec { spec, @@ -295,8 +291,8 @@ impl ComputeNode { update_pg_hba(pgdata_path)?; match spec.mode { - ComputeMode::Primary | ComputeMode::Static(..) => {} - ComputeMode::Replica => { + ComputeMode::Primary => {} + ComputeMode::Replica | ComputeMode::Static(..) 
=> { add_standby_signal(pgdata_path)?; } } @@ -376,7 +372,7 @@ impl ComputeNode { info!( "finished configuration of compute for project {}", - spec.cluster.cluster_id + spec.cluster.cluster_id.as_deref().unwrap_or("None") ); Ok(()) @@ -434,7 +430,7 @@ impl ComputeNode { let spec = compute_state.pspec.as_ref().expect("spec must be set"); info!( "starting compute for project {}, operation {}, tenant {}, timeline {}", - spec.spec.cluster.cluster_id, + spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), spec.spec.operation_uuid.as_deref().unwrap_or("None"), spec.tenant_id, spec.timeline_id, diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 1168f3876af3..99346433d062 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -5,6 +5,7 @@ use std::path::Path; use anyhow::Result; +use crate::pg_helpers::escape_conf_value; use crate::pg_helpers::PgOptionsSerialize; use compute_api::spec::{ComputeMode, ComputeSpec}; @@ -36,10 +37,44 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { // File::create() destroys the file content if it exists. let mut file = File::create(path)?; - writeln!(file, "# Managed by compute_ctl: begin")?; + // Write the postgresql.conf content from the spec file as is. + if let Some(conf) = &spec.cluster.postgresql_conf { + writeln!(file, "{}", conf)?; + } write!(file, "{}", &spec.cluster.settings.as_pg_settings())?; + // Add options for connecting to storage + writeln!(file, "# Neon storage settings")?; + if let Some(s) = &spec.pageserver_connstring { + writeln!( + file, + "neon.pageserver_connstring='{}'", + escape_conf_value(s) + )?; + } + if !spec.safekeeper_connstrings.is_empty() { + writeln!( + file, + "neon.safekeepers='{}'", + escape_conf_value(&spec.safekeeper_connstrings.join(",")) + )?; + } + if let Some(s) = &spec.tenant_id { + writeln!( + file, + "neon.tenant_id='{}'", + escape_conf_value(&s.to_string()) + )?; + } + if let Some(s) = &spec.timeline_id { + writeln!( + file, + "neon.timeline_id='{}'", + escape_conf_value(&s.to_string()) + )?; + } + match spec.mode { ComputeMode::Primary => {} ComputeMode::Static(lsn) => { @@ -53,7 +88,12 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { } } - writeln!(file, "# Managed by compute_ctl: end")?; + // If there are any extra options in the 'settings' field, append those + if spec.cluster.settings.is_some() { + writeln!(file, "# Managed by compute_ctl: begin")?; + write!(file, "{}", spec.cluster.settings.as_pg_settings())?; + writeln!(file, "# Managed by compute_ctl: end")?; + } Ok(()) } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 4468f6f5e49a..afd9c2fb5479 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -220,8 +220,8 @@ fn render_json_error(e: &str, status: StatusCode) -> Response { // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] -async fn serve(state: Arc) { - let addr = SocketAddr::from(([0, 0, 0, 0], 3080)); +async fn serve(port: u16, state: Arc) { + let addr = SocketAddr::from(([0, 0, 0, 0], port)); let make_service = make_service_fn(move |_conn| { let state = state.clone(); @@ -256,10 +256,10 @@ async fn serve(state: Arc) { } /// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`. 
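The `write_postgres_conf` change above quotes each storage GUC with `escape_conf_value`, which doubles single quotes and escapes backslashes before the value is wrapped in quotes. A tiny illustration of what one emitted line looks like (hypothetical helper, same escaping rules):

```rust
// Illustrative: building one postgresql.conf line the way the patch does.
fn conf_line(key: &str, value: &str) -> String {
    let escaped = value.replace('\'', "''").replace('\\', "\\\\");
    format!("{key}='{escaped}'")
}

// conf_line("neon.safekeepers", "127.0.0.1:5454,127.0.0.1:5455")
//   -> neon.safekeepers='127.0.0.1:5454,127.0.0.1:5455'
```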
-pub fn launch_http_server(state: &Arc) -> Result> { +pub fn launch_http_server(port: u16, state: &Arc) -> Result> { let state = Arc::clone(state); Ok(thread::Builder::new() .name("http-endpoint".into()) - .spawn(move || serve(state))?) + .spawn(move || serve(port, state))?) } diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index ed00485d5ac3..d5c845e9eaae 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -23,7 +23,7 @@ fn escape_literal(s: &str) -> String { /// Escape a string so that it can be used in postgresql.conf. /// Same as escape_literal, currently. -fn escape_conf_value(s: &str) -> String { +pub fn escape_conf_value(s: &str) -> String { s.replace('\'', "''").replace('\\', "\\\\") } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 39551642c0b6..52af936d7b7d 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -476,10 +476,11 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) - println!("Creating endpoint for imported timeline ..."); cplane.new_endpoint( - tenant_id, name, + tenant_id, timeline_id, None, + None, pg_version, ComputeMode::Primary, )?; @@ -591,7 +592,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( table.add_row([ endpoint_id.as_str(), - &endpoint.address.to_string(), + &endpoint.pg_address.to_string(), &endpoint.timeline_id.to_string(), branch_name, lsn_str.as_str(), @@ -620,8 +621,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .get_branch_timeline_id(branch_name, tenant_id) .ok_or_else(|| anyhow!("Found no timeline id for branch name '{branch_name}'"))?; - let port: Option = sub_args.get_one::("port").copied(); - + let pg_port: Option = sub_args.get_one::("pg-port").copied(); + let http_port: Option = sub_args.get_one::("http-port").copied(); let pg_version = sub_args .get_one::("pg-version") .copied() @@ -639,14 +640,38 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"), }; - cplane.new_endpoint(tenant_id, &endpoint_id, timeline_id, port, pg_version, mode)?; + cplane.new_endpoint( + &endpoint_id, + tenant_id, + timeline_id, + pg_port, + http_port, + pg_version, + mode, + )?; } "start" => { - let port: Option = sub_args.get_one::("port").copied(); + let pg_port: Option = sub_args.get_one::("pg-port").copied(); + let http_port: Option = sub_args.get_one::("http-port").copied(); let endpoint_id = sub_args .get_one::("endpoint_id") .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?; + // If --safekeepers argument is given, use only the listed safekeeper nodes. 
+ let safekeepers = + if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { + let mut safekeepers: Vec = Vec::new(); + for sk_id in safekeepers_str.split(',').map(str::trim) { + let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| { + anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list") + })?); + safekeepers.push(sk_id); + } + safekeepers + } else { + env.safekeepers.iter().map(|sk| sk.id).collect() + }; + let endpoint = cplane.endpoints.get(endpoint_id.as_str()); let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) { @@ -673,7 +698,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( _ => {} } println!("Starting existing endpoint {endpoint_id}..."); - endpoint.start(&auth_token)?; + endpoint.start(&auth_token, safekeepers)?; } else { let branch_name = sub_args .get_one::("branch-name") @@ -709,14 +734,15 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ..."); let ep = cplane.new_endpoint( - tenant_id, endpoint_id, + tenant_id, timeline_id, - port, + pg_port, + http_port, pg_version, mode, )?; - ep.start(&auth_token)?; + ep.start(&auth_token, safekeepers)?; } } "stop" => { @@ -944,11 +970,22 @@ fn cli() -> Command { .value_parser(value_parser!(u32)) .default_value(DEFAULT_PG_VERSION); - let port_arg = Arg::new("port") - .long("port") + let pg_port_arg = Arg::new("pg-port") + .long("pg-port") .required(false) .value_parser(value_parser!(u16)) - .value_name("port"); + .value_name("pg-port"); + + let http_port_arg = Arg::new("http-port") + .long("http-port") + .required(false) + .value_parser(value_parser!(u16)) + .value_name("http-port"); + + let safekeepers_arg = Arg::new("safekeepers") + .long("safekeepers") + .required(false) + .value_name("safekeepers"); let stop_mode_arg = Arg::new("stop-mode") .short('m') @@ -1093,7 +1130,8 @@ fn cli() -> Command { .arg(branch_name_arg.clone()) .arg(tenant_id_arg.clone()) .arg(lsn_arg.clone()) - .arg(port_arg.clone()) + .arg(pg_port_arg.clone()) + .arg(http_port_arg.clone()) .arg( Arg::new("config-only") .help("Don't do basebackup, create endpoint directory with only config files") @@ -1109,9 +1147,11 @@ fn cli() -> Command { .arg(branch_name_arg) .arg(timeline_id_arg) .arg(lsn_arg) - .arg(port_arg) + .arg(pg_port_arg) + .arg(http_port_arg) .arg(pg_version_arg) .arg(hot_standby_arg) + .arg(safekeepers_arg) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs index 6c0604a0765e..ad19dfa2049a 100644 --- a/control_plane/src/broker.rs +++ b/control_plane/src/broker.rs @@ -1,3 +1,9 @@ +//! Code to manage the storage broker +//! +//! In the local test environment, the data for each safekeeper is stored in +//! +//! .neon/safekeepers/ +//! use anyhow::Context; use std::path::PathBuf; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index cc5a7a416890..b28315a35deb 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -1,40 +1,71 @@ +//! Code to manage compute endpoints +//! +//! In the local test environment, the data for each endpoint is stored in +//! +//! .neon/endpoints/ +//! +//! Some basic information about the endpoint, like the tenant and timeline IDs, +//! are stored in the `endpoint.json` file. The `endpoint.json` file is created +//! when the endpoint is created, and doesn't change afterwards. +//! +//! 
The endpoint is managed by the `compute_ctl` binary. When an endpoint is +//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads +//! the basebackup from the pageserver to initialize the the data directory, and +//! finally launches the PostgreSQL process. It watches the PostgreSQL process +//! until it exits. +//! +//! When an endpoint is created, a `postgresql.conf` file is also created in +//! the endpoint's directory. The file can be modified before starting PostgreSQL. +//! However, the `postgresql.conf` file in the endpoint directory is not used directly +//! by PostgreSQL. It is passed to `compute_ctl`, and `compute_ctl` writes another +//! copy of it in the data directory. +//! +//! Directory contents: +//! +//! ```ignore +//! .neon/endpoints/main/ +//! compute.log - log output of `compute_ctl` and `postgres` +//! endpoint.json - serialized `EndpointConf` struct +//! postgresql.conf - postgresql settings +//! spec.json - passed to `compute_ctl` +//! pgdata/ +//! postgresql.conf - copy of postgresql.conf created by `compute_ctl` +//! zenith.signal +//! +//! ``` +//! use std::collections::BTreeMap; -use std::fs::{self, File}; -use std::io::Write; use std::net::SocketAddr; use std::net::TcpStream; -use std::os::unix::fs::PermissionsExt; use std::path::PathBuf; -use std::process::{Command, Stdio}; -use std::str::FromStr; +use std::process::Command; use std::sync::Arc; use std::time::Duration; -use anyhow::{Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use utils::id::{NodeId, TenantId, TimelineId}; use crate::local_env::LocalEnv; use crate::pageserver::PageServerNode; use crate::postgresql_conf::PostgresConf; -use compute_api::spec::ComputeMode; +use compute_api::responses::{ComputeState, ComputeStatus}; +use compute_api::spec::{Cluster, ComputeMode, ComputeSpec}; // contents of a endpoint.json file #[serde_as] #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct EndpointConf { - name: String, + endpoint_id: String, #[serde_as(as = "DisplayFromStr")] tenant_id: TenantId, #[serde_as(as = "DisplayFromStr")] timeline_id: TimelineId, mode: ComputeMode, - port: u16, + pg_port: u16, + http_port: u16, pg_version: u32, } @@ -57,11 +88,11 @@ impl ComputeControlPlane { let pageserver = Arc::new(PageServerNode::from_env(&env)); let mut endpoints = BTreeMap::default(); - for endpoint_dir in fs::read_dir(env.endpoints_path()) + for endpoint_dir in std::fs::read_dir(env.endpoints_path()) .with_context(|| format!("failed to list {}", env.endpoints_path().display()))? 
{ let ep = Endpoint::from_dir_entry(endpoint_dir?, &env, &pageserver)?; - endpoints.insert(ep.name.clone(), Arc::new(ep)); + endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep)); } Ok(ComputeControlPlane { @@ -76,25 +107,28 @@ impl ComputeControlPlane { 1 + self .endpoints .values() - .map(|ep| ep.address.port()) + .map(|ep| std::cmp::max(ep.pg_address.port(), ep.http_address.port())) .max() .unwrap_or(self.base_port) } + #[allow(clippy::too_many_arguments)] pub fn new_endpoint( &mut self, + endpoint_id: &str, tenant_id: TenantId, - name: &str, timeline_id: TimelineId, - port: Option, + pg_port: Option, + http_port: Option, pg_version: u32, mode: ComputeMode, ) -> Result> { - let port = port.unwrap_or_else(|| self.get_port()); - + let pg_port = pg_port.unwrap_or_else(|| self.get_port()); + let http_port = http_port.unwrap_or_else(|| self.get_port() + 1); let ep = Arc::new(Endpoint { - name: name.to_owned(), - address: SocketAddr::new("127.0.0.1".parse().unwrap(), port), + endpoint_id: endpoint_id.to_owned(), + pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port), + http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port), env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), timeline_id, @@ -102,21 +136,27 @@ impl ComputeControlPlane { tenant_id, pg_version, }); - ep.create_pgdata()?; + + ep.create_endpoint_dir()?; std::fs::write( ep.endpoint_path().join("endpoint.json"), serde_json::to_string_pretty(&EndpointConf { - name: name.to_string(), + endpoint_id: endpoint_id.to_string(), tenant_id, timeline_id, mode, - port, + http_port, + pg_port, pg_version, })?, )?; - ep.setup_pg_conf()?; + std::fs::write( + ep.endpoint_path().join("postgresql.conf"), + ep.setup_pg_conf()?.to_string(), + )?; - self.endpoints.insert(ep.name.clone(), Arc::clone(&ep)); + self.endpoints + .insert(ep.endpoint_id.clone(), Arc::clone(&ep)); Ok(ep) } @@ -127,13 +167,15 @@ impl ComputeControlPlane { #[derive(Debug)] pub struct Endpoint { /// used as the directory name - name: String, + endpoint_id: String, pub tenant_id: TenantId, pub timeline_id: TimelineId, pub mode: ComputeMode, - // port and address of the Postgres server - pub address: SocketAddr, + // port and address of the Postgres server and `compute_ctl`'s HTTP API + pub pg_address: SocketAddr, + pub http_address: SocketAddr, + // postgres major version in the format: 14, 15, etc. 
pg_version: u32, @@ -158,16 +200,16 @@ impl Endpoint { // parse data directory name let fname = entry.file_name(); - let name = fname.to_str().unwrap().to_string(); + let endpoint_id = fname.to_str().unwrap().to_string(); // Read the endpoint.json file let conf: EndpointConf = serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?; - // ok now Ok(Endpoint { - address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.port), - name, + pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port), + http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port), + endpoint_id, env: env.clone(), pageserver: Arc::clone(pageserver), timeline_id: conf.timeline_id, @@ -177,104 +219,17 @@ impl Endpoint { }) } - fn sync_safekeepers(&self, auth_token: &Option, pg_version: u32) -> Result { - let pg_path = self.env.pg_bin_dir(pg_version)?.join("postgres"); - let mut cmd = Command::new(pg_path); - - cmd.arg("--sync-safekeepers") - .env_clear() - .env( - "LD_LIBRARY_PATH", - self.env.pg_lib_dir(pg_version)?.to_str().unwrap(), - ) - .env( - "DYLD_LIBRARY_PATH", - self.env.pg_lib_dir(pg_version)?.to_str().unwrap(), - ) - .env("PGDATA", self.pgdata().to_str().unwrap()) - .stdout(Stdio::piped()) - // Comment this to avoid capturing stderr (useful if command hangs) - .stderr(Stdio::piped()); - - if let Some(token) = auth_token { - cmd.env("NEON_AUTH_TOKEN", token); - } - - let sync_handle = cmd - .spawn() - .expect("postgres --sync-safekeepers failed to start"); - - let sync_output = sync_handle - .wait_with_output() - .expect("postgres --sync-safekeepers failed"); - if !sync_output.status.success() { - anyhow::bail!( - "sync-safekeepers failed: '{}'", - String::from_utf8_lossy(&sync_output.stderr) - ); - } - - let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?; - println!("Safekeepers synced on {}", lsn); - Ok(lsn) - } - - /// Get basebackup from the pageserver as a tar archive and extract it - /// to the `self.pgdata()` directory. - fn do_basebackup(&self, lsn: Option) -> Result<()> { - println!( - "Extracting base backup to create postgres instance: path={} port={}", - self.pgdata().display(), - self.address.port() - ); - - let sql = if let Some(lsn) = lsn { - format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn) - } else { - format!("basebackup {} {}", self.tenant_id, self.timeline_id) - }; - - let mut client = self - .pageserver - .page_server_psql_client() - .context("connecting to page server failed")?; - - let copyreader = client - .copy_out(sql.as_str()) - .context("page server 'basebackup' command failed")?; - - // Read the archive directly from the `CopyOutReader` - // - // Set `ignore_zeros` so that unpack() reads all the Copy data and - // doesn't stop at the end-of-archive marker. Otherwise, if the server - // sends an Error after finishing the tarball, we will not notice it. 
- let mut ar = tar::Archive::new(copyreader); - ar.set_ignore_zeros(true); - ar.unpack(&self.pgdata()) - .context("extracting base backup failed")?; - - Ok(()) - } - - fn create_pgdata(&self) -> Result<()> { - fs::create_dir_all(self.pgdata()).with_context(|| { + fn create_endpoint_dir(&self) -> Result<()> { + std::fs::create_dir_all(self.endpoint_path()).with_context(|| { format!( - "could not create data directory {}", - self.pgdata().display() + "could not create endpoint directory {}", + self.endpoint_path().display() ) - })?; - fs::set_permissions(self.pgdata().as_path(), fs::Permissions::from_mode(0o700)) - .with_context(|| { - format!( - "could not set permissions in data directory {}", - self.pgdata().display() - ) - }) + }) } - // Write postgresql.conf with default configuration - // and PG_VERSION file to the data directory of a new endpoint. - fn setup_pg_conf(&self) -> Result<()> { + // Generate postgresql.conf with default configuration + fn setup_pg_conf(&self) -> Result { let mut conf = PostgresConf::new(); conf.append("max_wal_senders", "10"); conf.append("wal_log_hints", "off"); @@ -287,25 +242,14 @@ impl Endpoint { // wal_sender_timeout is the maximum time to wait for WAL replication. // It also defines how often the walreciever will send a feedback message to the wal sender. conf.append("wal_sender_timeout", "5s"); - conf.append("listen_addresses", &self.address.ip().to_string()); - conf.append("port", &self.address.port().to_string()); + conf.append("listen_addresses", &self.pg_address.ip().to_string()); + conf.append("port", &self.pg_address.port().to_string()); conf.append("wal_keep_size", "0"); // walproposer panics when basebackup is invalid, it is pointless to restart in this case. conf.append("restart_after_crash", "off"); - // Configure the Neon Postgres extension to fetch pages from pageserver - let pageserver_connstr = { - let config = &self.pageserver.pg_connection_config; - let (host, port) = (config.host(), config.port()); - - // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere. - format!("postgresql://no_user@{host}:{port}") - }; + // Load the 'neon' extension conf.append("shared_preload_libraries", "neon"); - conf.append_line(""); - conf.append("neon.pageserver_connstring", &pageserver_connstr); - conf.append("neon.tenant_id", &self.tenant_id.to_string()); - conf.append("neon.timeline_id", &self.timeline_id.to_string()); conf.append_line(""); // Replication-related configurations, such as WAL sending @@ -390,46 +334,11 @@ impl Endpoint { } } - let mut file = File::create(self.pgdata().join("postgresql.conf"))?; - file.write_all(conf.to_string().as_bytes())?; - - let mut file = File::create(self.pgdata().join("PG_VERSION"))?; - file.write_all(self.pg_version.to_string().as_bytes())?; - - Ok(()) - } - - fn load_basebackup(&self, auth_token: &Option) -> Result<()> { - let backup_lsn = match &self.mode { - ComputeMode::Primary => { - if !self.env.safekeepers.is_empty() { - // LSN 0 means that it is bootstrap and we need to download just - // latest data from the pageserver. That is a bit clumsy but whole bootstrap - // procedure evolves quite actively right now, so let's think about it again - // when things would be more stable (TODO). 
- let lsn = self.sync_safekeepers(auth_token, self.pg_version)?; - if lsn == Lsn(0) { - None - } else { - Some(lsn) - } - } else { - None - } - } - ComputeMode::Static(lsn) => Some(*lsn), - ComputeMode::Replica => { - None // Take the latest snapshot available to start with - } - }; - - self.do_basebackup(backup_lsn)?; - - Ok(()) + Ok(conf) } pub fn endpoint_path(&self) -> PathBuf { - self.env.endpoints_path().join(&self.name) + self.env.endpoints_path().join(&self.endpoint_id) } pub fn pgdata(&self) -> PathBuf { @@ -439,7 +348,7 @@ impl Endpoint { pub fn status(&self) -> &str { let timeout = Duration::from_millis(300); let has_pidfile = self.pgdata().join("postmaster.pid").exists(); - let can_connect = TcpStream::connect_timeout(&self.address, timeout).is_ok(); + let can_connect = TcpStream::connect_timeout(&self.pg_address, timeout).is_ok(); match (has_pidfile, can_connect) { (true, true) => "running", @@ -457,8 +366,6 @@ impl Endpoint { &[ "-D", self.pgdata().to_str().unwrap(), - "-l", - self.pgdata().join("pg.log").to_str().unwrap(), "-w", //wait till pg_ctl actually does what was asked ], args, @@ -494,36 +401,183 @@ impl Endpoint { Ok(()) } - pub fn start(&self, auth_token: &Option) -> Result<()> { + pub fn start(&self, auth_token: &Option, safekeepers: Vec) -> Result<()> { if self.status() == "running" { anyhow::bail!("The endpoint is already running"); } - // 1. We always start Postgres from scratch, so - // if old dir exists, preserve 'postgresql.conf' and drop the directory - let postgresql_conf_path = self.pgdata().join("postgresql.conf"); - let postgresql_conf = fs::read(&postgresql_conf_path).with_context(|| { - format!( - "failed to read config file in {}", - postgresql_conf_path.to_str().unwrap() - ) - })?; - fs::remove_dir_all(self.pgdata())?; - self.create_pgdata()?; + // Slurp the endpoints//postgresql.conf file into + // memory. We will include it in the spec file that we pass to + // `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf + // in the data directory. + let postgresql_conf_path = self.endpoint_path().join("postgresql.conf"); + let postgresql_conf = match std::fs::read(&postgresql_conf_path) { + Ok(content) => String::from_utf8(content)?, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(), + Err(e) => { + return Err(anyhow::Error::new(e).context(format!( + "failed to read config file in {}", + postgresql_conf_path.to_str().unwrap() + ))) + } + }; - // 2. Bring back config files - fs::write(&postgresql_conf_path, postgresql_conf)?; + // We always start the compute node from scratch, so if the Postgres + // data dir exists from a previous launch, remove it first. + if self.pgdata().exists() { + std::fs::remove_dir_all(self.pgdata())?; + } - // 3. Load basebackup - self.load_basebackup(auth_token)?; + let pageserver_connstring = { + let config = &self.pageserver.pg_connection_config; + let (host, port) = (config.host(), config.port()); + + // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere. 
+ format!("postgresql://no_user@{host}:{port}") + }; + let mut safekeeper_connstrings = Vec::new(); + if self.mode == ComputeMode::Primary { + for sk_id in safekeepers { + let sk = self + .env + .safekeepers + .iter() + .find(|node| node.id == sk_id) + .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?; + safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port)); + } + } - if self.mode != ComputeMode::Primary { - File::create(self.pgdata().join("standby.signal"))?; + // Create spec file + let spec = ComputeSpec { + format_version: 1.0, + operation_uuid: None, + cluster: Cluster { + cluster_id: None, // project ID: not used + name: None, // project name: not used + state: None, + roles: vec![], + databases: vec![], + settings: None, + postgresql_conf: Some(postgresql_conf), + }, + delta_operations: None, + tenant_id: Some(self.tenant_id), + timeline_id: Some(self.timeline_id), + mode: self.mode, + pageserver_connstring: Some(pageserver_connstring), + safekeeper_connstrings, + storage_auth_token: auth_token.clone(), + }; + let spec_path = self.endpoint_path().join("spec.json"); + std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; + + // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it. + let logfile = std::fs::OpenOptions::new() + .create(true) + .append(true) + .open(self.endpoint_path().join("compute.log"))?; + + // Launch compute_ctl + println!("Starting postgres node at '{}'", self.connstr()); + let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); + cmd.args(["--http-port", &self.http_address.port().to_string()]) + .args(["--pgdata", self.pgdata().to_str().unwrap()]) + .args(["--connstr", &self.connstr()]) + .args([ + "--spec-path", + self.endpoint_path().join("spec.json").to_str().unwrap(), + ]) + .args([ + "--pgbin", + self.env + .pg_bin_dir(self.pg_version)? + .join("postgres") + .to_str() + .unwrap(), + ]) + .stdin(std::process::Stdio::null()) + .stderr(logfile.try_clone()?) + .stdout(logfile); + let _child = cmd.spawn()?; + + // Wait for it to start + let mut attempt = 0; + const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100); + const MAX_ATTEMPTS: u32 = 10 * 30; // Wait up to 30 s + loop { + attempt += 1; + match self.get_status() { + Ok(state) => { + match state.status { + ComputeStatus::Init => { + if attempt == MAX_ATTEMPTS { + bail!("compute startup timed out; still in Init state"); + } + // keep retrying + } + ComputeStatus::Running => { + // All good! + break; + } + ComputeStatus::Failed => { + bail!( + "compute startup failed: {}", + state + .error + .as_deref() + .unwrap_or("") + ); + } + ComputeStatus::Empty + | ComputeStatus::ConfigurationPending + | ComputeStatus::Configuration => { + bail!("unexpected compute status: {:?}", state.status) + } + } + } + Err(e) => { + if attempt == MAX_ATTEMPTS { + return Err(e).context( + "timed out waiting to connect to compute_ctl HTTP; last error: {e}", + ); + } + } + } + std::thread::sleep(ATTEMPT_INTERVAL); } - // 4. 
Finally start postgres - println!("Starting postgres at '{}'", self.connstr()); - self.pg_ctl(&["start"], auth_token) + Ok(()) + } + + // Call the /status HTTP API + pub fn get_status(&self) -> Result { + let client = reqwest::blocking::Client::new(); + + let response = client + .request( + reqwest::Method::GET, + format!( + "http://{}:{}/status", + self.http_address.ip(), + self.http_address.port() + ), + ) + .send()?; + + // Interpret the response + let status = response.status(); + if !(status.is_client_error() || status.is_server_error()) { + Ok(response.json()?) + } else { + // reqwest does not export its error construction utility functions, so let's craft the message ourselves + let url = response.url().to_owned(); + let msg = match response.text() { + Ok(err_body) => format!("Error: {}", err_body), + Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url), + }; + Err(anyhow::anyhow!(msg)) + } } pub fn stop(&self, destroy: bool) -> Result<()> { @@ -540,7 +594,7 @@ impl Endpoint { "Destroying postgres data directory '{}'", self.pgdata().to_str().unwrap() ); - fs::remove_dir_all(self.endpoint_path())?; + std::fs::remove_dir_all(self.endpoint_path())?; } else { self.pg_ctl(&["stop"], &None)?; } @@ -549,10 +603,10 @@ impl Endpoint { pub fn connstr(&self) -> String { format!( - "host={} port={} user={} dbname={}", - self.address.ip(), - self.address.port(), + "postgresql://{}@{}:{}/{}", "cloud_admin", + self.pg_address.ip(), + self.pg_address.port(), "postgres" ) } diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 9286944412dd..df70cb313928 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -37,7 +37,7 @@ pub const DEFAULT_PG_VERSION: u32 = 15; #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and - // compute nodes). + // compute endpoints). // // This is not stored in the config file. Rather, this is the path where the // config file itself is. It is read from the NEON_REPO_DIR env variable or diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 400df60f0e50..2ff09021e536 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -1,3 +1,9 @@ +//! Code to manage pageservers +//! +//! In the local test environment, the pageserver stores its data directly in +//! +//! .neon/ +//! use std::borrow::Cow; use std::collections::HashMap; use std::fs::File; diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index d358f7334325..9e053ff1f19f 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -1,3 +1,9 @@ +//! Code to manage safekeepers +//! +//! In the local test environment, the data for each safekeeper is stored in +//! +//! .neon/safekeepers/ +//! 
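
The new `get_status()` helper above, together with the `Deserialize` derives added to the compute_api response types later in this patch, is what lets the control plane poll `compute_ctl` over HTTP until the compute is ready. As a rough illustration of that handshake (a sketch only, with invented names; the real loop is the one in `Endpoint::start`, and the stand-in types below only model the fields that loop needs):

    use std::time::Duration;
    use anyhow::{bail, Result};

    // Stand-in for the real compute_api response type; only the fields the
    // polling loop cares about are modelled here.
    struct StatusReply {
        status: ComputeStatus,
        error: Option<String>,
    }

    enum ComputeStatus { Init, Running, Failed }

    // Hypothetical helper: poll /status until the compute reports Running,
    // giving up after roughly 30 seconds, mirroring the loop in Endpoint::start.
    fn wait_until_running(get_status: impl Fn() -> Result<StatusReply>) -> Result<()> {
        const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
        const MAX_ATTEMPTS: u32 = 300;

        for _ in 0..MAX_ATTEMPTS {
            match get_status() {
                Ok(reply) => match reply.status {
                    ComputeStatus::Running => return Ok(()),
                    ComputeStatus::Failed => bail!(
                        "compute startup failed: {}",
                        reply.error.as_deref().unwrap_or("<unknown>")
                    ),
                    ComputeStatus::Init => {} // still starting, keep polling
                },
                Err(_) => {} // HTTP server may not be listening yet, keep polling
            }
            std::thread::sleep(ATTEMPT_INTERVAL);
        }
        bail!("compute did not reach Running within the timeout")
    }
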
use std::io::Write; use std::path::PathBuf; use std::process::Child; diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index d181c018b1be..ce73dda08ad8 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -5,13 +5,13 @@ use serde::{Deserialize, Serialize, Serializer}; use crate::spec::ComputeSpec; -#[derive(Serialize, Debug)] +#[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { pub error: String, } /// Response of the /status API -#[derive(Serialize, Debug)] +#[derive(Serialize, Debug, Deserialize)] #[serde(rename_all = "snake_case")] pub struct ComputeStatusResponse { pub start_time: DateTime, @@ -23,7 +23,7 @@ pub struct ComputeStatusResponse { pub error: Option, } -#[derive(Serialize)] +#[derive(Deserialize, Serialize)] #[serde(rename_all = "snake_case")] pub struct ComputeState { pub status: ComputeStatus, @@ -33,7 +33,7 @@ pub struct ComputeState { pub error: Option, } -#[derive(Serialize, Clone, Copy, Debug, PartialEq, Eq)] +#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)] #[serde(rename_all = "snake_case")] pub enum ComputeStatus { // Spec wasn't provided at start, waiting for it to be diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 6072980ed8bb..4014774a7ed0 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -5,6 +5,7 @@ //! and connect it to the storage nodes. use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DisplayFromStr}; +use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; /// String type alias representing Postgres identifier and @@ -14,7 +15,7 @@ pub type PgIdent = String; /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. #[serde_as] -#[derive(Clone, Debug, Default, Deserialize)] +#[derive(Clone, Debug, Default, Deserialize, Serialize)] pub struct ComputeSpec { pub format_version: f32, @@ -26,9 +27,32 @@ pub struct ComputeSpec { pub cluster: Cluster, pub delta_operations: Option>, + // Information needed to connect to the storage layer. + // + // `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed. + // + // Depending on `mode`, this can be a primary read-write node, a read-only + // replica, or a read-only node pinned at an older LSN. + // `safekeeper_connstrings` must be set for a primary. + // + // For backwards compatibility, the control plane may leave out all of + // these, and instead set the "neon.tenant_id", "neon.timeline_id", + // etc. GUCs in cluster.settings. TODO: Once the control plane has been + // updated to fill these fields, we can make these non optional. + #[serde_as(as = "Option")] + pub tenant_id: Option, + #[serde_as(as = "Option")] + pub timeline_id: Option, + #[serde_as(as = "Option")] + pub pageserver_connstring: Option, + #[serde(default)] + pub safekeeper_connstrings: Vec, + #[serde(default)] pub mode: ComputeMode, + /// If set, 'storage_auth_token' is used as the password to authenticate to + /// the pageserver and safekeepers. pub storage_auth_token: Option, } @@ -47,13 +71,19 @@ pub enum ComputeMode { Replica, } -#[derive(Clone, Debug, Default, Deserialize)] +#[derive(Clone, Debug, Default, Deserialize, Serialize)] pub struct Cluster { - pub cluster_id: String, - pub name: String, + pub cluster_id: Option, + pub name: Option, pub state: Option, pub roles: Vec, pub databases: Vec, + + /// Desired contents of 'postgresql.conf' file. 
(The 'compute_ctl' + /// tool may add additional settings to the final file.) + pub postgresql_conf: Option, + + /// Additional settings that will be appended to the 'postgresql.conf' file. pub settings: GenericOptions, } @@ -63,7 +93,7 @@ pub struct Cluster { /// - DROP ROLE /// - ALTER ROLE name RENAME TO new_name /// - ALTER DATABASE name RENAME TO new_name -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct DeltaOp { pub action: String, pub name: PgIdent, @@ -72,7 +102,7 @@ pub struct DeltaOp { /// Rust representation of Postgres role info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct Role { pub name: PgIdent, pub encrypted_password: Option, @@ -81,7 +111,7 @@ pub struct Role { /// Rust representation of Postgres database info with only those fields /// that matter for us. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct Database { pub name: PgIdent, pub owner: PgIdent, @@ -91,7 +121,7 @@ pub struct Database { /// Common type representing both SQL statement params with or without value, /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config /// options like `wal_level = logical`. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct GenericOption { pub name: String, pub value: Option, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a810c367d808..551faa116e1a 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1446,11 +1446,12 @@ def safekeeper_stop( def endpoint_create( self, branch_name: str, + pg_port: int, + http_port: int, endpoint_id: Optional[str] = None, tenant_id: Optional[TenantId] = None, hot_standby: bool = False, lsn: Optional[Lsn] = None, - port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1464,8 +1465,10 @@ def endpoint_create( ] if lsn is not None: args.extend(["--lsn", str(lsn)]) - if port is not None: - args.extend(["--port", str(port)]) + if pg_port is not None: + args.extend(["--pg-port", str(pg_port)]) + if http_port is not None: + args.extend(["--http-port", str(http_port)]) if endpoint_id is not None: args.append(endpoint_id) if hot_standby: @@ -1478,9 +1481,11 @@ def endpoint_create( def endpoint_start( self, endpoint_id: str, + pg_port: int, + http_port: int, + safekeepers: Optional[List[int]] = None, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, - port: Optional[int] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1492,8 +1497,10 @@ def endpoint_start( ] if lsn is not None: args.append(f"--lsn={lsn}") - if port is not None: - args.append(f"--port={port}") + args.extend(["--pg-port", str(pg_port)]) + args.extend(["--http-port", str(http_port)]) + if safekeepers is not None: + args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) if endpoint_id is not None: args.append(endpoint_id) @@ -2284,17 +2291,24 @@ class Endpoint(PgProtocol): """An object representing a Postgres compute endpoint managed by the control plane.""" def __init__( - self, env: NeonEnv, tenant_id: TenantId, port: int, check_stop_result: bool = True + self, + env: NeonEnv, + tenant_id: TenantId, + pg_port: int, + http_port: int, + check_stop_result: bool = True, ): - super().__init__(host="localhost", port=port, user="cloud_admin", dbname="postgres") 
+ super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") self.env = env self.running = False self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA self.tenant_id = tenant_id - self.port = port + self.pg_port = pg_port + self.http_port = http_port self.check_stop_result = check_stop_result + self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf def create( @@ -2324,7 +2338,8 @@ def create( tenant_id=self.tenant_id, lsn=lsn, hot_standby=hot_standby, - port=self.port, + pg_port=self.pg_port, + http_port=self.http_port, ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) @@ -2349,7 +2364,13 @@ def start(self) -> "Endpoint": log.info(f"Starting postgres endpoint {self.endpoint_id}") - self.env.neon_cli.endpoint_start(self.endpoint_id, tenant_id=self.tenant_id, port=self.port) + self.env.neon_cli.endpoint_start( + self.endpoint_id, + pg_port=self.pg_port, + http_port=self.http_port, + tenant_id=self.tenant_id, + safekeepers=self.active_safekeepers, + ) self.running = True return self @@ -2373,32 +2394,8 @@ def pg_twophase_dir_path(self) -> str: return os.path.join(self.pg_data_dir_path(), "pg_twophase") def config_file_path(self) -> str: - """Path to postgresql.conf""" - return os.path.join(self.pg_data_dir_path(), "postgresql.conf") - - def adjust_for_safekeepers(self, safekeepers: str) -> "Endpoint": - """ - Adjust instance config for working with wal acceptors instead of - pageserver (pre-configured by CLI) directly. - """ - - # TODO: reuse config() - with open(self.config_file_path(), "r") as f: - cfg_lines = f.readlines() - with open(self.config_file_path(), "w") as f: - for cfg_line in cfg_lines: - # walproposer uses different application_name - if ( - "synchronous_standby_names" in cfg_line - or - # don't repeat safekeepers/wal_acceptors multiple times - "neon.safekeepers" in cfg_line - ): - continue - f.write(cfg_line) - f.write("synchronous_standby_names = 'walproposer'\n") - f.write("neon.safekeepers = '{}'\n".format(safekeepers)) - return self + """Path to the postgresql.conf in the endpoint directory (not the one in pgdata)""" + return os.path.join(self.endpoint_path(), "postgresql.conf") def config(self, lines: List[str]) -> "Endpoint": """ @@ -2503,7 +2500,8 @@ def create_start( ep = Endpoint( self.env, tenant_id=tenant_id or self.env.initial_tenant, - port=self.env.port_distributor.get_port(), + pg_port=self.env.port_distributor.get_port(), + http_port=self.env.port_distributor.get_port(), ) self.num_instances += 1 self.endpoints.append(ep) @@ -2528,7 +2526,8 @@ def create( ep = Endpoint( self.env, tenant_id=tenant_id or self.env.initial_tenant, - port=self.env.port_distributor.get_port(), + pg_port=self.env.port_distributor.get_port(), + http_port=self.env.port_distributor.get_port(), ) if endpoint_id is None: @@ -2911,6 +2910,7 @@ def test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Iterator[P "pg_internal.init", "pg.log", "zenith.signal", + "pg_hba.conf", "postgresql.conf", "postmaster.opts", "postmaster.pid", diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index fe8dc293c1f2..2635dbd93c3b 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -383,6 +383,9 @@ 
def check_neon_works( cli_target = NeonCli(config_target) # And the current binaries to launch computes + snapshot_config["neon_distrib_dir"] = str(neon_current_binpath) + with (snapshot_config_toml).open("w") as f: + toml.dump(snapshot_config, f) config_current = copy.copy(config) config_current.neon_binpath = neon_current_binpath cli_current = NeonCli(config_current) @@ -391,7 +394,8 @@ def check_neon_works( request.addfinalizer(lambda: cli_target.raw_cli(["stop"])) pg_port = port_distributor.get_port() - cli_current.endpoint_start("main", port=pg_port) + http_port = port_distributor.get_port() + cli_current.endpoint_start("main", pg_port=pg_port, http_port=http_port) request.addfinalizer(lambda: cli_current.endpoint_stop("main")) connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres" diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py deleted file mode 100644 index d72ffe078d83..000000000000 --- a/test_runner/regress/test_compute_ctl.py +++ /dev/null @@ -1,253 +0,0 @@ -import os -from pathlib import Path -from subprocess import TimeoutExpired - -from fixtures.log_helper import log -from fixtures.neon_fixtures import ComputeCtl, NeonEnvBuilder, PgBin - - -# Test that compute_ctl works and prints "--sync-safekeepers" logs. -def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): - neon_env_builder.num_safekeepers = 3 - env = neon_env_builder.init_start() - ctl = ComputeCtl(env) - - env.neon_cli.create_branch("test_compute_ctl", "main") - endpoint = env.endpoints.create_start("test_compute_ctl") - endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") - - with open(endpoint.config_file_path(), "r") as f: - cfg_lines = f.readlines() - cfg_map = {} - for line in cfg_lines: - if "=" in line: - k, v = line.split("=") - cfg_map[k] = v.strip("\n '\"") - log.info(f"postgres config: {cfg_map}") - pgdata = endpoint.pg_data_dir_path() - pg_bin_path = os.path.join(pg_bin.pg_bin_path, "postgres") - - endpoint.stop_and_destroy() - - # stop_and_destroy removes the whole endpoint directory. Recreate it. 
- Path(pgdata).mkdir(parents=True) - - spec = ( - """ -{ - "format_version": 1.0, - - "timestamp": "2021-05-23T18:25:43.511Z", - "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", - - "cluster": { - "cluster_id": "test-cluster-42", - "name": "Neon Test", - "state": "restarted", - "roles": [ - ], - "databases": [ - ], - "settings": [ - { - "name": "fsync", - "value": "off", - "vartype": "bool" - }, - { - "name": "wal_level", - "value": "replica", - "vartype": "enum" - }, - { - "name": "neon.safekeepers", - "value": """ - + f'"{cfg_map["neon.safekeepers"]}"' - + """, - "vartype": "string" - }, - { - "name": "wal_log_hints", - "value": "on", - "vartype": "bool" - }, - { - "name": "log_connections", - "value": "on", - "vartype": "bool" - }, - { - "name": "shared_buffers", - "value": "32768", - "vartype": "integer" - }, - { - "name": "port", - "value": """ - + f'"{cfg_map["port"]}"' - + """, - "vartype": "integer" - }, - { - "name": "max_connections", - "value": "100", - "vartype": "integer" - }, - { - "name": "max_wal_senders", - "value": "10", - "vartype": "integer" - }, - { - "name": "listen_addresses", - "value": "0.0.0.0", - "vartype": "string" - }, - { - "name": "wal_sender_timeout", - "value": "0", - "vartype": "integer" - }, - { - "name": "password_encryption", - "value": "md5", - "vartype": "enum" - }, - { - "name": "maintenance_work_mem", - "value": "65536", - "vartype": "integer" - }, - { - "name": "max_parallel_workers", - "value": "8", - "vartype": "integer" - }, - { - "name": "max_worker_processes", - "value": "8", - "vartype": "integer" - }, - { - "name": "neon.tenant_id", - "value": """ - + f'"{cfg_map["neon.tenant_id"]}"' - + """, - "vartype": "string" - }, - { - "name": "max_replication_slots", - "value": "10", - "vartype": "integer" - }, - { - "name": "neon.timeline_id", - "value": """ - + f'"{cfg_map["neon.timeline_id"]}"' - + """, - "vartype": "string" - }, - { - "name": "shared_preload_libraries", - "value": "neon", - "vartype": "string" - }, - { - "name": "synchronous_standby_names", - "value": "walproposer", - "vartype": "string" - }, - { - "name": "neon.pageserver_connstring", - "value": """ - + f'"{cfg_map["neon.pageserver_connstring"]}"' - + """, - "vartype": "string" - } - ] - }, - "delta_operations": [ - ] -} -""" - ) - - ps_connstr = cfg_map["neon.pageserver_connstring"] - log.info(f"ps_connstr: {ps_connstr}, pgdata: {pgdata}") - - # run compute_ctl and wait for 10s - try: - ctl.raw_cli( - [ - "--connstr", - "postgres://invalid/", - "--pgdata", - pgdata, - "--spec", - spec, - "--pgbin", - pg_bin_path, - ], - timeout=10, - ) - except TimeoutExpired as exc: - ctl_logs = (exc.stderr or b"").decode("utf-8") - log.info(f"compute_ctl stderr:\n{ctl_logs}") - - with ExternalProcessManager(Path(pgdata) / "postmaster.pid"): - start = "starting safekeepers syncing" - end = "safekeepers synced at LSN" - start_pos = ctl_logs.index(start) - assert start_pos != -1 - end_pos = ctl_logs.index(end, start_pos) - assert end_pos != -1 - sync_safekeepers_logs = ctl_logs[start_pos : end_pos + len(end)] - log.info("sync_safekeepers_logs:\n" + sync_safekeepers_logs) - - # assert that --sync-safekeepers logs are present in the output - assert "connecting with node" in sync_safekeepers_logs - assert "connected with node" in sync_safekeepers_logs - assert "proposer connected to quorum (2)" in sync_safekeepers_logs - assert "got votes from majority (2)" in sync_safekeepers_logs - assert "sending elected msg to node" in sync_safekeepers_logs - - -class ExternalProcessManager: - """ - 
Context manager that kills a process with a pid file on exit. - """ - - def __init__(self, pid_file: Path): - self.path = pid_file - self.pid_file = open(pid_file, "r") - self.pid = int(self.pid_file.readline().strip()) - - def __enter__(self): - return self - - def leave_alive(self): - self.pid_file.close() - - def __exit__(self, _type, _value, _traceback): - import signal - import time - - if self.pid_file.closed: - return - - with self.pid_file: - try: - os.kill(self.pid, signal.SIGTERM) - except OSError as e: - if not self.path.is_file(): - return - log.info(f"Failed to kill {self.pid}, but the pidfile remains: {e}") - return - - for _ in range(20): - if not self.path.is_file(): - return - time.sleep(0.2) - - log.info("Process failed to stop after SIGTERM: {self.pid}") - os.kill(self.pid, signal.SIGKILL) diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index f6629c54f9b9..3314e7fbf65e 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -9,11 +9,18 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por try: env.neon_cli.start() env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True) - env.neon_cli.endpoint_start(endpoint_id="ep-main", port=port_distributor.get_port()) + + pg_port = port_distributor.get_port() + http_port = port_distributor.get_port() + env.neon_cli.endpoint_start( + endpoint_id="ep-basic-main", pg_port=pg_port, http_port=http_port + ) env.neon_cli.create_branch(new_branch_name="migration_check") + pg_port = port_distributor.get_port() + http_port = port_distributor.get_port() env.neon_cli.endpoint_start( - endpoint_id="ep-migration_check", port=port_distributor.get_port() + endpoint_id="ep-migration_check", pg_port=pg_port, http_port=http_port ) finally: env.neon_cli.stop() diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index f5e0e34bc9a4..9d0fdcfaf8eb 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -59,6 +59,13 @@ def test_tenant_reattach( # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") @@ -223,13 +230,6 @@ def test_tenant_reattach_while_busy( ) env = neon_env_builder.init_start() - # Attempts to connect from compute to pageserver while the tenant is - # temporarily detached produces these errors in the pageserver log. - env.pageserver.allowed_errors.append(".*Tenant .* not found.*") - env.pageserver.allowed_errors.append( - ".*Tenant .* will not become active\\. Current state: Stopping.*" - ) - pageserver_http = env.pageserver.http_client() # create new nenant @@ -238,6 +238,13 @@ def test_tenant_reattach_while_busy( conf={"checkpoint_distance": "100000"} ) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. 
+ env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) cur = endpoint.connect().cursor() @@ -275,6 +282,13 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() @@ -336,6 +350,13 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): # create a new tenant tenant_id, _ = env.neon_cli.create_tenant() + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() @@ -385,6 +406,13 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): # create a new tenant tenant_id, _ = env.neon_cli.create_tenant() + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + # assert tenant exists on disk assert (env.repo_dir / "tenants" / str(tenant_id)).exists() @@ -399,6 +427,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): log.info("detaching regular tenant with detach ignored flag") client.tenant_detach(tenant_id, True) + log.info("regular tenant detached without error") # check that nothing is left on disk for deleted tenant @@ -432,6 +461,13 @@ def test_detach_while_attaching( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point @@ -577,6 +613,13 @@ def test_ignored_tenant_download_missing_layers( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. 
+ env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) @@ -636,6 +679,13 @@ def test_ignored_tenant_stays_broken_without_metadata( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Broken.*" + ) + # ignore the tenant and remove its metadata pageserver_http.tenant_ignore(tenant_id) tenant_timeline_dir = env.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) @@ -672,6 +722,13 @@ def test_load_attach_negatives( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*") with pytest.raises( expected_exception=PageserverApiException, @@ -714,6 +771,13 @@ def test_ignore_while_attaching( tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0]) timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0]) + # Attempts to connect from compute to pageserver while the tenant is + # temporarily detached produces these errors in the pageserver log. + env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*") + env.pageserver.allowed_errors.append( + f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*" + ) + data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 60ab268882c0..e9dcd1e5cdee 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -318,7 +318,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa def test_single_branch_get_tenant_size_grows( - neon_env_builder: NeonEnvBuilder, test_output_dir: Path + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion ): """ Operate on single branch reading the tenants size after each transaction. @@ -333,6 +333,13 @@ def test_single_branch_get_tenant_size_grows( # that there next_gc_cutoff could be smaller than initdb_lsn, which will # obviously lead to issues when calculating the size. gc_horizon = 0x38000 + + # it's a bit of a hack, but different versions of postgres have different + # amount of WAL generated for the same amount of data. so we need to + # adjust the gc_horizon accordingly. 
+ if pg_version == PgVersion.V14: + gc_horizon = 0x40000 + neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 15712b9e5506..aef2df49321c 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -267,6 +267,7 @@ def test_pageserver_metrics_removed_after_detach( cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") cur.execute("SELECT sum(key) FROM t") assert cur.fetchone() == (5000050000,) + endpoint.stop() def get_ps_metric_samples_for_tenant(tenant_id: TenantId) -> List[Sample]: ps_metrics = env.pageserver.http_client().get_metrics() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2a4141ed30be..8b595596cb94 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1001,9 +1001,6 @@ def test_safekeeper_without_pageserver( def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder): - def safekeepers_guc(env: NeonEnv, sk_names: List[int]) -> str: - return ",".join([f"localhost:{sk.port.pg}" for sk in env.safekeepers if sk.id in sk_names]) - def execute_payload(endpoint: Endpoint): with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -1032,9 +1029,8 @@ def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_i log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() - active_safekeepers = [1, 2, 3] endpoint = env.endpoints.create("test_replace_safekeeper") - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.active_safekeepers = [1, 2, 3] endpoint.start() # learn neon timeline from compute @@ -1072,9 +1068,8 @@ def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_i log.info("Recreate postgres to replace failed sk1 with new sk4") endpoint.stop_and_destroy().create("test_replace_safekeeper") - active_safekeepers = [2, 3, 4] env.safekeepers[3].start() - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.active_safekeepers = [2, 3, 4] endpoint.start() execute_payload(endpoint) @@ -1293,9 +1288,8 @@ def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_i log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() - active_safekeepers = [1, 2, 3] endpoint = env.endpoints.create("test_pull_timeline") - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.active_safekeepers = [1, 2, 3] endpoint.start() # learn neon timeline from compute @@ -1332,10 +1326,8 @@ def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_i show_statuses(env.safekeepers, tenant_id, timeline_id) log.info("Restarting compute with new config to verify that it works") - active_safekeepers = [1, 3, 4] - endpoint.stop_and_destroy().create("test_pull_timeline") - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_safekeepers)) + endpoint.active_safekeepers = [1, 3, 4] endpoint.start() execute_payload(endpoint) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 7debeed1406e..ce33975a0e7b 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -2,9 +2,11 @@ import random import time from 
dataclasses import dataclass +from pathlib import Path from typing import List, Optional import asyncpg +import toml from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper from fixtures.types import Lsn, TenantId, TimelineId @@ -251,7 +253,8 @@ def endpoint_create_start(env: NeonEnv, branch: str, pgdir_name: Optional[str]): endpoint = Endpoint( env, tenant_id=env.initial_tenant, - port=env.port_distributor.get_port(), + pg_port=env.port_distributor.get_port(), + http_port=env.port_distributor.get_port(), # In these tests compute has high probability of terminating on its own # before our stop() due to lost consensus leadership. check_stop_result=False, @@ -536,15 +539,20 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder): # Check that pageserver can select safekeeper with largest commit_lsn # and switch if LSN is not updated for some time (NoWalTimeout). -async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint): - def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str: - # use ports 10, 11 and 12 to simulate unavailable safekeepers - return ",".join( - [ - f"localhost:{sk.port.pg if active else 10 + i}" - for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk)) - ] - ) +async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Path): + def adjust_safekeepers(env: NeonEnv, active_sk: List[bool]): + # Change the pg ports of the inactive safekeepers in the config file to be + # invalid, to make them unavailable to the endpoint. We use + # ports 10, 11 and 12 to simulate unavailable safekeepers. + config = toml.load(test_output_dir / "repo" / "config") + for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk)): + if active: + config["safekeepers"][i]["pg_port"] = env.safekeepers[i].port.pg + else: + config["safekeepers"][i]["pg_port"] = 10 + i + + with open(test_output_dir / "repo" / "config", "w") as f: + toml.dump(config, f) conn = await endpoint.connect_async() await conn.execute("CREATE TABLE t(key int primary key, value text)") @@ -565,7 +573,7 @@ def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str: it -= 1 continue - endpoint.adjust_for_safekeepers(safekeepers_guc(env, active_sk)) + adjust_safekeepers(env, active_sk) log.info(f"Iteration {it}: {active_sk}") endpoint.start() @@ -579,7 +587,7 @@ def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str: await conn.close() endpoint.stop() - endpoint.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers))) + adjust_safekeepers(env, [True] * len(env.safekeepers)) endpoint.start() conn = await endpoint.connect_async() @@ -590,11 +598,11 @@ def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str: # do inserts while restarting postgres and messing with safekeeper addresses -def test_wal_lagging(neon_env_builder: NeonEnvBuilder): +def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() env.neon_cli.create_branch("test_wal_lagging") endpoint = env.endpoints.create_start("test_wal_lagging") - asyncio.run(run_wal_lagging(env, endpoint)) + asyncio.run(run_wal_lagging(env, endpoint, test_output_dir)) diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 8e4e154be1fd..515d47c079ee 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -77,7 +77,8 @@ def 
test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil try: trigger_wait_lsn_timeout(env, tenant_id) except Exception as e: - exception_string = str(e) + # Strip out the part before stdout, as it contains full command with the list of all safekeepers + exception_string = str(e).split("stdout", 1)[-1] assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" for safekeeper in env.safekeepers: diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py index 7d944bebb3cb..4a478989356a 100644 --- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py +++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py @@ -83,6 +83,9 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder): # XXX this is quite brittle as the lifecycle of the WAL redo process is an implementation detail assert_child_processes(pagserver_pid, wal_redo_present=True, defunct_present=False) + # Stop the compute before detaching, to avoid errors in the log. + endpoint.stop() + last_error = None for i in range(3): try: From dc6a3828731b787f71cf172e5276ed341c6489c8 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 6 Jun 2023 15:16:54 +0400 Subject: [PATCH 55/59] Increase timeouts on compute -> sk connections. context: https://github.com/neondatabase/neon/issues/4414 And improve messages/comments here and there. --- pgxn/neon/walproposer.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index a99be40955c5..64d980d2e40c 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -254,20 +254,20 @@ nwp_register_gucs(void) DefineCustomIntVariable( "neon.safekeeper_reconnect_timeout", - "Timeout for reconnecting to offline wal acceptor.", + "Walproposer reconnects to offline safekeepers once in this interval.", NULL, &wal_acceptor_reconnect_timeout, - 1000, 0, INT_MAX, /* default, min, max */ + 5000, 0, INT_MAX, /* default, min, max */ PGC_SIGHUP, /* context */ GUC_UNIT_MS, /* flags */ NULL, NULL, NULL); DefineCustomIntVariable( "neon.safekeeper_connect_timeout", - "Timeout for connection establishement and it's maintenance against safekeeper", + "Connection or connection attempt to safekeeper is terminated if no message is received (or connection attempt doesn't finish) within this period.", NULL, &wal_acceptor_connection_timeout, - 5000, 0, INT_MAX, + 10000, 0, INT_MAX, PGC_SIGHUP, GUC_UNIT_MS, NULL, NULL, NULL); @@ -441,7 +441,7 @@ WalProposerPoll(void) if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, wal_acceptor_connection_timeout)) { - elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms", + elog(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout); ShutdownConnection(sk); } @@ -1035,9 +1035,16 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->greetResponse)) return; + elog(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); + /* Protocol is all good, move to voting. 
*/ sk->state = SS_VOTING; + /* + * Note: it would be better to track the counter on per safekeeper basis, + * but at worst walproposer would restart with 'term rejected', so leave as + * is for now. + */ ++n_connected; if (n_connected <= quorum) { From c058e1cec2df70505263b19ad4e1f337d9643285 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 6 Jun 2023 15:27:17 +0400 Subject: [PATCH 56/59] Quick exit in truncate_wal if nothing to do. ref https://github.com/neondatabase/neon/issues/4414 --- safekeeper/src/wal_storage.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 1b82bd754e56..644c956fc1c9 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -379,6 +379,12 @@ impl Storage for PhysicalStorage { ); } + // Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on + // disk (this happens on each connect). + if end_pos == self.write_lsn { + return Ok(()); + } + // Close previously opened file, if any if let Some(mut unflushed_file) = self.file.take() { self.fdatasync_file(&mut unflushed_file)?; From 6b3c020cd90aab8e1e8c015ece39ebf8b6898da2 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 6 Jun 2023 15:30:26 +0400 Subject: [PATCH 57/59] Don't warn on system id = 0 in walproposer greeting. sync-safekeepers doesn't know it and sends 0. --- safekeeper/src/safekeeper.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 33da0c8e5a1a..eb434136d424 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -634,7 +634,8 @@ where } // system_id will be updated on mismatch - if self.state.server.system_id != msg.system_id { + // sync-safekeepers doesn't know sysid and sends 0, ignore it + if self.state.server.system_id != msg.system_id && msg.system_id != 0 { if self.state.server.system_id != 0 { warn!( "unexpected system ID arrived, got {}, expected {}", From 88f0cfc5755cd226c7cfd40440cb03130a28b432 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 7 Jun 2023 11:41:53 +0200 Subject: [PATCH 58/59] Fix `pgx_ulid` extension (#4431) The issue was in the wrong `control` file name --- Dockerfile.compute-node | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index f8429e72b868..44e13a6c7309 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -531,7 +531,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz - mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ sed -i 's/pgx = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgx install --release && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/pgx_ulid.control + echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control ######################################################################################### # From 5761190e0d61618ab805dd3b6dff3ef7fb768aff Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 7 Jun 2023 14:29:23 +0300 Subject: [PATCH 59/59] feat: three phased startup order (#4399) Initial logical size calculation could still hinder our fast startup efforts in #4397. See #4183. In deployment of 2023-06-06 about a 200 initial logical sizes were calculated on hosts which took the longest to complete initial load (12s). 
Implements the three step/tier initialization ordering described in #4397: 1. load local tenants 2. do initial logical sizes per walreceivers for 10s 3. background tasks Ordering is controlled by: - waiting on `utils::completion::Barrier`s on background tasks - having one attempt for each Timeline to do initial logical size calculation - `pageserver/src/bin/pageserver.rs` releasing background jobs after timeout or completion of initial logical size calculation The timeout is there just to safeguard in case a legitimate non-broken timeline initial logical size calculation goes long. The timeout is configurable, by default 10s, which I think would be fine for production systems. In the test cases I've been looking at, it seems that these steps are completed as fast as possible. Co-authored-by: Christian Schwarz --- pageserver/src/bin/pageserver.rs | 119 ++++++++++++++++-- pageserver/src/config.rs | 34 ++++- pageserver/src/disk_usage_eviction_task.rs | 19 ++- pageserver/src/lib.rs | 23 ++++ pageserver/src/tenant.rs | 58 +++++---- pageserver/src/tenant/mgr.rs | 13 +- pageserver/src/tenant/tasks.rs | 34 ++--- pageserver/src/tenant/timeline.rs | 46 ++++++- .../src/tenant/timeline/eviction_task.rs | 16 ++- .../regress/test_disk_usage_eviction.py | 6 + 10 files changed, 293 insertions(+), 75 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index e0731ba79b2b..1fa5e4ab3bb8 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -337,33 +337,114 @@ fn start_pageserver( // Startup staging or optimizing: // - // (init_done_tx, init_done_rx) are used to control when do background loops start. This is to - // avoid starving out the BACKGROUND_RUNTIME async worker threads doing heavy work, like - // initial repartitioning while we still have Loading tenants. + // We want to minimize downtime for `page_service` connections, and trying not to overload + // BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time. // - // init_done_rx is a barrier which stops waiting once all init_done_tx clones are dropped. + // init_done_rx will notify when all initial load operations have completed. + // + // background_jobs_can_start (same name used to hold off background jobs from starting at + // consumer side) will be dropped once we can start the background jobs. Currently it is behind + // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout + // (background_task_maximum_delay). 
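
The comment block above relies on the semantics of `utils::completion::channel()`, a Neon-internal helper used throughout this patch: it returns a completion/barrier pair, the barrier side can be cloned and awaited, and `wait()` only returns once every clone of the completion side has been dropped. A minimal sketch of that pattern, assuming exactly the API surface used in this patch:

    // Sketch only: shows how dropping the completion side releases waiters.
    async fn startup_ordering_sketch() {
        let (init_done_tx, init_done_rx) = utils::completion::channel();

        // A waiter holds a (cheaply clonable) barrier and parks on it.
        let waiter = tokio::spawn({
            let barrier = init_done_rx.clone();
            async move {
                barrier.wait().await; // returns once all completions are dropped
                // ... start the next startup phase here ...
            }
        });

        // The loading task holds the completion for as long as the work runs;
        // dropping it (explicitly or by going out of scope) unblocks the waiter.
        do_initial_load().await;
        drop(init_done_tx);

        waiter.await.unwrap();
    }

    async fn do_initial_load() { /* placeholder for tenant loading */ }
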
let (init_done_tx, init_done_rx) = utils::completion::channel(); + let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel(); + + let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel(); + + let order = pageserver::InitializationOrder { + initial_tenant_load: Some(init_done_tx), + initial_logical_size_can_start: init_done_rx.clone(), + initial_logical_size_attempt: init_logical_size_done_tx, + background_jobs_can_start: background_jobs_barrier.clone(), + }; + // Scan the local 'tenants/' directory and start loading the tenants let init_started_at = std::time::Instant::now(); + let shutdown_pageserver = tokio_util::sync::CancellationToken::new(); + BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( conf, broker_client.clone(), remote_storage.clone(), - (init_done_tx, init_done_rx.clone()), + order, ))?; BACKGROUND_RUNTIME.spawn({ - let init_done_rx = init_done_rx.clone(); - async move { + let init_done_rx = init_done_rx; + let shutdown_pageserver = shutdown_pageserver.clone(); + let drive_init = async move { + // NOTE: unlike many futures in pageserver, this one is cancellation-safe + let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed")); + init_done_rx.wait().await; + // initial logical sizes can now start, as they were waiting on init_done_rx. - let elapsed = init_started_at.elapsed(); + scopeguard::ScopeGuard::into_inner(guard); + + let init_done = std::time::Instant::now(); + let elapsed = init_done - init_started_at; tracing::info!( elapsed_millis = elapsed.as_millis(), - "Initial load completed." + "Initial load completed" ); + + let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait()); + + let timeout = conf.background_task_maximum_delay; + + let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed")); + + let init_sizes_done = tokio::select! { + _ = &mut init_sizes_done => { + let now = std::time::Instant::now(); + tracing::info!( + from_init_done_millis = (now - init_done).as_millis(), + from_init_millis = (now - init_started_at).as_millis(), + "Initial logical sizes completed" + ); + None + } + _ = tokio::time::sleep(timeout) => { + tracing::info!( + timeout_millis = timeout.as_millis(), + "Initial logical size timeout elapsed; starting background jobs" + ); + Some(init_sizes_done) + } + }; + + scopeguard::ScopeGuard::into_inner(guard); + + // allow background jobs to start + drop(background_jobs_can_start); + + if let Some(init_sizes_done) = init_sizes_done { + // ending up here is not a bug; at the latest logical sizes will be queried by + // consumption metrics. + let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed")); + init_sizes_done.await; + + scopeguard::ScopeGuard::into_inner(guard); + + let now = std::time::Instant::now(); + tracing::info!( + from_init_done_millis = (now - init_done).as_millis(), + from_init_millis = (now - init_started_at).as_millis(), + "Initial logical sizes completed after timeout (background jobs already started)" + ); + + } + }; + + async move { + let mut drive_init = std::pin::pin!(drive_init); + // just race these tasks + tokio::select! 
{ + _ = shutdown_pageserver.cancelled() => {}, + _ = &mut drive_init => {}, + } } }); @@ -378,7 +459,7 @@ fn start_pageserver( conf, remote_storage.clone(), disk_usage_eviction_state.clone(), - init_done_rx.clone(), + background_jobs_barrier.clone(), )?; } @@ -416,7 +497,7 @@ fn start_pageserver( ); if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { - let init_done_rx = init_done_rx; + let background_jobs_barrier = background_jobs_barrier; let metrics_ctx = RequestContext::todo_child( TaskKind::MetricsCollection, // This task itself shouldn't download anything. @@ -432,12 +513,17 @@ fn start_pageserver( "consumption metrics collection", true, async move { - // first wait for initial load to complete before first iteration. + // first wait until background jobs are cleared to launch. // // this is because we only process active tenants and timelines, and the // Timeline::get_current_logical_size will spawn the logical size calculation, // which will not be rate-limited. - init_done_rx.wait().await; + let cancel = task_mgr::shutdown_token(); + + tokio::select! { + _ = cancel.cancelled() => { return Ok(()); }, + _ = background_jobs_barrier.wait() => {} + }; pageserver::consumption_metrics::collect_metrics( metric_collection_endpoint, @@ -487,6 +573,8 @@ fn start_pageserver( ); } + let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); + // All started up! Now just sit and wait for shutdown signal. ShutdownSignals::handle(|signal| match signal { Signal::Quit => { @@ -502,6 +590,11 @@ fn start_pageserver( "Got {}. Terminating gracefully in fast shutdown mode", signal.name() ); + + // This cancels the `shutdown_pageserver` cancellation tree. + // Right now that tree doesn't reach very far, and `task_mgr` is used instead. + // The plan is to change that over time. + shutdown_pageserver.take(); BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0)); unreachable!() } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 02763c9b7d7b..17e6e3fb2a70 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -63,6 +63,7 @@ pub mod defaults { pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour"; pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; + pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; /// /// Default built-in configuration file. @@ -91,9 +92,10 @@ pub mod defaults { #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}' #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' - #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} +#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -187,6 +189,15 @@ pub struct PageServerConf { pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, + + /// How long will background tasks be delayed at most after initial load of tenants. + /// + /// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works + /// as we now isolate initial loading, initial logical size calculation and background tasks. + /// Smaller nodes will have background tasks "not running" for this long unless every timeline + /// has it's initial logical size calculated. 
Not running background tasks for some seconds is + /// not terrible. + pub background_task_maximum_delay: Duration, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -259,6 +270,8 @@ struct PageServerConfigBuilder { test_remote_failures: BuilderValue, ondemand_download_behavior_treat_error_as_warn: BuilderValue, + + background_task_maximum_delay: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -316,6 +329,11 @@ impl Default for PageServerConfigBuilder { test_remote_failures: Set(0), ondemand_download_behavior_treat_error_as_warn: Set(false), + + background_task_maximum_delay: Set(humantime::parse_duration( + DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, + ) + .unwrap()), } } } @@ -440,6 +458,10 @@ impl PageServerConfigBuilder { BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); } + pub fn background_task_maximum_delay(&mut self, delay: Duration) { + self.background_task_maximum_delay = BuilderValue::Set(delay); + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_size_logical_size_queries = self .concurrent_tenant_size_logical_size_queries @@ -522,6 +544,9 @@ impl PageServerConfigBuilder { .ok_or(anyhow!( "missing ondemand_download_behavior_treat_error_as_warn" ))?, + background_task_maximum_delay: self + .background_task_maximum_delay + .ok_or(anyhow!("missing background_task_maximum_delay"))?, }) } } @@ -710,6 +735,7 @@ impl PageServerConf { ) }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), + "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -877,6 +903,7 @@ impl PageServerConf { disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, + background_task_maximum_delay: Duration::ZERO, } } } @@ -1036,6 +1063,7 @@ metric_collection_endpoint = 'http://localhost:80/metrics' synthetic_size_calculation_interval = '333 s' log_format = 'json' +background_task_maximum_delay = '334 s' "#; @@ -1094,6 +1122,9 @@ log_format = 'json' disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, + background_task_maximum_delay: humantime::parse_duration( + defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY + )?, }, "Correct defaults should be used when no config values are provided" ); @@ -1148,6 +1179,7 @@ log_format = 'json' disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, + background_task_maximum_delay: Duration::from_secs(334), }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 1a8886935c8b..ce5f81c44bf7 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -83,7 +83,7 @@ pub fn launch_disk_usage_global_eviction_task( conf: &'static PageServerConf, storage: GenericRemoteStorage, state: Arc, - init_done: completion::Barrier, + background_jobs_barrier: completion::Barrier, ) -> anyhow::Result<()> { let Some(task_config) = &conf.disk_usage_based_eviction else { info!("disk usage based eviction task not configured"); @@ -100,17 +100,16 @@ pub fn launch_disk_usage_global_eviction_task( "disk usage based eviction", false, async move { + let cancel = task_mgr::shutdown_token(); + // 
wait until initial load is complete, because we cannot evict from loading tenants. - init_done.wait().await; + tokio::select! { + _ = cancel.cancelled() => { return Ok(()); }, + _ = background_jobs_barrier.wait() => { } + }; - disk_usage_eviction_task( - &state, - task_config, - storage, - &conf.tenants_path(), - task_mgr::shutdown_token(), - ) - .await; + disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel) + .await; info!("disk usage based eviction task finishing"); Ok(()) }, diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 40a672bee3fb..5831091098bd 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -132,6 +132,29 @@ pub fn is_uninit_mark(path: &Path) -> bool { } } +/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by +/// blocking. +/// +/// The instances of this value exist only during startup, otherwise `None` is provided, meaning no +/// delaying is needed. +#[derive(Clone)] +pub struct InitializationOrder { + /// Each initial tenant load task carries this until completion. + pub initial_tenant_load: Option, + + /// Barrier for when we can start initial logical size calculations. + pub initial_logical_size_can_start: utils::completion::Barrier, + + /// Each timeline owns a clone of this to be consumed on the initial logical size calculation + /// attempt. It is important to drop this once the attempt has completed. + pub initial_logical_size_attempt: utils::completion::Completion, + + /// Barrier for when we can start any background jobs. + /// + /// This can be broken up later on, but right now there is just one class of a background job. + pub background_jobs_can_start: utils::completion::Barrier, +} + #[cfg(test)] mod backoff_defaults_tests { use super::*; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7ce0ed81bc31..29086cae86f2 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -65,6 +65,7 @@ use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; +use crate::InitializationOrder; use crate::virtual_file::VirtualFile; use crate::walredo::PostgresRedoManager; @@ -510,6 +511,7 @@ impl Tenant { local_metadata: Option, ancestor: Option>, first_save: bool, + init_order: Option<&InitializationOrder>, ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_id; @@ -535,6 +537,7 @@ impl Tenant { up_to_date_metadata, ancestor.clone(), remote_client, + init_order, )?; let timeline = UninitializedTimeline { @@ -560,6 +563,7 @@ impl Tenant { up_to_date_metadata, ancestor.clone(), None, + None, ) .with_context(|| { format!("creating broken timeline data for {tenant_id}/{timeline_id}") @@ -858,6 +862,7 @@ impl Tenant { local_metadata, ancestor, true, + None, ctx, ) .await @@ -892,16 +897,13 @@ impl Tenant { /// /// If the loading fails for some reason, the Tenant will go into Broken /// state. - /// - /// `init_done` is an optional channel used during initial load to delay background task - /// start. It is not used later. 
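
`InitializationOrder` is essentially a bundle of those completion/barrier handles, one per startup phase. The intended consumption pattern (a sketch, not the actual tenant-loading code; `run_initial_load` is invented) is for each startup task to take its completion, hold it for the duration of the work, and let it drop when the task finishes, which is what releases the waiters in pageserver.rs:

    // Sketch: how a per-tenant load task would consume its slice of
    // InitializationOrder. Dropping `_completion` at the end of the task is
    // what lets pageserver.rs observe "initial load completed".
    async fn load_one_tenant(mut init_order: Option<pageserver::InitializationOrder>) {
        // Take the completion (only present during pageserver startup).
        let _completion = init_order
            .as_mut()
            .and_then(|order| order.initial_tenant_load.take());

        run_initial_load().await;

        // `_completion` is dropped here; once every tenant's clone is gone,
        // the corresponding barrier in pageserver.rs stops waiting.
    }

    async fn run_initial_load() { /* placeholder for the actual load */ }
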
     #[instrument(skip_all, fields(tenant_id=%tenant_id))]
     pub fn spawn_load(
         conf: &'static PageServerConf,
         tenant_id: TenantId,
         broker_client: storage_broker::BrokerClientChannel,
         remote_storage: Option<GenericRemoteStorage>,
-        init_done: Option<(completion::Completion, completion::Barrier)>,
+        init_order: Option<InitializationOrder>,
         ctx: &RequestContext,
     ) -> Arc<Tenant> {
         debug_assert_current_span_has_tenant_id();
@@ -937,17 +939,17 @@ impl Tenant {
             "initial tenant load",
             false,
             async move {
-                // keep the sender alive as long as we have the initial load ongoing; it will be
-                // None for loads spawned after init_tenant_mgr.
-                let (_tx, rx) = if let Some((tx, rx)) = init_done {
-                    (Some(tx), Some(rx))
-                } else {
-                    (None, None)
-                };
-                match tenant_clone.load(&ctx).await {
+                let mut init_order = init_order;
+
+                // take the completion because initial tenant loading will complete when all of
+                // these tasks complete.
+                let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take());
+
+                match tenant_clone.load(init_order.as_ref(), &ctx).await {
                     Ok(()) => {
                         debug!("load finished, activating");
-                        tenant_clone.activate(broker_client, rx.as_ref(), &ctx);
+                        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
+                        tenant_clone.activate(broker_client, background_jobs_can_start, &ctx);
                     }
                     Err(err) => {
                         error!("load failed, setting tenant state to Broken: {err:?}");
@@ -974,7 +976,11 @@ impl Tenant {
     /// files on disk. Used at pageserver startup.
     ///
     /// No background tasks are started as part of this routine.
-    async fn load(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
+    async fn load(
+        self: &Arc<Self>,
+        init_order: Option<&InitializationOrder>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
         debug_assert_current_span_has_tenant_id();
 
         debug!("loading tenant task");
@@ -1094,7 +1100,7 @@ impl Tenant {
         // 1. "Timeline has no ancestor and no layer files"
 
         for (timeline_id, local_metadata) in sorted_timelines {
-            self.load_local_timeline(timeline_id, local_metadata, ctx)
+            self.load_local_timeline(timeline_id, local_metadata, init_order, ctx)
                 .await
                 .with_context(|| format!("load local timeline {timeline_id}"))?;
         }
@@ -1112,6 +1118,7 @@ impl Tenant {
         &self,
         timeline_id: TimelineId,
         local_metadata: TimelineMetadata,
+        init_order: Option<&InitializationOrder>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         debug_assert_current_span_has_tenant_id();
@@ -1181,6 +1188,7 @@ impl Tenant {
             Some(local_metadata),
             ancestor,
             false,
+            init_order,
             ctx,
         )
         .await
@@ -1724,12 +1732,12 @@ impl Tenant {
     /// Changes tenant status to active, unless shutdown was already requested.
     ///
-    /// `init_done` is an optional channel used during initial load to delay background task
-    /// start. It is not used later.
+    /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
+    /// to delay background jobs. Background jobs can be started right away when None is given.
     fn activate(
         self: &Arc<Self>,
         broker_client: BrokerClientChannel,
-        init_done: Option<&completion::Barrier>,
+        background_jobs_can_start: Option<&completion::Barrier>,
         ctx: &RequestContext,
     ) {
         debug_assert_current_span_has_tenant_id();
@@ -1762,12 +1770,12 @@ impl Tenant {
 
         // Spawn gc and compaction loops. The loops will shut themselves
         // down when they notice that the tenant is inactive.
-        tasks::start_background_loops(self, init_done);
+        tasks::start_background_loops(self, background_jobs_can_start);
 
         let mut activated_timelines = 0;
 
         for timeline in not_broken_timelines {
-            timeline.activate(broker_client.clone(), init_done, ctx);
+            timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
             activated_timelines += 1;
         }
 
@@ -2158,6 +2166,7 @@ impl Tenant {
         new_metadata: &TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
         remote_client: Option<RemoteTimelineClient>,
+        init_order: Option<&InitializationOrder>,
     ) -> anyhow::Result<Arc<Timeline>> {
         if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() {
             anyhow::ensure!(
             )
         }
 
+        let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start);
+        let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt);
+
         let pg_version = new_metadata.pg_version();
         Ok(Timeline::new(
             self.conf,
@@ -2177,6 +2189,8 @@ impl Tenant {
             Arc::clone(&self.walredo_mgr),
             remote_client,
             pg_version,
+            initial_logical_size_can_start.cloned(),
+            initial_logical_size_attempt.cloned(),
         ))
     }
 
@@ -2852,7 +2866,7 @@ impl Tenant {
         remote_client: Option<RemoteTimelineClient>,
     ) -> anyhow::Result<Arc<Timeline>> {
         let timeline_data = self
-            .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client)
+            .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client, None)
            .context("Failed to create timeline data structure")?;
 
         crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?;
@@ -3420,7 +3434,7 @@ pub mod harness {
             timelines_to_load.insert(timeline_id, timeline_metadata);
         }
         tenant
-            .load(ctx)
+            .load(None, ctx)
             .instrument(info_span!("try_load", tenant_id=%self.tenant_id))
             .await?;
         tenant.state.send_replace(TenantState::Active);
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 740f9621b695..a1638e4a95d4 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -21,9 +21,8 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
-use crate::IGNORED_TENANT_FILE_NAME;
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
 
-use utils::completion;
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};
 
@@ -65,7 +64,7 @@ pub async fn init_tenant_mgr(
     conf: &'static PageServerConf,
     broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
-    init_done: (completion::Completion, completion::Barrier),
+    init_order: InitializationOrder,
 ) -> anyhow::Result<()> {
     // Scan local filesystem for attached tenants
     let tenants_dir = conf.tenants_path();
@@ -122,7 +121,7 @@ pub async fn init_tenant_mgr(
             &tenant_dir_path,
             broker_client.clone(),
             remote_storage.clone(),
-            Some(init_done.clone()),
+            Some(init_order.clone()),
             &ctx,
         ) {
             Ok(tenant) => {
@@ -153,14 +152,12 @@ pub async fn init_tenant_mgr(
     Ok(())
 }
 
-/// `init_done` is an optional channel used during initial load to delay background task
-/// start. It is not used later.
 pub fn schedule_local_tenant_processing(
     conf: &'static PageServerConf,
     tenant_path: &Path,
     broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
-    init_done: Option<(completion::Completion, completion::Barrier)>,
+    init_order: Option<InitializationOrder>,
     ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
     anyhow::ensure!(
@@ -219,7 +216,7 @@ pub fn schedule_local_tenant_processing(
             tenant_id,
             broker_client,
             remote_storage,
-            init_done,
+            init_order,
             ctx,
         )
     };
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 1bbc1b1c084a..360818b5a7b8 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -15,10 +15,10 @@ use tracing::*;
 use utils::completion;
 
 /// Start per tenant background loops: compaction and gc.
-///
-/// `init_done` is an optional channel used during initial load to delay background task
-/// start. It is not used later.
-pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completion::Barrier>) {
+pub fn start_background_loops(
+    tenant: &Arc<Tenant>,
+    background_jobs_can_start: Option<&completion::Barrier>,
+) {
     let tenant_id = tenant.tenant_id;
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
@@ -29,10 +29,14 @@ pub fn start_background_loops(tenant: &Arc, init_done: Option<&completio
         false,
         {
             let tenant = Arc::clone(tenant);
-            let init_done = init_done.cloned();
+            let background_jobs_can_start = background_jobs_can_start.cloned();
             async move {
-                completion::Barrier::maybe_wait(init_done).await;
-                compaction_loop(tenant)
+                let cancel = task_mgr::shutdown_token();
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()) },
+                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                };
+                compaction_loop(tenant, cancel)
                     .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
                     .await;
                 Ok(())
@@ -48,10 +52,14 @@ pub fn start_background_loops(tenant: &Arc, init_done: Option<&completio
         false,
         {
             let tenant = Arc::clone(tenant);
-            let init_done = init_done.cloned();
+            let background_jobs_can_start = background_jobs_can_start.cloned();
             async move {
-                completion::Barrier::maybe_wait(init_done).await;
-                gc_loop(tenant)
+                let cancel = task_mgr::shutdown_token();
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()) },
+                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                };
+                gc_loop(tenant, cancel)
                     .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
                     .await;
                 Ok(())
@@ -63,12 +71,11 @@ pub fn start_background_loops(tenant: &Arc, init_done: Option<&completio
 ///
 /// Compaction task's main loop
 ///
-async fn compaction_loop(tenant: Arc<Tenant>) {
+async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     let wait_duration = Duration::from_secs(2);
     info!("starting");
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
-        let cancel = task_mgr::shutdown_token();
         let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
         let mut first = true;
         loop {
@@ -133,12 +140,11 @@ async fn compaction_loop(tenant: Arc) {
 ///
 /// GC task's main loop
 ///
-async fn gc_loop(tenant: Arc<Tenant>) {
+async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     let wait_duration = Duration::from_secs(2);
     info!("starting");
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
-        let cancel = task_mgr::shutdown_token();
         // GC might require downloading, to find the cutoff LSN that corresponds to the
         // cutoff specified as time.
         let ctx =
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index fdaad58e1602..507f0de4f3c1 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -242,6 +242,13 @@ pub struct Timeline {
     pub delete_lock: tokio::sync::Mutex<bool>,
 
     eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
+
+    /// Barrier to wait before doing initial logical size calculation. Used only during startup.
+    initial_logical_size_can_start: Option<completion::Barrier>,
+
+    /// Completion shared between all timelines loaded during startup; used to delay heavier
+    /// background tasks until some logical sizes have been calculated.
+    initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
 }
 
 /// Internal structure to hold all data needed for logical size calculation.
@@ -932,12 +939,12 @@ impl Timeline {
     pub fn activate(
         self: &Arc<Self>,
         broker_client: BrokerClientChannel,
-        init_done: Option<&completion::Barrier>,
+        background_jobs_can_start: Option<&completion::Barrier>,
         ctx: &RequestContext,
     ) {
         self.launch_wal_receiver(ctx, broker_client);
         self.set_state(TimelineState::Active);
-        self.launch_eviction_task(init_done);
+        self.launch_eviction_task(background_jobs_can_start);
     }
 
     pub fn set_state(&self, new_state: TimelineState) {
@@ -955,6 +962,14 @@ impl Timeline {
                 error!("Not activating a Stopping timeline");
             }
             (_, new_state) => {
+                if matches!(new_state, TimelineState::Stopping | TimelineState::Broken) {
+                    // drop the completion guard, if any; it might be holding off the completion
+                    // forever needlessly
+                    self.initial_logical_size_attempt
+                        .lock()
+                        .unwrap_or_else(|e| e.into_inner())
+                        .take();
+                }
                 self.state.send_replace(new_state);
             }
         }
@@ -1345,6 +1360,8 @@ impl Timeline {
         walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
         remote_client: Option<RemoteTimelineClient>,
         pg_version: u32,
+        initial_logical_size_can_start: Option<completion::Barrier>,
+        initial_logical_size_attempt: Option<completion::Completion>,
     ) -> Arc<Self> {
         let disk_consistent_lsn = metadata.disk_consistent_lsn();
         let (state, _) = watch::channel(TimelineState::Loading);
@@ -1439,6 +1456,9 @@ impl Timeline {
                 EvictionTaskTimelineState::default(),
             ),
             delete_lock: tokio::sync::Mutex::new(false),
+
+            initial_logical_size_can_start,
+            initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
         };
         result.repartition_threshold = result.get_checkpoint_distance() / 10;
         result
@@ -1927,7 +1947,27 @@ impl Timeline {
             false,
             // NB: don't log errors here, task_mgr will do that.
             async move {
-                // no cancellation here, because nothing really waits for this to complete compared
+
+                let cancel = task_mgr::shutdown_token();
+
+                // in case we were created during pageserver initialization, wait for
+                // initialization to complete before proceeding. startup time init runs on the same
+                // runtime.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); },
+                    _ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
+                };
+
+                // hold off background tasks from starting until all timelines get to try at least
+                // once initial logical size calculation; though retry will rarely be useful.
+                // holding off is done because heavier tasks execute blockingly on the same
+                // runtime.
+                //
+                // dropping this at every outcome is probably better than trying to cling on to it,
+                // delay will be terminated by a timeout regardless.
+                let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
+
+                // no extra cancellation here, because nothing really waits for this to complete compared
                 // to spawn_ondemand_logical_size_calculation.
                 let cancel = CancellationToken::new();
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 7029d75d63b8..1040dff63d84 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -49,9 +49,12 @@ pub struct EvictionTaskTenantState {
 }
 
 impl Timeline {
-    pub(super) fn launch_eviction_task(self: &Arc<Self>, init_done: Option<&completion::Barrier>) {
+    pub(super) fn launch_eviction_task(
+        self: &Arc<Self>,
+        background_tasks_can_start: Option<&completion::Barrier>,
+    ) {
         let self_clone = Arc::clone(self);
-        let init_done = init_done.cloned();
+        let background_tasks_can_start = background_tasks_can_start.cloned();
         task_mgr::spawn(
             BACKGROUND_RUNTIME.handle(),
             TaskKind::Eviction,
@@ -60,8 +63,13 @@ impl Timeline {
             &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
             false,
             async move {
-                completion::Barrier::maybe_wait(init_done).await;
-                self_clone.eviction_task(task_mgr::shutdown_token()).await;
+                let cancel = task_mgr::shutdown_token();
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); }
+                    _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
+                };
+
+                self_clone.eviction_task(cancel).await;
                 info!("eviction task finishing");
                 Ok(())
             },
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index ab6751809272..0ec023b9e115 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -110,6 +110,12 @@ def pageserver_start_with_disk_usage_eviction(
         overrides=(
             "--pageserver-config-override=disk_usage_based_eviction="
             + enc.dump_inline_table(disk_usage_config).replace("\n", " "),
+            # Disk usage based eviction runs as a background task.
+            # But pageserver startup delays launch of background tasks for some time, to prioritize initial logical size calculations during startup.
+            # But, initial logical size calculation may not be triggered if safekeepers don't publish new broker messages.
+            # But, we only have a 10-second timeout in this test.
+            # So, disable the delay for this test.
+            "--pageserver-config-override=background_task_maximum_delay='0s'",
         ),
     )
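
The gating added throughout this patch is the same small pattern in every background task: race the shutdown token against a "background jobs can start" signal, return early on shutdown, and only then enter the periodic loop. The sketch below is illustrative only and is not code from this patch; it assumes plain tokio / tokio-util primitives, uses a watch channel where the patch uses utils::completion::Barrier, and all names in it are made up for the example.

use std::time::Duration;

use tokio::sync::watch;
use tokio_util::sync::CancellationToken;

// Illustrative only: wait for either shutdown or the "background jobs can start"
// signal, then run the periodic loop until shutdown.
async fn gated_background_loop(cancel: CancellationToken, mut can_start: watch::Receiver<bool>) {
    tokio::select! {
        _ = cancel.cancelled() => return,
        res = can_start.wait_for(|started| *started) => {
            // the sender was dropped without ever signalling; never start
            if res.is_err() {
                return;
            }
        }
    }

    loop {
        // one iteration of compaction / gc / eviction style work would go here
        tokio::select! {
            _ = cancel.cancelled() => return,
            _ = tokio::time::sleep(Duration::from_secs(2)) => {}
        }
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let (allow_background_jobs, can_start) = watch::channel(false);

    let task = tokio::spawn(gated_background_loop(cancel.clone(), can_start));

    // ... startup work (initial tenant loads, first logical size attempts) ...
    allow_background_jobs.send(true).unwrap();

    // later, on shutdown:
    cancel.cancel();
    task.await.unwrap();
}

The select-before-loop shape is what keeps shutdown responsive even while a task is still parked behind the startup barrier, which is the behavior the patch adds to the compaction, gc, eviction, and disk usage eviction tasks.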