This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

[DNM] Wrapper allocator PoC #7206

Draft
wants to merge 39 commits into master
Changes from 3 commits
Commits
39 commits
d86bd79
Implement wrapper allocator -- draft
s0me0ne-unkn0wn May 6, 2023
998a6ff
Minor fixes
s0me0ne-unkn0wn May 9, 2023
dbd40a7
Backlog tracking allocator
s0me0ne-unkn0wn May 18, 2023
c8e9d1c
Try spinlock approach
s0me0ne-unkn0wn May 19, 2023
b841129
Rename things
s0me0ne-unkn0wn May 19, 2023
d3e3e72
Merge remote-tracking branch 'origin/master' into s0me0ne/wrapper-all…
s0me0ne-unkn0wn May 19, 2023
089e6d8
Fix feature name
s0me0ne-unkn0wn May 22, 2023
818699a
Add a benchmark to measure Kusama runtime preparation time
s0me0ne-unkn0wn May 22, 2023
052b096
Merge remote-tracking branch 'origin/master' into s0me0ne/wrapper-all…
s0me0ne-unkn0wn Jun 5, 2023
66e8a8b
Merge remote-tracking branch 'origin/master' into s0me0ne/wrapper-all…
s0me0ne-unkn0wn Aug 1, 2023
dbdeb52
".git/.scripts/commands/fmt/fmt.sh"
Aug 1, 2023
1dfd9ca
` XcmContext` to `buy_weight / refund_weight` (#7563)
bkontur Aug 1, 2023
152888f
Take into account size as well in weight limiting. (#7369)
eskimor Aug 1, 2023
70aed93
[companion] Get rid of `Peerset` compatibility layer (#7355)
dmitry-markin Aug 2, 2023
88c1a70
Companion for Substrate#14373 (#7572)
drskalman Aug 2, 2023
b137472
[xcm] `GlobalConsensusConvertsFor` for remote relay chain (based on p…
bkontur Aug 3, 2023
b810ce4
Fix flaky reputation change test (#7550)
AndreiEres Aug 4, 2023
314e519
Add license to crates (#7578)
Morganamilo Aug 4, 2023
62b489f
Remove xcm on_runtime_upgrade pallet hook (#7235)
pgherveou Aug 5, 2023
14e6605
Document non-uniqueness of SetTopic IDs (#7579)
KiChjang Aug 7, 2023
929e2d4
PVF: Add missing crate descriptions (#7587)
mrcnski Aug 8, 2023
06ccdf2
update weight file template (#7589)
xlc Aug 8, 2023
aaea117
Companion for #14412 (#7547)
davxy Aug 9, 2023
c7bbfba
Remove unused code in runtime/polkadot/src/lib.rs (#7540)
liamaharon Aug 10, 2023
08f4333
Companion for substrate#12970 (#6807)
gpestana Aug 10, 2023
0f27b6c
Add counter for unapproved candidates (#7491)
AndreiEres Aug 10, 2023
e813323
Publish RC container images (#7556)
chevdor Aug 11, 2023
12fdcba
companion for 14754: cli: move no-beefy flag to sc-cli (#7600)
acatangiu Aug 11, 2023
0f57383
pvf: use test-utils feature to export test only (#7538)
jpserrat Aug 14, 2023
2dda590
RC container image fixes (#7607)
chevdor Aug 14, 2023
730a1c8
Fix the user used to login to Docker hub (#7610)
chevdor Aug 14, 2023
04ae532
Remove ParityDb migration tests (#7612)
altonen Aug 14, 2023
6f9fe26
Use same `fmt` and `clippy` configs as in Substrate (#7611)
ggwpez Aug 14, 2023
ffb8d15
Disable validation/collation protocols for normal full nodes (#7601)
altonen Aug 14, 2023
74b2fec
Don't publish test crates (#7588)
Morganamilo Aug 14, 2023
2a2393f
PVF workers: some fixes for cargo run and cargo install (#7608)
mrcnski Aug 14, 2023
ed8f0f8
XCM: Rename Instruction instructions to Command instructions (#7593)
KiChjang Aug 14, 2023
4f47d3c
Remove superflous parameter `overseer_enable_anyways` and make parach…
bkchr Aug 15, 2023
d25f550
Merge remote-tracking branch 'origin/master' into s0me0ne/wrapper-all…
s0me0ne-unkn0wn Aug 15, 2023
9 changes: 9 additions & 0 deletions Cargo.lock

Generated file; diff not rendered.

3 changes: 3 additions & 0 deletions Cargo.toml
@@ -26,6 +26,7 @@ tikv-jemallocator = "0.5.0"
polkadot-cli = { path = "cli", features = [ "kusama-native", "westend-native", "rococo-native" ] }
polkadot-node-core-pvf-worker = { path = "node/core/pvf/worker" }
polkadot-overseer = { path = "node/overseer" }
wrapper-allocator = { path = "node/wrapper-allocator", optional = true }

[dev-dependencies]
assert_cmd = "2.0.4"
@@ -101,6 +102,7 @@ members = [
"node/subsystem-types",
"node/subsystem-test-helpers",
"node/subsystem-util",
"node/wrapper-allocator",
"node/jaeger",
"node/gum",
"node/gum/proc-macro",
@@ -208,6 +210,7 @@ fast-runtime = [ "polkadot-cli/fast-runtime" ]
runtime-metrics = [ "polkadot-cli/runtime-metrics" ]
pyroscope = ["polkadot-cli/pyroscope"]
jemalloc-allocator = ["polkadot-node-core-pvf-worker/jemalloc-allocator", "polkadot-overseer/jemalloc-allocator"]
wrapper-allocator = ["jemalloc-allocator", "dep:wrapper-allocator", "polkadot-node-core-pvf-worker/wrapper-allocator"]

# Configuration for building a .deb package - for use with `cargo-deb`
[package.metadata.deb]
2 changes: 2 additions & 0 deletions node/core/pvf/worker/Cargo.toml
@@ -18,6 +18,7 @@ rayon = "1.5.1"
tempfile = "3.3.0"
tikv-jemalloc-ctl = { version = "0.5.0", optional = true }
tokio = "1.24.2"
wrapper-allocator = { path = "../../../wrapper-allocator", optional = true }

parity-scale-codec = { version = "3.4.0", default-features = false, features = ["derive"] }

@@ -47,3 +48,4 @@ tempfile = "3.3.0"

[features]
jemalloc-allocator = ["dep:tikv-jemalloc-ctl"]
wrapper-allocator = ["dep:wrapper-allocator"]
17 changes: 17 additions & 0 deletions node/core/pvf/worker/src/prepare.rs
@@ -31,6 +31,8 @@ use polkadot_node_core_pvf::{
};
use std::{any::Any, panic, path::PathBuf, sync::mpsc::channel};
use tokio::{io, net::UnixStream};
#[cfg(feature = "wrapper-allocator")]
use wrapper_allocator::ALLOC;

async fn recv_request(stream: &mut UnixStream) -> io::Result<(PvfPrepData, PathBuf)> {
let pvf = framed_recv(stream).await?;
@@ -109,8 +111,23 @@ pub fn worker_entrypoint(socket_path: &str, node_version: Option<&str>) {
// Spawn another thread for preparation.
let prepare_fut = rt_handle
.spawn_blocking(move || {
#[cfg(feature = "wrapper-allocator")]
ALLOC.start_tracking(100_000_000);

let result = prepare_artifact(pvf);

#[cfg(feature = "wrapper-allocator")]
{
let (events, peak) = ALLOC.end_tracking();
gum::debug!(
target: LOG_TARGET,
%worker_pid,
"prepare job peak allocation is {} bytes in {} events",
peak,
events,
);
}

// Get the `ru_maxrss` stat. If supported, call getrusage for the thread.
#[cfg(target_os = "linux")]
let result = result.map(|artifact| (artifact, get_max_rss_thread()));
9 changes: 9 additions & 0 deletions node/wrapper-allocator/Cargo.toml
@@ -0,0 +1,9 @@
[package]
name = "wrapper-allocator"
description = "Wrapper allocator to control amount of memory consumed by PVF preparation process"
version.workspace = true
authors.workspace = true
edition.workspace = true

[dependencies]
tikv-jemallocator = "0.5.0"
151 changes: 151 additions & 0 deletions node/wrapper-allocator/src/lib.rs
@@ -0,0 +1,151 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.

// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.

//! Tracking global allocator. Initially it just forwards allocation and deallocation requests
//! to the underlying allocator. When tracking is enabled, it stores every allocation event in
//! a pre-allocated backlog. When tracking is stopped, it replays the backlog and counts the
//! number of allocation events and the peak allocation value.

use core::alloc::{GlobalAlloc, Layout};
use std::sync::{
atomic::{AtomicUsize, Ordering::Relaxed},
RwLock,
};
use tikv_jemallocator::Jemalloc;

struct WrapperAllocatorData {
tracking: RwLock<bool>,
backlog: Vec<isize>,
backlog_index: AtomicUsize,
}

impl WrapperAllocatorData {
// SAFETY:
// * Tracking must only be performed by a single thread at a time
// * `start_tracking` and `end_tracking` must be called from the same thread
// * Tracking periods must not overlap
// * Caller must provide sufficient backlog size

unsafe fn start_tracking(&mut self, backlog_size: usize) {
// Allocate the backlog before taking the write lock; allocating while holding it would deadlock.
let backlog = Vec::with_capacity(backlog_size);
// Lock allocations, move the allocated vector to our place and start tracking.
let mut tracking = self.tracking.write().unwrap();
assert!(!*tracking); // Shouldn't start tracking if already tracking
self.backlog = backlog;
Review thread on this line:

Contributor:

Won't this deadlock if `start_tracking` is called twice? (Assigning a new backlog will trigger a deallocation of the old backlog.)

Contributor Author:

Seems like you're right 😕

Nah, the whole thing now seems FUBAR to me. I was thinking about using the nightly `allocator_api` feature and `Vec::with_capacity_in()` to allow side allocations from the main allocator, but everything I try to implement looks too hacky. I'll give your spinlock approach a try; the overhead will probably be even lower than the overhead of allocating and initializing the 800 MB array and then replaying it.

Contributor Author:

Btw, why didn't you use `std::hint::spin_loop()` inside the `while` loop in your spinlock example? Was it on purpose?

Contributor:

> Btw, why didn't you use `std::hint::spin_loop()` inside the `while` loop in your spinlock example? Was it on purpose?

No particular reason; I wrote that code from memory and simply forgot. :P Yeah, you should probably add that. (On AMD64 it will generate a `pause` instruction, which is generally a good idea in a spinlock.)

Contributor Author:

Looking at your code a bit more... The critical section is under an exclusive lock now; do `current` and `peak` still have to be atomics? Would we cut off a bit of the overhead by making them plain `isize`s?

Contributor:

> The critical section is under an exclusive lock now; do `current` and `peak` still have to be atomics? Would we cut off a bit of the overhead by making them plain `isize`s?

AFAIK they don't. I just used them because I didn't feel like using an `UnsafeCell` for the example.

Contributor:

Repeating some things, as my review yesterday got lost. The memory overhead per prepare job is currently too high to be viable if we really do that many allocations. It is an interesting idea, but it does have the extra complexity of determining what is a safe amount to pre-allocate. Not sure whether the latency spike at the end is a problem, since we do it at the end of an already blocking operation, but ideally we would benchmark the overall start-to-end time and get numbers for the different approaches. The panic can be worked around (just set a bit indicating that we overflowed and return an error from `end_tracking`). Overall, the spinlock strategy does seem like the best approach so far.

self.backlog.resize(backlog_size, 0);
self.backlog_index.store(0, Relaxed);
*tracking = true;
}

unsafe fn end_tracking(&mut self) -> (usize, isize) {
let mut tracking = self.tracking.write().unwrap();
assert!(*tracking); // Start/end calls must be consistent

// At this point, all allocation is blocked, as all other threads are waiting for the
// read lock on `tracking`. The following code replays the backlog and calculates the
// peak value. It must not perform any allocation, otherwise a deadlock will occur.
let mut peak = 0;
let mut alloc = 0;
let mut events = 0usize;
for i in 0..self.backlog.len() {
if self.backlog[i] == 0 {
break
}
events += 1;
alloc += self.backlog[i];
if alloc > peak {
peak = alloc
}
}
*tracking = false;
(events, peak)
}

#[inline]
unsafe fn track(&mut self, alloc: isize) {
let tracking = self.tracking.read().unwrap();
if !*tracking {
return
}
let i = self.backlog_index.fetch_add(1, Relaxed);
if i >= self.backlog.len() {
// We cannot use formatted text here as it would result in allocations and a deadlock
panic!("Backlog size provided was not enough for allocation tracking");
}
// It is safe as the vector is pre-allocated and the index is acquired atomically
self.backlog[i] = alloc;
}
}

static mut ALLOCATOR_DATA: WrapperAllocatorData = WrapperAllocatorData {
tracking: RwLock::new(false),
backlog: vec![],
backlog_index: AtomicUsize::new(0),
};

pub struct WrapperAllocator<A: GlobalAlloc>(A);

impl<A: GlobalAlloc> WrapperAllocator<A> {
/// Start tracking with the given backlog size (in allocation events). Providing insufficient
/// backlog size will result in a panic.
pub fn start_tracking(&self, backlog_size: usize) {
unsafe {
ALLOCATOR_DATA.start_tracking(backlog_size);
}
}

/// End tracking and return the number of allocation events (as `usize`) and the peak
/// allocation value in bytes (as `isize`). The peak allocation value is not guaranteed
/// to be non-zero or positive.
pub fn end_tracking(&self) -> (usize, isize) {
unsafe { ALLOCATOR_DATA.end_tracking() }
}
}

unsafe impl<A: GlobalAlloc> GlobalAlloc for WrapperAllocator<A> {
// SAFETY:
// * The wrapped methods are as safe as the underlying allocator implementation is
// * In tracking mode, it is safe as long as a sufficient backlog size is provided when
// entering the tracking mode

#[inline]
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
ALLOCATOR_DATA.track(layout.size() as isize);
self.0.alloc(layout)
}

#[inline]
unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
ALLOCATOR_DATA.track(layout.size() as isize);
self.0.alloc_zeroed(layout)
}

#[inline]
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
ALLOCATOR_DATA.track(-(layout.size() as isize));
self.0.dealloc(ptr, layout)
}

#[inline]
unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
ALLOCATOR_DATA.track((new_size as isize) - (layout.size() as isize));
self.0.realloc(ptr, layout, new_size)
}
}

#[global_allocator]
pub static ALLOC: WrapperAllocator<Jemalloc> = WrapperAllocator(Jemalloc);
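
Note (not part of the diff): the spinlock approach discussed in the review thread above is not included in this PR. A minimal sketch of it, assuming names such as `SpinlockTrackingAllocator`, `TRACKING`, `current`, and `peak` that do not appear in this code, might look roughly as follows. Instead of a pre-allocated backlog it keeps running counters and serializes updates through an `AtomicBool` spinlock that calls `std::hint::spin_loop()` while waiting.

use core::alloc::{GlobalAlloc, Layout};
use std::sync::atomic::{AtomicBool, AtomicIsize, Ordering};
use tikv_jemallocator::Jemalloc;

struct TrackingData {
    lock: AtomicBool,
    enabled: AtomicBool,
    current: AtomicIsize,
    peak: AtomicIsize,
}

static TRACKING: TrackingData = TrackingData {
    lock: AtomicBool::new(false),
    enabled: AtomicBool::new(false),
    current: AtomicIsize::new(0),
    peak: AtomicIsize::new(0),
};

impl TrackingData {
    // Run `f` with the spinlock held. `spin_loop()` emits a `pause` instruction
    // on x86-64, as suggested in the thread. `f` must not allocate.
    fn with_lock<R>(&self, f: impl FnOnce() -> R) -> R {
        while self
            .lock
            .compare_exchange_weak(false, true, Ordering::Acquire, Ordering::Relaxed)
            .is_err()
        {
            std::hint::spin_loop();
        }
        let result = f();
        self.lock.store(false, Ordering::Release);
        result
    }

    fn track(&self, delta: isize) {
        if !self.enabled.load(Ordering::Relaxed) {
            return
        }
        self.with_lock(|| {
            // Atomics are used here only for const-initialization convenience; plain
            // `isize`s behind an `UnsafeCell` would also do, since every update
            // happens inside the critical section.
            let current = self.current.load(Ordering::Relaxed) + delta;
            self.current.store(current, Ordering::Relaxed);
            if current > self.peak.load(Ordering::Relaxed) {
                self.peak.store(current, Ordering::Relaxed);
            }
        });
    }
}

pub struct SpinlockTrackingAllocator<A: GlobalAlloc>(pub A);

impl<A: GlobalAlloc> SpinlockTrackingAllocator<A> {
    /// Reset the counters and start tracking.
    pub fn start_tracking(&self) {
        TRACKING.with_lock(|| {
            TRACKING.current.store(0, Ordering::Relaxed);
            TRACKING.peak.store(0, Ordering::Relaxed);
            TRACKING.enabled.store(true, Ordering::Relaxed);
        });
    }

    /// Stop tracking and return the peak allocation observed, in bytes.
    pub fn end_tracking(&self) -> isize {
        TRACKING.with_lock(|| {
            TRACKING.enabled.store(false, Ordering::Relaxed);
            TRACKING.peak.load(Ordering::Relaxed)
        })
    }
}

unsafe impl<A: GlobalAlloc> GlobalAlloc for SpinlockTrackingAllocator<A> {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        TRACKING.track(layout.size() as isize);
        self.0.alloc(layout)
    }

    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        TRACKING.track(-(layout.size() as isize));
        self.0.dealloc(ptr, layout)
    }

    // `realloc` and `alloc_zeroed` fall back to the default implementations,
    // which route through `alloc`/`dealloc` above and are therefore tracked too.
}

#[global_allocator]
static ALLOC: SpinlockTrackingAllocator<Jemalloc> = SpinlockTrackingAllocator(Jemalloc);

Compared with the backlog approach in this PR, this trades a small synchronization cost on every allocation event for constant memory overhead, no replay pass in `end_tracking`, and no possibility of a backlog overflow panic.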
5 changes: 4 additions & 1 deletion src/main.rs
@@ -22,7 +22,10 @@ use color_eyre::eyre;

/// Global allocator. Changing it to another allocator will require changing
/// `memory_stats::MemoryAllocationTracker`.
#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))]
#[cfg(all(
any(target_os = "linux", feature = "jemalloc-allocator"),
not(feature = "wrapper-allocator")
))]
#[global_allocator]
pub static ALLOC: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

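
Note (not part of the diff): the last review comment suggests working around the backlog-overflow panic by recording the overflow and returning an error from `end_tracking`. A minimal sketch of that adjustment, using assumed names (`TrackerSketch`, `BacklogOverflow`) rather than the actual types in this PR, and omitting the `tracking` lock for brevity:

use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering::Relaxed};

/// Error returned when the pre-allocated backlog turned out to be too small.
pub struct BacklogOverflow;

struct TrackerSketch {
    backlog: Vec<isize>,
    backlog_index: AtomicUsize,
    overflowed: AtomicBool,
}

impl TrackerSketch {
    fn track(&mut self, alloc: isize) {
        let i = self.backlog_index.fetch_add(1, Relaxed);
        if i >= self.backlog.len() {
            // Remember that we ran out of space; no formatting, no panic.
            self.overflowed.store(true, Relaxed);
            return
        }
        self.backlog[i] = alloc;
    }

    fn end_tracking(&self) -> Result<(usize, isize), BacklogOverflow> {
        if self.overflowed.load(Relaxed) {
            return Err(BacklogOverflow)
        }
        // Replay the backlog exactly as the original implementation does.
        let (mut events, mut alloc, mut peak) = (0usize, 0isize, 0isize);
        for &delta in &self.backlog {
            if delta == 0 {
                break
            }
            events += 1;
            alloc += delta;
            peak = peak.max(alloc);
        }
        Ok((events, peak))
    }
}

With this shape, the caller in `prepare.rs` could log a warning when the backlog turns out to be too small instead of the preparation worker dying on the panic.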