diff --git a/Cargo.lock b/Cargo.lock index bab0155beab3..cf7510781673 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6014,6 +6014,7 @@ dependencies = [ "polkadot-node-subsystem-test-helpers", "polkadot-node-subsystem-util", "polkadot-primitives", + "polkadot-primitives-test-helpers", "rand 0.8.5", "rand_chacha 0.3.1", "rand_core 0.5.1", @@ -6223,6 +6224,7 @@ dependencies = [ "fatality", "futures", "futures-timer", + "indexmap", "lazy_static", "lru 0.7.8", "parity-scale-codec", @@ -6916,6 +6918,7 @@ dependencies = [ "polkadot-primitives", "rand 0.8.5", "sp-application-crypto", + "sp-core", "sp-keyring", "sp-runtime", ] diff --git a/Cargo.toml b/Cargo.toml index cf238625e2d6..3f6332ecaef9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -128,9 +128,9 @@ maintenance = { status = "actively-developed" } # # This list is ordered alphabetically. [profile.dev.package] -blake2b_simd = { opt-level = 3 } blake2 = { opt-level = 3 } blake2-rfc = { opt-level = 3 } +blake2b_simd = { opt-level = 3 } chacha20poly1305 = { opt-level = 3 } cranelift-codegen = { opt-level = 3 } cranelift-wasm = { opt-level = 3 } @@ -141,8 +141,8 @@ curve25519-dalek = { opt-level = 3 } ed25519-dalek = { opt-level = 3 } flate2 = { opt-level = 3 } futures-channel = { opt-level = 3 } -hashbrown = { opt-level = 3 } hash-db = { opt-level = 3 } +hashbrown = { opt-level = 3 } hmac = { opt-level = 3 } httparse = { opt-level = 3 } integer-sqrt = { opt-level = 3 } @@ -154,8 +154,8 @@ libz-sys = { opt-level = 3 } mio = { opt-level = 3 } nalgebra = { opt-level = 3 } num-bigint = { opt-level = 3 } -parking_lot_core = { opt-level = 3 } parking_lot = { opt-level = 3 } +parking_lot_core = { opt-level = 3 } percent-encoding = { opt-level = 3 } primitive-types = { opt-level = 3 } reed-solomon-novelpoly = { opt-level = 3 } @@ -165,6 +165,7 @@ sha2 = { opt-level = 3 } sha3 = { opt-level = 3 } smallvec = { opt-level = 3 } snow = { opt-level = 3 } +substrate-bip39 = {opt-level = 3} twox-hash = { opt-level = 3 } uint = { opt-level = 3 } wasmi = { opt-level = 3 } @@ -199,7 +200,6 @@ try-runtime = [ "polkadot-cli/try-runtime" ] fast-runtime = [ "polkadot-cli/fast-runtime" ] runtime-metrics = [ "polkadot-cli/runtime-metrics" ] pyroscope = ["polkadot-cli/pyroscope"] -staging-client = ["polkadot-cli/staging-client"] # Configuration for building a .deb package - for use with `cargo-deb` [package.metadata.deb] diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 1a161db7916d..0f009e2afb81 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -74,4 +74,3 @@ rococo-native = ["service/rococo-native"] malus = ["full-node", "service/malus"] runtime-metrics = ["service/runtime-metrics", "polkadot-node-metrics/runtime-metrics"] -staging-client = ["service/staging-client"] diff --git a/node/core/dispute-coordinator/src/db/v1.rs b/node/core/dispute-coordinator/src/db/v1.rs index 4d33949db644..2c643d341de2 100644 --- a/node/core/dispute-coordinator/src/db/v1.rs +++ b/node/core/dispute-coordinator/src/db/v1.rs @@ -16,6 +16,7 @@ //! `V1` database for the dispute coordinator. +use polkadot_node_primitives::DisputeStatus; use polkadot_node_subsystem::{SubsystemError, SubsystemResult}; use polkadot_node_subsystem_util::database::{DBTransaction, Database}; use polkadot_primitives::v2::{ @@ -31,7 +32,6 @@ use crate::{ backend::{Backend, BackendWriteOp, OverlayedBackend}, error::{FatalError, FatalResult}, metrics::Metrics, - status::DisputeStatus, DISPUTE_WINDOW, LOG_TARGET, }; diff --git a/node/core/dispute-coordinator/src/initialized.rs b/node/core/dispute-coordinator/src/initialized.rs index e37459dc5142..c1b34ce134cc 100644 --- a/node/core/dispute-coordinator/src/initialized.rs +++ b/node/core/dispute-coordinator/src/initialized.rs @@ -26,8 +26,8 @@ use futures::{ use sc_keystore::LocalKeystore; use polkadot_node_primitives::{ - CandidateVotes, DisputeMessage, DisputeMessageCheckError, SignedDisputeStatement, - DISPUTE_WINDOW, + CandidateVotes, DisputeMessage, DisputeMessageCheckError, DisputeStatus, + SignedDisputeStatement, Timestamp, DISPUTE_WINDOW, }; use polkadot_node_subsystem::{ messages::{ @@ -49,7 +49,7 @@ use crate::{ error::{log_error, Error, FatalError, FatalResult, JfyiError, JfyiResult, Result}, import::{CandidateEnvironment, CandidateVoteState}, metrics::Metrics, - status::{get_active_with_status, Clock, DisputeStatus, Timestamp}, + status::{get_active_with_status, Clock}, DisputeCoordinatorSubsystem, LOG_TARGET, }; @@ -599,7 +599,9 @@ impl Initialized { }; gum::trace!(target: LOG_TARGET, "Loaded recent disputes from db"); - let _ = tx.send(recent_disputes.keys().cloned().collect()); + let _ = tx.send( + recent_disputes.into_iter().map(|(k, v)| (k.0, k.1, v)).collect::>(), + ); }, DisputeCoordinatorMessage::ActiveDisputes(tx) => { // Return error if session information is missing. diff --git a/node/core/dispute-coordinator/src/status.rs b/node/core/dispute-coordinator/src/status.rs index d2ad551bd9ad..6332c3653274 100644 --- a/node/core/dispute-coordinator/src/status.rs +++ b/node/core/dispute-coordinator/src/status.rs @@ -14,125 +14,18 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use std::time::{SystemTime, UNIX_EPOCH}; - -use parity_scale_codec::{Decode, Encode}; +use polkadot_node_primitives::{dispute_is_inactive, DisputeStatus, Timestamp}; use polkadot_primitives::v2::{CandidateHash, SessionIndex}; +use std::time::{SystemTime, UNIX_EPOCH}; use crate::LOG_TARGET; -/// The choice here is fairly arbitrary. But any dispute that concluded more than a few minutes ago -/// is not worth considering anymore. Changing this value has little to no bearing on consensus, -/// and really only affects the work that the node might do on startup during periods of many -/// disputes. -pub const ACTIVE_DURATION_SECS: Timestamp = 180; - -/// Timestamp based on the 1 Jan 1970 UNIX base, which is persistent across node restarts and OS reboots. -pub type Timestamp = u64; - -/// The status of dispute. This is a state machine which can be altered by the -/// helper methods. -#[derive(Debug, Clone, Copy, Encode, Decode, PartialEq)] -pub enum DisputeStatus { - /// The dispute is active and unconcluded. - #[codec(index = 0)] - Active, - /// The dispute has been concluded in favor of the candidate - /// since the given timestamp. - #[codec(index = 1)] - ConcludedFor(Timestamp), - /// The dispute has been concluded against the candidate - /// since the given timestamp. - /// - /// This takes precedence over `ConcludedFor` in the case that - /// both are true, which is impossible unless a large amount of - /// validators are participating on both sides. - #[codec(index = 2)] - ConcludedAgainst(Timestamp), - /// Dispute has been confirmed (more than `byzantine_threshold` have already participated/ or - /// we have seen the candidate included already/participated successfully ourselves). - #[codec(index = 3)] - Confirmed, -} - -impl DisputeStatus { - /// Initialize the status to the active state. - pub fn active() -> DisputeStatus { - DisputeStatus::Active - } - - /// Move status to confirmed status, if not yet concluded/confirmed already. - pub fn confirm(self) -> DisputeStatus { - match self { - DisputeStatus::Active => DisputeStatus::Confirmed, - DisputeStatus::Confirmed => DisputeStatus::Confirmed, - DisputeStatus::ConcludedFor(_) | DisputeStatus::ConcludedAgainst(_) => self, - } - } - - /// Check whether the dispute is not a spam dispute. - pub fn is_confirmed_concluded(&self) -> bool { - match self { - &DisputeStatus::Confirmed | - &DisputeStatus::ConcludedFor(_) | - DisputeStatus::ConcludedAgainst(_) => true, - &DisputeStatus::Active => false, - } - } - - /// Transition the status to a new status after observing the dispute has concluded for the candidate. - /// This may be a no-op if the status was already concluded. - pub fn concluded_for(self, now: Timestamp) -> DisputeStatus { - match self { - DisputeStatus::Active | DisputeStatus::Confirmed => DisputeStatus::ConcludedFor(now), - DisputeStatus::ConcludedFor(at) => DisputeStatus::ConcludedFor(std::cmp::min(at, now)), - against => against, - } - } - - /// Transition the status to a new status after observing the dispute has concluded against the candidate. - /// This may be a no-op if the status was already concluded. - pub fn concluded_against(self, now: Timestamp) -> DisputeStatus { - match self { - DisputeStatus::Active | DisputeStatus::Confirmed => - DisputeStatus::ConcludedAgainst(now), - DisputeStatus::ConcludedFor(at) => - DisputeStatus::ConcludedAgainst(std::cmp::min(at, now)), - DisputeStatus::ConcludedAgainst(at) => - DisputeStatus::ConcludedAgainst(std::cmp::min(at, now)), - } - } - - /// Whether the disputed candidate is possibly invalid. - pub fn is_possibly_invalid(&self) -> bool { - match self { - DisputeStatus::Active | - DisputeStatus::Confirmed | - DisputeStatus::ConcludedAgainst(_) => true, - DisputeStatus::ConcludedFor(_) => false, - } - } - - /// Yields the timestamp this dispute concluded at, if any. - pub fn concluded_at(&self) -> Option { - match self { - DisputeStatus::Active | DisputeStatus::Confirmed => None, - DisputeStatus::ConcludedFor(at) | DisputeStatus::ConcludedAgainst(at) => Some(*at), - } - } -} - /// Get active disputes as iterator, preserving its `DisputeStatus`. pub fn get_active_with_status( recent_disputes: impl Iterator, now: Timestamp, ) -> impl Iterator { - recent_disputes.filter_map(move |(disputed, status)| { - status - .concluded_at() - .filter(|at| *at + ACTIVE_DURATION_SECS < now) - .map_or(Some((disputed, status)), |_| None) - }) + recent_disputes.filter(move |(_, status)| !dispute_is_inactive(status, &now)) } pub trait Clock: Send + Sync { diff --git a/node/core/dispute-coordinator/src/tests.rs b/node/core/dispute-coordinator/src/tests.rs index 39fdc3a037e5..ff85319599ce 100644 --- a/node/core/dispute-coordinator/src/tests.rs +++ b/node/core/dispute-coordinator/src/tests.rs @@ -49,6 +49,7 @@ use sp_keyring::Sr25519Keyring; use sp_keystore::{SyncCryptoStore, SyncCryptoStorePtr}; use ::test_helpers::{dummy_candidate_receipt_bad_sig, dummy_digest, dummy_hash}; +use polkadot_node_primitives::{Timestamp, ACTIVE_DURATION_SECS}; use polkadot_node_subsystem::{ jaeger, messages::{AllMessages, BlockDescription, RuntimeApiMessage, RuntimeApiRequest}, @@ -66,7 +67,7 @@ use crate::{ backend::Backend, metrics::Metrics, participation::{participation_full_happy_path, participation_missing_availability}, - status::{Clock, Timestamp, ACTIVE_DURATION_SECS}, + status::Clock, Config, DisputeCoordinatorSubsystem, }; diff --git a/node/core/provisioner/Cargo.toml b/node/core/provisioner/Cargo.toml index 2f5c1f9aa5dd..47e519087723 100644 --- a/node/core/provisioner/Cargo.toml +++ b/node/core/provisioner/Cargo.toml @@ -13,8 +13,8 @@ polkadot-primitives = { path = "../../../primitives" } polkadot-node-primitives = { path = "../../primitives" } polkadot-node-subsystem = { path = "../../subsystem" } polkadot-node-subsystem-util = { path = "../../subsystem-util" } -futures-timer = "3.0.2" rand = "0.8.5" +futures-timer = "3.0.2" fatality = "0.0.6" [dev-dependencies] @@ -22,6 +22,3 @@ sp-application-crypto = { git = "https://github.com/paritytech/substrate", branc sp-keystore = { git = "https://github.com/paritytech/substrate", branch = "master" } polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" } test-helpers = { package = "polkadot-primitives-test-helpers", path = "../../../primitives/test-helpers" } - -[features] -staging-client = [] diff --git a/node/core/provisioner/src/disputes/mod.rs b/node/core/provisioner/src/disputes/mod.rs new file mode 100644 index 000000000000..404e800702b1 --- /dev/null +++ b/node/core/provisioner/src/disputes/mod.rs @@ -0,0 +1,53 @@ +// Copyright 2017-2022 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! The disputes module is responsible for selecting dispute votes to be sent with the inherent data. It contains two +//! different implementations, extracted in two separate modules - `random_selection` and `prioritized_selection`. Which +//! implementation will be executed depends on the version of the runtime. Runtime v2 supports `random_selection`. Runtime +//! v3 and above - `prioritized_selection`. The entrypoint to these implementations is the `select_disputes` function. +//! prioritized_selection` is considered superior and will be the default one in the future. Refer to the documentation of +//! the modules for more details about each implementation. + +use crate::LOG_TARGET; +use futures::channel::oneshot; +use polkadot_node_primitives::CandidateVotes; +use polkadot_node_subsystem::{messages::DisputeCoordinatorMessage, overseer}; +use polkadot_primitives::v2::{CandidateHash, SessionIndex}; + +/// Request the relevant dispute statements for a set of disputes identified by `CandidateHash` and the `SessionIndex`. +async fn request_votes( + sender: &mut impl overseer::ProvisionerSenderTrait, + disputes_to_query: Vec<(SessionIndex, CandidateHash)>, +) -> Vec<(SessionIndex, CandidateHash, CandidateVotes)> { + let (tx, rx) = oneshot::channel(); + // Bounded by block production - `ProvisionerMessage::RequestInherentData`. + sender.send_unbounded_message(DisputeCoordinatorMessage::QueryCandidateVotes( + disputes_to_query, + tx, + )); + + match rx.await { + Ok(v) => v, + Err(oneshot::Canceled) => { + gum::warn!(target: LOG_TARGET, "Unable to query candidate votes"); + Vec::new() + }, + } +} + +pub(crate) mod prioritized_selection; + +pub(crate) mod random_selection; diff --git a/node/core/provisioner/src/disputes/prioritized_selection/mod.rs b/node/core/provisioner/src/disputes/prioritized_selection/mod.rs new file mode 100644 index 000000000000..056fd61b7227 --- /dev/null +++ b/node/core/provisioner/src/disputes/prioritized_selection/mod.rs @@ -0,0 +1,460 @@ +// Copyright 2017-2022 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! This module uses different approach for selecting dispute votes. It queries the Runtime +//! about the votes already known onchain and tries to select only relevant votes. Refer to +//! the documentation of `select_disputes` for more details about the actual implementation. + +use crate::{ + error::{Error, GetOnchainDisputesError}, + metrics, LOG_TARGET, +}; +use futures::channel::oneshot; +use polkadot_node_primitives::{dispute_is_inactive, CandidateVotes, DisputeStatus, Timestamp}; +use polkadot_node_subsystem::{ + errors::RuntimeApiError, + messages::{DisputeCoordinatorMessage, RuntimeApiMessage, RuntimeApiRequest}, + overseer, ActivatedLeaf, +}; +use polkadot_primitives::v2::{ + supermajority_threshold, CandidateHash, DisputeState, DisputeStatement, DisputeStatementSet, + Hash, InvalidDisputeStatementKind, MultiDisputeStatementSet, SessionIndex, + ValidDisputeStatementKind, ValidatorIndex, +}; +use std::{ + collections::{BTreeMap, HashMap}, + time::{SystemTime, UNIX_EPOCH}, +}; + +#[cfg(test)] +mod tests; + +/// The maximum number of disputes Provisioner will include in the inherent data. +/// Serves as a protection not to flood the Runtime with excessive data. +#[cfg(not(test))] +pub const MAX_DISPUTE_VOTES_FORWARDED_TO_RUNTIME: usize = 200_000; +#[cfg(test)] +pub const MAX_DISPUTE_VOTES_FORWARDED_TO_RUNTIME: usize = 200; + +/// Implements the `select_disputes` function which selects dispute votes which should +/// be sent to the Runtime. +/// +/// # How the prioritization works +/// +/// Generally speaking disputes can be described as: +/// * Active vs Inactive +/// * Known vs Unknown onchain +/// * Offchain vs Onchain +/// * Concluded onchain vs Unconcluded onchain +/// +/// Provisioner fetches all disputes from `dispute-coordinator` and separates them in multiple partitions. +/// Please refer to `struct PartitionedDisputes` for details about the actual partitions. +/// Each partition has got a priority implicitly assigned to it and the disputes are selected based on thus +/// priority (e.g. disputes in partition 1, then if there is space - disputes from partition 2 and so on). +/// +/// # Votes selection +/// +/// Besides the prioritization described above the votes in each partition are filtered too. Provisioner +/// fetches all onchain votes and filters them out from all partitions. As a result the Runtime receives +/// only fresh votes (votes it didn't know about). +/// +/// # How the onchain votes are fetched +/// +/// The logic outlined above relies on `RuntimeApiRequest::Disputes` message from the Runtime. The user +/// check the Runtime version before calling `select_disputes`. If the function is used with old runtime +/// an error is logged and the logic will continue with empty onchain votes HashMap. +pub async fn select_disputes( + sender: &mut Sender, + metrics: &metrics::Metrics, + leaf: &ActivatedLeaf, +) -> Result +where + Sender: overseer::ProvisionerSenderTrait, +{ + gum::trace!( + target: LOG_TARGET, + ?leaf, + "Selecting disputes for inherent data using prioritized selection" + ); + + // Fetch the onchain disputes. We'll do a prioritization based on them. + let onchain = match get_onchain_disputes(sender, leaf.hash.clone()).await { + Ok(r) => r, + Err(GetOnchainDisputesError::NotSupported(runtime_api_err, relay_parent)) => { + // Runtime version is checked before calling this method, so the error below should never happen! + gum::error!( + target: LOG_TARGET, + ?runtime_api_err, + ?relay_parent, + "Can't fetch onchain disputes, because runtime version is not recent enough. Will continue with empty onchain disputes set.", + ); + HashMap::new() + }, + Err(GetOnchainDisputesError::Channel) => { + // This error usually means the node is shutting down. Log just in case. + gum::debug!( + target: LOG_TARGET, + "Channel error occurred while fetching onchain disputes. Will continue with empty onchain disputes set.", + ); + HashMap::new() + }, + Err(GetOnchainDisputesError::Execution(runtime_api_err, parent_hash)) => { + gum::warn!( + target: LOG_TARGET, + ?runtime_api_err, + ?parent_hash, + "Unexpected execution error occurred while fetching onchain votes. Will continue with empty onchain disputes set.", + ); + HashMap::new() + }, + }; + + let recent_disputes = request_disputes(sender).await; + gum::trace!( + target: LOG_TARGET, + ?leaf, + "Got {} recent disputes and {} onchain disputes.", + recent_disputes.len(), + onchain.len(), + ); + + let partitioned = partition_recent_disputes(recent_disputes, &onchain); + metrics.on_partition_recent_disputes(&partitioned); + + if partitioned.inactive_unknown_onchain.len() > 0 { + gum::warn!( + target: LOG_TARGET, + ?leaf, + "Got {} inactive unknown onchain disputes. This should not happen!", + partitioned.inactive_unknown_onchain.len() + ); + } + let result = vote_selection(sender, partitioned, &onchain).await; + + into_multi_dispute_statement_set(metrics, result) +} + +/// Selects dispute votes from `PartitionedDisputes` which should be sent to the runtime. Votes which +/// are already onchain are filtered out. Result should be sorted by `(SessionIndex, CandidateHash)` +/// which is enforced by the `BTreeMap`. This is a requirement from the runtime. +async fn vote_selection( + sender: &mut Sender, + partitioned: PartitionedDisputes, + onchain: &HashMap<(SessionIndex, CandidateHash), DisputeState>, +) -> BTreeMap<(SessionIndex, CandidateHash), CandidateVotes> +where + Sender: overseer::ProvisionerSenderTrait, +{ + #[cfg(not(test))] + const BATCH_SIZE: usize = 1_100; + #[cfg(test)] + const BATCH_SIZE: usize = 11; + + // fetch in batches until there are enough votes + let mut disputes = partitioned.into_iter().collect::>(); + let mut total_votes_len = 0; + let mut result = BTreeMap::new(); + while !disputes.is_empty() { + let batch_size = std::cmp::min(BATCH_SIZE, disputes.len()); + let batch = Vec::from_iter(disputes.drain(0..batch_size)); + + // Filter votes which are already onchain + let votes = super::request_votes(sender, batch) + .await + .into_iter() + .map(|(session_index, candidate_hash, mut votes)| { + let onchain_state = + if let Some(onchain_state) = onchain.get(&(session_index, candidate_hash)) { + onchain_state + } else { + // onchain knows nothing about this dispute - add all votes + return (session_index, candidate_hash, votes) + }; + + votes.valid.retain(|validator_idx, (statement_kind, _)| { + is_vote_worth_to_keep(validator_idx, statement_kind, &onchain_state) + }); + votes.invalid.retain(|validator_idx, (statement_kind, _)| { + is_vote_worth_to_keep(validator_idx, statement_kind, &onchain_state) + }); + (session_index, candidate_hash, votes) + }) + .collect::>(); + + // Check if votes are within the limit + for (session_index, candidate_hash, selected_votes) in votes { + let votes_len = selected_votes.valid.len() + selected_votes.invalid.len(); + if votes_len + total_votes_len > MAX_DISPUTE_VOTES_FORWARDED_TO_RUNTIME { + // we are done - no more votes can be added + return result + } + result.insert((session_index, candidate_hash), selected_votes); + total_votes_len += votes_len + } + } + + result +} + +/// Contains disputes by partitions. Check the field comments for further details. +#[derive(Default)] +pub(crate) struct PartitionedDisputes { + /// Concluded and inactive disputes which are completely unknown for the Runtime. + /// Hopefully this should never happen. + /// Will be sent to the Runtime with FIRST priority. + pub inactive_unknown_onchain: Vec<(SessionIndex, CandidateHash)>, + /// Disputes which are INACTIVE locally but they are unconcluded for the Runtime. + /// A dispute can have enough local vote to conclude and at the same time the + /// Runtime knows nothing about them at treats it as unconcluded. This discrepancy + /// should be treated with high priority. + /// Will be sent to the Runtime with SECOND priority. + pub inactive_unconcluded_onchain: Vec<(SessionIndex, CandidateHash)>, + /// Active disputes completely unknown onchain. + /// Will be sent to the Runtime with THIRD priority. + pub active_unknown_onchain: Vec<(SessionIndex, CandidateHash)>, + /// Active disputes unconcluded onchain. + /// Will be sent to the Runtime with FOURTH priority. + pub active_unconcluded_onchain: Vec<(SessionIndex, CandidateHash)>, + /// Active disputes concluded onchain. New votes are not that important for + /// this partition. + /// Will be sent to the Runtime with FIFTH priority. + pub active_concluded_onchain: Vec<(SessionIndex, CandidateHash)>, + /// Inactive disputes which has concluded onchain. These are not interesting and + /// won't be sent to the Runtime. + /// Will be DROPPED + pub inactive_concluded_onchain: Vec<(SessionIndex, CandidateHash)>, +} + +impl PartitionedDisputes { + fn new() -> PartitionedDisputes { + Default::default() + } + + fn into_iter(self) -> impl Iterator { + self.inactive_unknown_onchain + .into_iter() + .chain(self.inactive_unconcluded_onchain.into_iter()) + .chain(self.active_unknown_onchain.into_iter()) + .chain(self.active_unconcluded_onchain.into_iter()) + .chain(self.active_concluded_onchain.into_iter()) + // inactive_concluded_onchain is dropped on purpose + } +} + +fn secs_since_epoch() -> Timestamp { + match SystemTime::now().duration_since(UNIX_EPOCH) { + Ok(d) => d.as_secs(), + Err(e) => { + gum::warn!( + target: LOG_TARGET, + err = ?e, + "Error getting system time." + ); + 0 + }, + } +} + +fn concluded_onchain(onchain_state: &DisputeState) -> bool { + // Check if there are enough onchain votes for or against to conclude the dispute + let supermajority = supermajority_threshold(onchain_state.validators_for.len()); + + onchain_state.validators_for.count_ones() >= supermajority || + onchain_state.validators_against.count_ones() >= supermajority +} + +fn partition_recent_disputes( + recent: Vec<(SessionIndex, CandidateHash, DisputeStatus)>, + onchain: &HashMap<(SessionIndex, CandidateHash), DisputeState>, +) -> PartitionedDisputes { + let mut partitioned = PartitionedDisputes::new(); + + // Drop any duplicates + let unique_recent = recent + .into_iter() + .map(|(session_index, candidate_hash, dispute_state)| { + ((session_index, candidate_hash), dispute_state) + }) + .collect::>(); + + // Split recent disputes in ACTIVE and INACTIVE + let time_now = &secs_since_epoch(); + let (active, inactive): ( + Vec<(SessionIndex, CandidateHash, DisputeStatus)>, + Vec<(SessionIndex, CandidateHash, DisputeStatus)>, + ) = unique_recent + .into_iter() + .map(|((session_index, candidate_hash), dispute_state)| { + (session_index, candidate_hash, dispute_state) + }) + .partition(|(_, _, status)| !dispute_is_inactive(status, time_now)); + + // Split ACTIVE in three groups... + for (session_index, candidate_hash, _) in active { + match onchain.get(&(session_index, candidate_hash)) { + Some(d) => match concluded_onchain(d) { + true => partitioned.active_concluded_onchain.push((session_index, candidate_hash)), + false => + partitioned.active_unconcluded_onchain.push((session_index, candidate_hash)), + }, + None => partitioned.active_unknown_onchain.push((session_index, candidate_hash)), + }; + } + + // ... and INACTIVE in three more + for (session_index, candidate_hash, _) in inactive { + match onchain.get(&(session_index, candidate_hash)) { + Some(onchain_state) => + if concluded_onchain(onchain_state) { + partitioned.inactive_concluded_onchain.push((session_index, candidate_hash)); + } else { + partitioned.inactive_unconcluded_onchain.push((session_index, candidate_hash)); + }, + None => partitioned.inactive_unknown_onchain.push((session_index, candidate_hash)), + } + } + + partitioned +} + +// Helper trait to obtain the value of vote for `InvalidDisputeStatementKind` and `ValidDisputeStatementKind`. +// The alternative was to pass a bool to `fn is_vote_worth_to_keep` explicitly but it's pointless as the value is already 'encoded' in the type. +trait VoteType { + fn is_valid() -> bool; +} + +impl VoteType for InvalidDisputeStatementKind { + fn is_valid() -> bool { + false + } +} + +impl VoteType for ValidDisputeStatementKind { + fn is_valid() -> bool { + true + } +} + +/// Determines if a vote is worth to be kept, based on the onchain disputes +fn is_vote_worth_to_keep( + validator_index: &ValidatorIndex, + _: &T, + onchain_state: &DisputeState, +) -> bool { + let offchain_vote = T::is_valid(); + let in_validators_for = onchain_state + .validators_for + .get(validator_index.0 as usize) + .as_deref() + .copied() + .unwrap_or(false); + let in_validators_against = onchain_state + .validators_against + .get(validator_index.0 as usize) + .as_deref() + .copied() + .unwrap_or(false); + + if in_validators_for && in_validators_against { + // The validator has double voted and runtime knows about this. Ignore this vote. + return false + } + + if offchain_vote && in_validators_against || !offchain_vote && in_validators_for { + // offchain vote differs from the onchain vote + // we need this vote to punish the offending validator + return true + } + + // The vote is valid. Return true if it is not seen onchain. + !in_validators_for && !in_validators_against +} + +/// Request disputes identified by `CandidateHash` and the `SessionIndex`. +async fn request_disputes( + sender: &mut impl overseer::ProvisionerSenderTrait, +) -> Vec<(SessionIndex, CandidateHash, DisputeStatus)> { + let (tx, rx) = oneshot::channel(); + let msg = DisputeCoordinatorMessage::RecentDisputes(tx); + + // Bounded by block production - `ProvisionerMessage::RequestInherentData`. + sender.send_unbounded_message(msg); + + let recent_disputes = rx.await.unwrap_or_else(|err| { + gum::warn!(target: LOG_TARGET, err=?err, "Unable to gather recent disputes"); + Vec::new() + }); + recent_disputes +} + +// This function produces the return value for `pub fn select_disputes()` +fn into_multi_dispute_statement_set( + metrics: &metrics::Metrics, + dispute_candidate_votes: BTreeMap<(SessionIndex, CandidateHash), CandidateVotes>, +) -> Result { + // Transform all `CandidateVotes` into `MultiDisputeStatementSet`. + Ok(dispute_candidate_votes + .into_iter() + .map(|((session_index, candidate_hash), votes)| { + let valid_statements = votes + .valid + .into_iter() + .map(|(i, (s, sig))| (DisputeStatement::Valid(s), i, sig)); + + let invalid_statements = votes + .invalid + .into_iter() + .map(|(i, (s, sig))| (DisputeStatement::Invalid(s), i, sig)); + + metrics.inc_valid_statements_by(valid_statements.len()); + metrics.inc_invalid_statements_by(invalid_statements.len()); + metrics.inc_dispute_statement_sets_by(1); + + DisputeStatementSet { + candidate_hash, + session: session_index, + statements: valid_statements.chain(invalid_statements).collect(), + } + }) + .collect()) +} + +/// Gets the on-chain disputes at a given block number and returns them as a `HashMap` so that searching in them is cheap. +pub async fn get_onchain_disputes( + sender: &mut Sender, + relay_parent: Hash, +) -> Result, GetOnchainDisputesError> +where + Sender: overseer::ProvisionerSenderTrait, +{ + gum::trace!(target: LOG_TARGET, ?relay_parent, "Fetching on-chain disputes"); + let (tx, rx) = oneshot::channel(); + sender + .send_message(RuntimeApiMessage::Request(relay_parent, RuntimeApiRequest::Disputes(tx))) + .await; + + rx.await + .map_err(|_| GetOnchainDisputesError::Channel) + .and_then(|res| { + res.map_err(|e| match e { + RuntimeApiError::Execution { .. } => + GetOnchainDisputesError::Execution(e, relay_parent), + RuntimeApiError::NotSupported { .. } => + GetOnchainDisputesError::NotSupported(e, relay_parent), + }) + }) + .map(|v| v.into_iter().map(|e| ((e.0, e.1), e.2)).collect()) +} diff --git a/node/core/provisioner/src/disputes/prioritized_selection/tests.rs b/node/core/provisioner/src/disputes/prioritized_selection/tests.rs new file mode 100644 index 000000000000..6b7f99a1df07 --- /dev/null +++ b/node/core/provisioner/src/disputes/prioritized_selection/tests.rs @@ -0,0 +1,706 @@ +// Copyright 2017-2022 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use super::super::{ + super::{tests::common::test_harness, *}, + prioritized_selection::*, +}; +use bitvec::prelude::*; +use futures::channel::mpsc; +use polkadot_node_primitives::{CandidateVotes, DisputeStatus, ACTIVE_DURATION_SECS}; +use polkadot_node_subsystem::messages::{ + AllMessages, DisputeCoordinatorMessage, RuntimeApiMessage, RuntimeApiRequest, +}; +use polkadot_node_subsystem_test_helpers::TestSubsystemSender; +use polkadot_primitives::v2::{ + CandidateHash, DisputeState, InvalidDisputeStatementKind, SessionIndex, + ValidDisputeStatementKind, ValidatorSignature, +}; +use std::sync::Arc; +use test_helpers; + +// +// Unit tests for various functions +// +#[test] +fn should_keep_vote_behaves() { + let onchain_state = DisputeState { + validators_for: bitvec![u8, Lsb0; 1, 0, 1, 0, 1], + validators_against: bitvec![u8, Lsb0; 0, 1, 0, 0, 1], + start: 1, + concluded_at: None, + }; + + let local_valid_known = (ValidatorIndex(0), ValidDisputeStatementKind::Explicit); + let local_valid_unknown = (ValidatorIndex(3), ValidDisputeStatementKind::Explicit); + + let local_invalid_known = (ValidatorIndex(1), InvalidDisputeStatementKind::Explicit); + let local_invalid_unknown = (ValidatorIndex(3), InvalidDisputeStatementKind::Explicit); + + assert_eq!( + is_vote_worth_to_keep(&local_valid_known.0, &local_valid_known.1, &onchain_state), + false + ); + assert_eq!( + is_vote_worth_to_keep(&local_valid_unknown.0, &local_valid_unknown.1, &onchain_state), + true + ); + assert_eq!( + is_vote_worth_to_keep(&local_invalid_known.0, &local_invalid_known.1, &onchain_state), + false + ); + assert_eq!( + is_vote_worth_to_keep(&local_invalid_unknown.0, &local_invalid_unknown.1, &onchain_state), + true + ); + + //double voting - onchain knows + let local_double_vote_onchain_knows = + (ValidatorIndex(4), InvalidDisputeStatementKind::Explicit); + assert_eq!( + is_vote_worth_to_keep( + &local_double_vote_onchain_knows.0, + &local_double_vote_onchain_knows.1, + &onchain_state + ), + false + ); + + //double voting - onchain doesn't know + let local_double_vote_onchain_doesnt_knows = + (ValidatorIndex(0), InvalidDisputeStatementKind::Explicit); + assert_eq!( + is_vote_worth_to_keep( + &local_double_vote_onchain_doesnt_knows.0, + &local_double_vote_onchain_doesnt_knows.1, + &onchain_state + ), + true + ); + + // empty onchain state + let empty_onchain_state = DisputeState { + validators_for: BitVec::new(), + validators_against: BitVec::new(), + start: 1, + concluded_at: None, + }; + assert_eq!( + is_vote_worth_to_keep( + &local_double_vote_onchain_doesnt_knows.0, + &local_double_vote_onchain_doesnt_knows.1, + &empty_onchain_state + ), + true + ); +} + +#[test] +fn partitioning_happy_case() { + let mut input = Vec::<(SessionIndex, CandidateHash, DisputeStatus)>::new(); + let mut onchain = HashMap::<(u32, CandidateHash), DisputeState>::new(); + let time_now = secs_since_epoch(); + + // Create one dispute for each partition + let inactive_unknown_onchain = ( + 0, + CandidateHash(Hash::random()), + DisputeStatus::ConcludedFor(time_now - ACTIVE_DURATION_SECS * 2), + ); + input.push(inactive_unknown_onchain.clone()); + + let inactive_unconcluded_onchain = ( + 1, + CandidateHash(Hash::random()), + DisputeStatus::ConcludedFor(time_now - ACTIVE_DURATION_SECS * 2), + ); + input.push(inactive_unconcluded_onchain.clone()); + onchain.insert( + (inactive_unconcluded_onchain.0, inactive_unconcluded_onchain.1.clone()), + DisputeState { + validators_for: bitvec![u8, Lsb0; 1, 1, 1, 0, 0, 0, 0, 0, 0], + validators_against: bitvec![u8, Lsb0; 0, 0, 0, 0, 0, 0, 0, 0, 0], + start: 1, + concluded_at: None, + }, + ); + + let active_unknown_onchain = (2, CandidateHash(Hash::random()), DisputeStatus::Active); + input.push(active_unknown_onchain.clone()); + + let active_unconcluded_onchain = (3, CandidateHash(Hash::random()), DisputeStatus::Active); + input.push(active_unconcluded_onchain.clone()); + onchain.insert( + (active_unconcluded_onchain.0, active_unconcluded_onchain.1.clone()), + DisputeState { + validators_for: bitvec![u8, Lsb0; 1, 1, 1, 0, 0, 0, 0, 0, 0], + validators_against: bitvec![u8, Lsb0; 0, 0, 0, 0, 0, 0, 0, 0, 0], + start: 1, + concluded_at: None, + }, + ); + + let active_concluded_onchain = (4, CandidateHash(Hash::random()), DisputeStatus::Active); + input.push(active_concluded_onchain.clone()); + onchain.insert( + (active_concluded_onchain.0, active_concluded_onchain.1.clone()), + DisputeState { + validators_for: bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1, 0], + validators_against: bitvec![u8, Lsb0; 0, 0, 0, 0, 0, 0, 0, 0, 0], + start: 1, + concluded_at: Some(3), + }, + ); + + let inactive_concluded_onchain = ( + 5, + CandidateHash(Hash::random()), + DisputeStatus::ConcludedFor(time_now - ACTIVE_DURATION_SECS * 2), + ); + input.push(inactive_concluded_onchain.clone()); + onchain.insert( + (inactive_concluded_onchain.0, inactive_concluded_onchain.1.clone()), + DisputeState { + validators_for: bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 0, 0], + validators_against: bitvec![u8, Lsb0; 0, 0, 0, 0, 0, 0, 0, 0, 0], + start: 1, + concluded_at: Some(3), + }, + ); + + let result = partition_recent_disputes(input, &onchain); + + // Check results + assert_eq!(result.inactive_unknown_onchain.len(), 1); + assert_eq!( + result.inactive_unknown_onchain.get(0).unwrap(), + &(inactive_unknown_onchain.0, inactive_unknown_onchain.1) + ); + + assert_eq!(result.inactive_unconcluded_onchain.len(), 1); + assert_eq!( + result.inactive_unconcluded_onchain.get(0).unwrap(), + &(inactive_unconcluded_onchain.0, inactive_unconcluded_onchain.1) + ); + + assert_eq!(result.active_unknown_onchain.len(), 1); + assert_eq!( + result.active_unknown_onchain.get(0).unwrap(), + &(active_unknown_onchain.0, active_unknown_onchain.1) + ); + + assert_eq!(result.active_unconcluded_onchain.len(), 1); + assert_eq!( + result.active_unconcluded_onchain.get(0).unwrap(), + &(active_unconcluded_onchain.0, active_unconcluded_onchain.1) + ); + + assert_eq!(result.active_concluded_onchain.len(), 1); + assert_eq!( + result.active_concluded_onchain.get(0).unwrap(), + &(active_concluded_onchain.0, active_concluded_onchain.1) + ); + + assert_eq!(result.inactive_concluded_onchain.len(), 1); + assert_eq!( + result.inactive_concluded_onchain.get(0).unwrap(), + &(inactive_concluded_onchain.0, inactive_concluded_onchain.1) + ); +} + +// This test verifies the double voting behavior. Currently we don't care if a supermajority is achieved with or +// without the 'help' of a double vote (a validator voting for and against at the same time). This makes the test +// a bit pointless but anyway I'm leaving it here to make this decision explicit and have the test code ready in +// case this behavior needs to be further tested in the future. +// Link to the PR with the discussions: https://github.com/paritytech/polkadot/pull/5567 +#[test] +fn partitioning_doubled_onchain_vote() { + let mut input = Vec::<(SessionIndex, CandidateHash, DisputeStatus)>::new(); + let mut onchain = HashMap::<(u32, CandidateHash), DisputeState>::new(); + + // Dispute A relies on a 'double onchain vote' to conclude. Validator with index 0 has voted both `for` and `against`. + // Despite that this dispute should be considered 'can conclude onchain'. + let dispute_a = (3, CandidateHash(Hash::random()), DisputeStatus::Active); + // Dispute B has supermajority + 1 votes, so the doubled onchain vote doesn't affect it. It should be considered + // as 'can conclude onchain'. + let dispute_b = (4, CandidateHash(Hash::random()), DisputeStatus::Active); + input.push(dispute_a.clone()); + input.push(dispute_b.clone()); + onchain.insert( + (dispute_a.0, dispute_a.1.clone()), + DisputeState { + validators_for: bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 0, 0], + validators_against: bitvec![u8, Lsb0; 1, 0, 0, 0, 0, 0, 0, 0, 0], + start: 1, + concluded_at: None, + }, + ); + onchain.insert( + (dispute_b.0, dispute_b.1.clone()), + DisputeState { + validators_for: bitvec![u8, Lsb0; 1, 1, 1, 1, 1, 1, 1, 1, 0], + validators_against: bitvec![u8, Lsb0; 1, 0, 0, 0, 0, 0, 0, 0, 0], + start: 1, + concluded_at: None, + }, + ); + + let result = partition_recent_disputes(input, &onchain); + + assert_eq!(result.active_unconcluded_onchain.len(), 0); + assert_eq!(result.active_concluded_onchain.len(), 2); +} + +#[test] +fn partitioning_duplicated_dispute() { + let mut input = Vec::<(SessionIndex, CandidateHash, DisputeStatus)>::new(); + let mut onchain = HashMap::<(u32, CandidateHash), DisputeState>::new(); + + let some_dispute = (3, CandidateHash(Hash::random()), DisputeStatus::Active); + input.push(some_dispute.clone()); + input.push(some_dispute.clone()); + onchain.insert( + (some_dispute.0, some_dispute.1.clone()), + DisputeState { + validators_for: bitvec![u8, Lsb0; 1, 1, 1, 0, 0, 0, 0, 0, 0], + validators_against: bitvec![u8, Lsb0; 0, 0, 0, 0, 0, 0, 0, 0, 0], + start: 1, + concluded_at: None, + }, + ); + + let result = partition_recent_disputes(input, &onchain); + + assert_eq!(result.active_unconcluded_onchain.len(), 1); + assert_eq!( + result.active_unconcluded_onchain.get(0).unwrap(), + &(some_dispute.0, some_dispute.1) + ); +} + +// +// end-to-end tests for select_disputes() +// + +async fn mock_overseer( + mut receiver: mpsc::UnboundedReceiver, + disputes_db: &mut TestDisputes, + vote_queries_count: &mut usize, +) { + while let Some(from_job) = receiver.next().await { + match from_job { + AllMessages::RuntimeApi(RuntimeApiMessage::Request( + _, + RuntimeApiRequest::Disputes(sender), + )) => { + let _ = sender.send(Ok(disputes_db + .onchain_disputes + .clone() + .into_iter() + .map(|(k, v)| (k.0, k.1, v)) + .collect::>())); + }, + AllMessages::RuntimeApi(_) => panic!("Unexpected RuntimeApi request"), + AllMessages::DisputeCoordinator(DisputeCoordinatorMessage::RecentDisputes(sender)) => { + let _ = sender.send(disputes_db.local_disputes.clone()); + }, + AllMessages::DisputeCoordinator(DisputeCoordinatorMessage::QueryCandidateVotes( + disputes, + sender, + )) => { + *vote_queries_count += 1; + let mut res = Vec::new(); + for d in disputes.iter() { + let v = disputes_db.votes_db.get(d).unwrap().clone(); + res.push((d.0, d.1, v)); + } + + let _ = sender.send(res); + }, + _ => panic!("Unexpected message: {:?}", from_job), + } + } +} + +fn leaf() -> ActivatedLeaf { + ActivatedLeaf { + hash: Hash::repeat_byte(0xAA), + number: 0xAA, + status: LeafStatus::Fresh, + span: Arc::new(jaeger::Span::Disabled), + } +} + +struct TestDisputes { + pub local_disputes: Vec<(SessionIndex, CandidateHash, DisputeStatus)>, + pub votes_db: HashMap<(SessionIndex, CandidateHash), CandidateVotes>, + pub onchain_disputes: HashMap<(u32, CandidateHash), DisputeState>, + validators_count: usize, +} + +impl TestDisputes { + pub fn new(validators_count: usize) -> TestDisputes { + TestDisputes { + local_disputes: Vec::<(SessionIndex, CandidateHash, DisputeStatus)>::new(), + votes_db: HashMap::<(SessionIndex, CandidateHash), CandidateVotes>::new(), + onchain_disputes: HashMap::<(u32, CandidateHash), DisputeState>::new(), + validators_count, + } + } + + // Offchain disputes are on node side + fn add_offchain_dispute( + &mut self, + dispute: (SessionIndex, CandidateHash, DisputeStatus), + local_votes_count: usize, + dummy_receipt: CandidateReceipt, + ) { + self.local_disputes.push(dispute.clone()); + self.votes_db.insert( + (dispute.0, dispute.1), + CandidateVotes { + candidate_receipt: dummy_receipt, + valid: TestDisputes::generate_local_votes( + ValidDisputeStatementKind::Explicit, + 0, + local_votes_count, + ), + invalid: BTreeMap::new(), + }, + ); + } + + fn add_onchain_dispute( + &mut self, + dispute: (SessionIndex, CandidateHash, DisputeStatus), + onchain_votes_count: usize, + ) { + let concluded_at = match dispute.2 { + DisputeStatus::Active | DisputeStatus::Confirmed => None, + DisputeStatus::ConcludedAgainst(_) | DisputeStatus::ConcludedFor(_) => Some(1), + }; + self.onchain_disputes.insert( + (dispute.0, dispute.1.clone()), + DisputeState { + validators_for: TestDisputes::generate_bitvec( + self.validators_count, + 0, + onchain_votes_count, + ), + validators_against: bitvec![u8, Lsb0; 0; self.validators_count], + start: 1, + concluded_at, + }, + ); + } + + pub fn add_unconfirmed_disputes_concluded_onchain( + &mut self, + dispute_count: usize, + ) -> (u32, usize) { + let local_votes_count = self.validators_count * 90 / 100; + let onchain_votes_count = self.validators_count * 80 / 100; + let session_idx = 0; + let lf = leaf(); + let dummy_receipt = test_helpers::dummy_candidate_receipt(lf.hash.clone()); + for _ in 0..dispute_count { + let d = (session_idx, CandidateHash(Hash::random()), DisputeStatus::Active); + self.add_offchain_dispute(d.clone(), local_votes_count, dummy_receipt.clone()); + self.add_onchain_dispute(d, onchain_votes_count); + } + + (session_idx, (local_votes_count - onchain_votes_count) * dispute_count) + } + + pub fn add_unconfirmed_disputes_unconcluded_onchain( + &mut self, + dispute_count: usize, + ) -> (u32, usize) { + let local_votes_count = self.validators_count * 90 / 100; + let onchain_votes_count = self.validators_count * 40 / 100; + let session_idx = 1; + let lf = leaf(); + let dummy_receipt = test_helpers::dummy_candidate_receipt(lf.hash.clone()); + for _ in 0..dispute_count { + let d = (session_idx, CandidateHash(Hash::random()), DisputeStatus::Active); + self.add_offchain_dispute(d.clone(), local_votes_count, dummy_receipt.clone()); + self.add_onchain_dispute(d, onchain_votes_count); + } + + (session_idx, (local_votes_count - onchain_votes_count) * dispute_count) + } + + pub fn add_unconfirmed_disputes_unknown_onchain( + &mut self, + dispute_count: usize, + ) -> (u32, usize) { + let local_votes_count = self.validators_count * 90 / 100; + let session_idx = 2; + let lf = leaf(); + let dummy_receipt = test_helpers::dummy_candidate_receipt(lf.hash.clone()); + for _ in 0..dispute_count { + let d = (session_idx, CandidateHash(Hash::random()), DisputeStatus::Active); + self.add_offchain_dispute(d.clone(), local_votes_count, dummy_receipt.clone()); + } + (session_idx, local_votes_count * dispute_count) + } + + pub fn add_concluded_disputes_known_onchain(&mut self, dispute_count: usize) -> (u32, usize) { + let local_votes_count = self.validators_count * 90 / 100; + let onchain_votes_count = self.validators_count * 75 / 100; + let session_idx = 3; + let lf = leaf(); + let dummy_receipt = test_helpers::dummy_candidate_receipt(lf.hash.clone()); + for _ in 0..dispute_count { + let d = (session_idx, CandidateHash(Hash::random()), DisputeStatus::ConcludedFor(0)); + self.add_offchain_dispute(d.clone(), local_votes_count, dummy_receipt.clone()); + self.add_onchain_dispute(d, onchain_votes_count); + } + (session_idx, (local_votes_count - onchain_votes_count) * dispute_count) + } + + pub fn add_concluded_disputes_unknown_onchain(&mut self, dispute_count: usize) -> (u32, usize) { + let local_votes_count = self.validators_count * 90 / 100; + let session_idx = 4; + let lf = leaf(); + let dummy_receipt = test_helpers::dummy_candidate_receipt(lf.hash.clone()); + for _ in 0..dispute_count { + let d = (session_idx, CandidateHash(Hash::random()), DisputeStatus::ConcludedFor(0)); + self.add_offchain_dispute(d.clone(), local_votes_count, dummy_receipt.clone()); + } + (session_idx, local_votes_count * dispute_count) + } + + fn generate_local_votes( + statement_kind: T, + start_idx: usize, + count: usize, + ) -> BTreeMap { + assert!(start_idx < count); + (start_idx..count) + .map(|idx| { + ( + ValidatorIndex(idx as u32), + (statement_kind.clone(), test_helpers::dummy_signature()), + ) + }) + .collect::>() + } + + fn generate_bitvec( + validator_count: usize, + start_idx: usize, + count: usize, + ) -> BitVec { + assert!(start_idx < count); + assert!(start_idx + count < validator_count); + let mut res = bitvec![u8, Lsb0; 0; validator_count]; + for idx in start_idx..count { + res.set(idx, true); + } + + res + } +} + +#[test] +fn normal_flow() { + const VALIDATOR_COUNT: usize = 10; + const DISPUTES_PER_BATCH: usize = 2; + const ACCEPTABLE_RUNTIME_VOTES_QUERIES_COUNT: usize = 1; + + let mut input = TestDisputes::new(VALIDATOR_COUNT); + + // active, concluded onchain + let (third_idx, third_votes) = + input.add_unconfirmed_disputes_concluded_onchain(DISPUTES_PER_BATCH); + + // active unconcluded onchain + let (first_idx, first_votes) = + input.add_unconfirmed_disputes_unconcluded_onchain(DISPUTES_PER_BATCH); + + //concluded disputes unknown onchain + let (fifth_idx, fifth_votes) = input.add_concluded_disputes_unknown_onchain(DISPUTES_PER_BATCH); + + // concluded disputes known onchain - these should be ignored + let (_, _) = input.add_concluded_disputes_known_onchain(DISPUTES_PER_BATCH); + + // active disputes unknown onchain + let (second_idx, second_votes) = + input.add_unconfirmed_disputes_unknown_onchain(DISPUTES_PER_BATCH); + + let metrics = metrics::Metrics::new_dummy(); + let mut vote_queries: usize = 0; + test_harness( + |r| mock_overseer(r, &mut input, &mut vote_queries), + |mut tx: TestSubsystemSender| async move { + let lf = leaf(); + let result = select_disputes(&mut tx, &metrics, &lf).await.unwrap(); + + assert!(!result.is_empty()); + + assert_eq!(result.len(), 4 * DISPUTES_PER_BATCH); + + // Naive checks that the result is partitioned correctly + let (first_batch, rest): (Vec, Vec) = + result.into_iter().partition(|d| d.session == first_idx); + assert_eq!(first_batch.len(), DISPUTES_PER_BATCH); + + let (second_batch, rest): (Vec, Vec) = + rest.into_iter().partition(|d| d.session == second_idx); + assert_eq!(second_batch.len(), DISPUTES_PER_BATCH); + + let (third_batch, rest): (Vec, Vec) = + rest.into_iter().partition(|d| d.session == third_idx); + assert_eq!(third_batch.len(), DISPUTES_PER_BATCH); + + let (fifth_batch, rest): (Vec, Vec) = + rest.into_iter().partition(|d| d.session == fifth_idx); + assert_eq!(fifth_batch.len(), DISPUTES_PER_BATCH); + + // Ensure there are no more disputes - fourth_batch should be dropped + assert_eq!(rest.len(), 0); + + assert_eq!( + first_batch.iter().map(|d| d.statements.len()).fold(0, |acc, v| acc + v), + first_votes + ); + assert_eq!( + second_batch.iter().map(|d| d.statements.len()).fold(0, |acc, v| acc + v), + second_votes + ); + assert_eq!( + third_batch.iter().map(|d| d.statements.len()).fold(0, |acc, v| acc + v), + third_votes + ); + assert_eq!( + fifth_batch.iter().map(|d| d.statements.len()).fold(0, |acc, v| acc + v), + fifth_votes + ); + }, + ); + assert!(vote_queries <= ACCEPTABLE_RUNTIME_VOTES_QUERIES_COUNT); +} + +#[test] +fn many_batches() { + const VALIDATOR_COUNT: usize = 10; + const DISPUTES_PER_PARTITION: usize = 10; + + // 10 disputes per partition * 4 partitions = 40 disputes + // BATCH_SIZE = 11 + // => There should be no more than 40 / 11 queries ( ~4 ) + const ACCEPTABLE_RUNTIME_VOTES_QUERIES_COUNT: usize = 4; + + let mut input = TestDisputes::new(VALIDATOR_COUNT); + + // active which can conclude onchain + input.add_unconfirmed_disputes_concluded_onchain(DISPUTES_PER_PARTITION); + + // active which can't conclude onchain + input.add_unconfirmed_disputes_unconcluded_onchain(DISPUTES_PER_PARTITION); + + //concluded disputes unknown onchain + input.add_concluded_disputes_unknown_onchain(DISPUTES_PER_PARTITION); + + // concluded disputes known onchain + input.add_concluded_disputes_known_onchain(DISPUTES_PER_PARTITION); + + // active disputes unknown onchain + input.add_unconfirmed_disputes_unknown_onchain(DISPUTES_PER_PARTITION); + + let metrics = metrics::Metrics::new_dummy(); + let mut vote_queries: usize = 0; + test_harness( + |r| mock_overseer(r, &mut input, &mut vote_queries), + |mut tx: TestSubsystemSender| async move { + let lf = leaf(); + let result = select_disputes(&mut tx, &metrics, &lf).await.unwrap(); + + assert!(!result.is_empty()); + + let vote_count = result.iter().map(|d| d.statements.len()).fold(0, |acc, v| acc + v); + + assert!( + MAX_DISPUTE_VOTES_FORWARDED_TO_RUNTIME - VALIDATOR_COUNT <= vote_count && + vote_count <= MAX_DISPUTE_VOTES_FORWARDED_TO_RUNTIME, + "vote_count: {}", + vote_count + ); + }, + ); + + assert!( + vote_queries <= ACCEPTABLE_RUNTIME_VOTES_QUERIES_COUNT, + "vote_queries: {} ACCEPTABLE_RUNTIME_VOTES_QUERIES_COUNT: {}", + vote_queries, + ACCEPTABLE_RUNTIME_VOTES_QUERIES_COUNT + ); +} + +#[test] +fn votes_above_limit() { + const VALIDATOR_COUNT: usize = 10; + const DISPUTES_PER_PARTITION: usize = 50; + const ACCEPTABLE_RUNTIME_VOTES_QUERIES_COUNT: usize = 4; + + let mut input = TestDisputes::new(VALIDATOR_COUNT); + + // active which can conclude onchain + let (_, second_votes) = + input.add_unconfirmed_disputes_concluded_onchain(DISPUTES_PER_PARTITION); + + // active which can't conclude onchain + let (_, first_votes) = + input.add_unconfirmed_disputes_unconcluded_onchain(DISPUTES_PER_PARTITION); + + //concluded disputes unknown onchain + let (_, third_votes) = input.add_concluded_disputes_unknown_onchain(DISPUTES_PER_PARTITION); + + assert!( + first_votes + second_votes + third_votes > 3 * MAX_DISPUTE_VOTES_FORWARDED_TO_RUNTIME, + "Total relevant votes generated: {}", + first_votes + second_votes + third_votes + ); + + let metrics = metrics::Metrics::new_dummy(); + let mut vote_queries: usize = 0; + test_harness( + |r| mock_overseer(r, &mut input, &mut vote_queries), + |mut tx: TestSubsystemSender| async move { + let lf = leaf(); + let result = select_disputes(&mut tx, &metrics, &lf).await.unwrap(); + + assert!(!result.is_empty()); + + let vote_count = result.iter().map(|d| d.statements.len()).fold(0, |acc, v| acc + v); + + assert!( + MAX_DISPUTE_VOTES_FORWARDED_TO_RUNTIME - VALIDATOR_COUNT <= vote_count && + vote_count <= MAX_DISPUTE_VOTES_FORWARDED_TO_RUNTIME, + "vote_count: {}", + vote_count + ); + }, + ); + + assert!( + vote_queries <= ACCEPTABLE_RUNTIME_VOTES_QUERIES_COUNT, + "vote_queries: {} ACCEPTABLE_RUNTIME_VOTES_QUERIES_COUNT: {}", + vote_queries, + ACCEPTABLE_RUNTIME_VOTES_QUERIES_COUNT + ); +} diff --git a/node/core/provisioner/src/disputes/random_selection/mod.rs b/node/core/provisioner/src/disputes/random_selection/mod.rs new file mode 100644 index 000000000000..96a00801cbac --- /dev/null +++ b/node/core/provisioner/src/disputes/random_selection/mod.rs @@ -0,0 +1,194 @@ +// Copyright 2017-2022 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! This module selects all RECENT disputes, fetches the votes for them from dispute-coordinator and +//! returns them as MultiDisputeStatementSet. If the RECENT disputes are more than +//! `MAX_DISPUTES_FORWARDED_TO_RUNTIME` constant - the ACTIVE disputes plus a random selection of +//! RECENT disputes (up to `MAX_DISPUTES_FORWARDED_TO_RUNTIME`) are returned instead. +//! If the ACTIVE disputes are also above `MAX_DISPUTES_FORWARDED_TO_RUNTIME` limit - a random selection +//! of them is generated. + +use crate::{error::Error, metrics, LOG_TARGET}; +use futures::channel::oneshot; +use polkadot_node_subsystem::{messages::DisputeCoordinatorMessage, overseer}; +use polkadot_primitives::v2::{ + CandidateHash, DisputeStatement, DisputeStatementSet, MultiDisputeStatementSet, SessionIndex, +}; +use std::collections::HashSet; + +#[derive(Debug)] +enum RequestType { + /// Query recent disputes, could be an excessive amount. + Recent, + /// Query the currently active and very recently concluded disputes. + Active, +} + +/// Request open disputes identified by `CandidateHash` and the `SessionIndex`. +async fn request_disputes( + sender: &mut impl overseer::ProvisionerSenderTrait, + active_or_recent: RequestType, +) -> Vec<(SessionIndex, CandidateHash)> { + let disputes = match active_or_recent { + RequestType::Recent => { + let (tx, rx) = oneshot::channel(); + let msg = DisputeCoordinatorMessage::RecentDisputes(tx); + sender.send_unbounded_message(msg); + let recent_disputes = match rx.await { + Ok(r) => r, + Err(oneshot::Canceled) => { + gum::warn!( + target: LOG_TARGET, + "Unable to gather {:?} disputes", + active_or_recent + ); + Vec::new() + }, + }; + recent_disputes + .into_iter() + .map(|(sesion_idx, candodate_hash, _)| (sesion_idx, candodate_hash)) + .collect::>() + }, + RequestType::Active => { + let (tx, rx) = oneshot::channel(); + let msg = DisputeCoordinatorMessage::ActiveDisputes(tx); + sender.send_unbounded_message(msg); + let active_disputes = match rx.await { + Ok(r) => r, + Err(oneshot::Canceled) => { + gum::warn!( + target: LOG_TARGET, + "Unable to gather {:?} disputes", + active_or_recent + ); + Vec::new() + }, + }; + active_disputes + }, + }; + + disputes +} + +/// Extend `acc` by `n` random, picks of not-yet-present in `acc` items of `recent` without repetition and additions of recent. +fn extend_by_random_subset_without_repetition( + acc: &mut Vec<(SessionIndex, CandidateHash)>, + extension: Vec<(SessionIndex, CandidateHash)>, + n: usize, +) { + use rand::Rng; + + let lut = acc.iter().cloned().collect::>(); + + let mut unique_new = + extension.into_iter().filter(|recent| !lut.contains(recent)).collect::>(); + + // we can simply add all + if unique_new.len() <= n { + acc.extend(unique_new) + } else { + acc.reserve(n); + let mut rng = rand::thread_rng(); + for _ in 0..n { + let idx = rng.gen_range(0..unique_new.len()); + acc.push(unique_new.swap_remove(idx)); + } + } + // assure sorting stays candid according to session index + acc.sort_unstable_by(|a, b| a.0.cmp(&b.0)); +} + +pub async fn select_disputes( + sender: &mut Sender, + metrics: &metrics::Metrics, +) -> Result +where + Sender: overseer::ProvisionerSenderTrait, +{ + gum::trace!(target: LOG_TARGET, "Selecting disputes for inherent data using random selection"); + + /// The maximum number of disputes Provisioner will include in the inherent data. + /// Serves as a protection not to flood the Runtime with excessive data. + const MAX_DISPUTES_FORWARDED_TO_RUNTIME: usize = 1_000; + + // We use `RecentDisputes` instead of `ActiveDisputes` because redundancy is fine. + // It's heavier than `ActiveDisputes` but ensures that everything from the dispute + // window gets on-chain, unlike `ActiveDisputes`. + // In case of an overload condition, we limit ourselves to active disputes, and fill up to the + // upper bound of disputes to pass to wasm `fn create_inherent_data`. + // If the active ones are already exceeding the bounds, randomly select a subset. + let recent = request_disputes(sender, RequestType::Recent).await; + let disputes = if recent.len() > MAX_DISPUTES_FORWARDED_TO_RUNTIME { + gum::warn!( + target: LOG_TARGET, + "Recent disputes are excessive ({} > {}), reduce to active ones, and selected", + recent.len(), + MAX_DISPUTES_FORWARDED_TO_RUNTIME + ); + let mut active = request_disputes(sender, RequestType::Active).await; + let n_active = active.len(); + let active = if active.len() > MAX_DISPUTES_FORWARDED_TO_RUNTIME { + let mut picked = Vec::with_capacity(MAX_DISPUTES_FORWARDED_TO_RUNTIME); + extend_by_random_subset_without_repetition( + &mut picked, + active, + MAX_DISPUTES_FORWARDED_TO_RUNTIME, + ); + picked + } else { + extend_by_random_subset_without_repetition( + &mut active, + recent, + MAX_DISPUTES_FORWARDED_TO_RUNTIME.saturating_sub(n_active), + ); + active + }; + active + } else { + recent + }; + + // Load all votes for all disputes from the coordinator. + let dispute_candidate_votes = super::request_votes(sender, disputes).await; + + // Transform all `CandidateVotes` into `MultiDisputeStatementSet`. + Ok(dispute_candidate_votes + .into_iter() + .map(|(session_index, candidate_hash, votes)| { + let valid_statements = votes + .valid + .into_iter() + .map(|(i, (s, sig))| (DisputeStatement::Valid(s), i, sig)); + + let invalid_statements = votes + .invalid + .into_iter() + .map(|(i, (s, sig))| (DisputeStatement::Invalid(s), i, sig)); + + metrics.inc_valid_statements_by(valid_statements.len()); + metrics.inc_invalid_statements_by(invalid_statements.len()); + metrics.inc_dispute_statement_sets_by(1); + + DisputeStatementSet { + candidate_hash, + session: session_index, + statements: valid_statements.chain(invalid_statements).collect(), + } + }) + .collect()) +} diff --git a/node/core/provisioner/src/error.rs b/node/core/provisioner/src/error.rs index 05e437854eac..9fb958c4f339 100644 --- a/node/core/provisioner/src/error.rs +++ b/node/core/provisioner/src/error.rs @@ -88,9 +88,7 @@ pub enum GetOnchainDisputesError { #[error("runtime execution error occurred while fetching onchain disputes for parent {1}")] Execution(#[source] RuntimeApiError, Hash), - #[error( - "runtime doesn't support RuntimeApiRequest::Disputes/RuntimeApiRequest::StagingDisputes for parent {1}" - )] + #[error("runtime doesn't support RuntimeApiRequest::Disputes for parent {1}")] NotSupported(#[source] RuntimeApiError, Hash), } diff --git a/node/core/provisioner/src/lib.rs b/node/core/provisioner/src/lib.rs index 0f3099c7df33..0754e2c4c87e 100644 --- a/node/core/provisioner/src/lib.rs +++ b/node/core/provisioner/src/lib.rs @@ -25,29 +25,27 @@ use futures::{ }; use futures_timer::Delay; -use polkadot_node_primitives::CandidateVotes; use polkadot_node_subsystem::{ jaeger, messages::{ - CandidateBackingMessage, ChainApiMessage, DisputeCoordinatorMessage, ProvisionableData, - ProvisionerInherentData, ProvisionerMessage, + CandidateBackingMessage, ChainApiMessage, ProvisionableData, ProvisionerInherentData, + ProvisionerMessage, RuntimeApiMessage, RuntimeApiRequest, }, overseer, ActivatedLeaf, ActiveLeavesUpdate, FromOrchestra, LeafStatus, OverseerSignal, - PerLeafSpan, SpawnedSubsystem, SubsystemError, + PerLeafSpan, RuntimeApiError, SpawnedSubsystem, SubsystemError, }; use polkadot_node_subsystem_util::{ request_availability_cores, request_persisted_validation_data, TimeoutExt, }; use polkadot_primitives::v2::{ - BackedCandidate, BlockNumber, CandidateHash, CandidateReceipt, CoreState, DisputeState, - DisputeStatement, DisputeStatementSet, Hash, MultiDisputeStatementSet, OccupiedCoreAssumption, - SessionIndex, SignedAvailabilityBitfield, ValidatorIndex, + BackedCandidate, BlockNumber, CandidateReceipt, CoreState, Hash, OccupiedCoreAssumption, + SignedAvailabilityBitfield, ValidatorIndex, }; -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; +mod disputes; mod error; mod metrics; -mod onchain_disputes; pub use self::metrics::*; use error::{Error, FatalResult}; @@ -62,6 +60,9 @@ const SEND_INHERENT_DATA_TIMEOUT: std::time::Duration = core::time::Duration::fr const LOG_TARGET: &str = "parachain::provisioner"; +const PRIORITIZED_SELECTION_RUNTIME_VERSION_REQUIREMENT: u32 = + RuntimeApiRequest::DISPUTES_RUNTIME_REQUIREMENT; + /// The provisioner subsystem. pub struct ProvisionerSubsystem { metrics: Metrics, @@ -361,7 +362,18 @@ async fn send_inherent_data( relay_parent = ?leaf.hash, "Selecting disputes" ); - let disputes = select_disputes(from_job, metrics, leaf).await?; + + let disputes = match has_required_runtime( + from_job, + leaf.hash.clone(), + PRIORITIZED_SELECTION_RUNTIME_VERSION_REQUIREMENT, + ) + .await + { + true => disputes::prioritized_selection::select_disputes(from_job, metrics, leaf).await?, + false => disputes::random_selection::select_disputes(from_job, metrics).await?, + }; + gum::trace!( target: LOG_TARGET, relay_parent = ?leaf.hash, @@ -677,275 +689,55 @@ fn bitfields_indicate_availability( 3 * availability.count_ones() >= 2 * availability.len() } -#[derive(Debug)] -enum RequestType { - /// Query recent disputes, could be an excessive amount. - Recent, - /// Query the currently active and very recently concluded disputes. - Active, -} - -/// Request open disputes identified by `CandidateHash` and the `SessionIndex`. -async fn request_disputes( - sender: &mut impl overseer::ProvisionerSenderTrait, - active_or_recent: RequestType, -) -> Vec<(SessionIndex, CandidateHash)> { - let (tx, rx) = oneshot::channel(); - let msg = match active_or_recent { - RequestType::Recent => DisputeCoordinatorMessage::RecentDisputes(tx), - RequestType::Active => DisputeCoordinatorMessage::ActiveDisputes(tx), - }; - // Bounded by block production - `ProvisionerMessage::RequestInherentData`. - sender.send_unbounded_message(msg); - - let recent_disputes = match rx.await { - Ok(r) => r, - Err(oneshot::Canceled) => { - gum::warn!(target: LOG_TARGET, "Unable to gather {:?} disputes", active_or_recent); - Vec::new() - }, - }; - recent_disputes -} - -/// Request the relevant dispute statements for a set of disputes identified by `CandidateHash` and the `SessionIndex`. -async fn request_votes( +// If we have to be absolutely precise here, this method gets the version of the `ParachainHost` api. +// For brevity we'll just call it 'runtime version'. +async fn has_required_runtime( sender: &mut impl overseer::ProvisionerSenderTrait, - disputes_to_query: Vec<(SessionIndex, CandidateHash)>, -) -> Vec<(SessionIndex, CandidateHash, CandidateVotes)> { - // No need to send dummy request, if nothing to request: - if disputes_to_query.is_empty() { - gum::trace!(target: LOG_TARGET, "No disputes, nothing to request - returning empty `Vec`."); + relay_parent: Hash, + required_runtime_version: u32, +) -> bool { + gum::trace!(target: LOG_TARGET, ?relay_parent, "Fetching ParachainHost runtime api version"); - return Vec::new() - } let (tx, rx) = oneshot::channel(); - // Bounded by block production - `ProvisionerMessage::RequestInherentData`. - sender.send_unbounded_message(DisputeCoordinatorMessage::QueryCandidateVotes( - disputes_to_query, - tx, - )); + sender + .send_message(RuntimeApiMessage::Request(relay_parent, RuntimeApiRequest::Version(tx))) + .await; match rx.await { - Ok(v) => v, - Err(oneshot::Canceled) => { - gum::warn!(target: LOG_TARGET, "Unable to query candidate votes"); - Vec::new() + Result::Ok(Ok(runtime_version)) => { + gum::trace!( + target: LOG_TARGET, + ?relay_parent, + ?runtime_version, + ?required_runtime_version, + "Fetched ParachainHost runtime api version" + ); + runtime_version >= required_runtime_version }, - } -} - -/// Extend `acc` by `n` random, picks of not-yet-present in `acc` items of `recent` without repetition and additions of recent. -fn extend_by_random_subset_without_repetition( - acc: &mut Vec<(SessionIndex, CandidateHash)>, - extension: Vec<(SessionIndex, CandidateHash)>, - n: usize, -) { - use rand::Rng; - - let lut = acc.iter().cloned().collect::>(); - - let mut unique_new = - extension.into_iter().filter(|recent| !lut.contains(recent)).collect::>(); - - // we can simply add all - if unique_new.len() <= n { - acc.extend(unique_new) - } else { - acc.reserve(n); - let mut rng = rand::thread_rng(); - for _ in 0..n { - let idx = rng.gen_range(0..unique_new.len()); - acc.push(unique_new.swap_remove(idx)); - } - } - // assure sorting stays candid according to session index - acc.sort_unstable_by(|a, b| a.0.cmp(&b.0)); -} - -/// The maximum number of disputes Provisioner will include in the inherent data. -/// Serves as a protection not to flood the Runtime with excessive data. -const MAX_DISPUTES_FORWARDED_TO_RUNTIME: usize = 1_000; - -async fn select_disputes( - sender: &mut impl overseer::ProvisionerSenderTrait, - metrics: &metrics::Metrics, - _leaf: &ActivatedLeaf, -) -> Result { - // Helper lambda - // Gets the active disputes as input and partitions it in seen and unseen disputes by the Runtime - // Returns as much unseen disputes as possible and optionally some seen disputes up to `MAX_DISPUTES_FORWARDED_TO_RUNTIME` limit. - let generate_unseen_active_subset = - |active: Vec<(SessionIndex, CandidateHash)>, - onchain: HashMap<(SessionIndex, CandidateHash), DisputeState>| - -> Vec<(SessionIndex, CandidateHash)> { - let (seen_onchain, mut unseen_onchain): ( - Vec<(SessionIndex, CandidateHash)>, - Vec<(SessionIndex, CandidateHash)>, - ) = active.into_iter().partition(|d| onchain.contains_key(d)); - - if unseen_onchain.len() > MAX_DISPUTES_FORWARDED_TO_RUNTIME { - // Even unseen on-chain don't fit within the limit. Add as many as possible. - let mut unseen_subset = Vec::with_capacity(MAX_DISPUTES_FORWARDED_TO_RUNTIME); - extend_by_random_subset_without_repetition( - &mut unseen_subset, - unseen_onchain, - MAX_DISPUTES_FORWARDED_TO_RUNTIME, - ); - unseen_subset - } else { - // Add all unseen onchain disputes and as much of the seen ones as there is space. - let n_unseen_onchain = unseen_onchain.len(); - extend_by_random_subset_without_repetition( - &mut unseen_onchain, - seen_onchain, - MAX_DISPUTES_FORWARDED_TO_RUNTIME.saturating_sub(n_unseen_onchain), - ); - unseen_onchain - } - }; - - // Helper lambda - // Extends the active disputes with recent ones up to `MAX_DISPUTES_FORWARDED_TO_RUNTIME` limit. Unseen recent disputes are prioritised. - let generate_active_and_unseen_recent_subset = - |recent: Vec<(SessionIndex, CandidateHash)>, - mut active: Vec<(SessionIndex, CandidateHash)>, - onchain: HashMap<(SessionIndex, CandidateHash), DisputeState>| - -> Vec<(SessionIndex, CandidateHash)> { - let mut n_active = active.len(); - // All active disputes can be sent. Fill the rest of the space with recent ones. - // We assume there is not enough space for all recent disputes. So we prioritise the unseen ones. - let (seen_onchain, unseen_onchain): ( - Vec<(SessionIndex, CandidateHash)>, - Vec<(SessionIndex, CandidateHash)>, - ) = recent.into_iter().partition(|d| onchain.contains_key(d)); - - extend_by_random_subset_without_repetition( - &mut active, - unseen_onchain, - MAX_DISPUTES_FORWARDED_TO_RUNTIME.saturating_sub(n_active), + Result::Ok(Err(RuntimeApiError::Execution { source: error, .. })) => { + gum::trace!( + target: LOG_TARGET, + ?relay_parent, + ?error, + "Execution error while fetching ParachainHost runtime api version" ); - n_active = active.len(); - - if n_active < MAX_DISPUTES_FORWARDED_TO_RUNTIME { - // Looks like we can add some of the seen disputes too - extend_by_random_subset_without_repetition( - &mut active, - seen_onchain, - MAX_DISPUTES_FORWARDED_TO_RUNTIME.saturating_sub(n_active), - ); - } - active - }; - - gum::trace!( - target: LOG_TARGET, - relay_parent = ?_leaf.hash, - "Request recent disputes" - ); - - // We use `RecentDisputes` instead of `ActiveDisputes` because redundancy is fine. - // It's heavier than `ActiveDisputes` but ensures that everything from the dispute - // window gets on-chain, unlike `ActiveDisputes`. - // In case of an overload condition, we limit ourselves to active disputes, and fill up to the - // upper bound of disputes to pass to wasm `fn create_inherent_data`. - // If the active ones are already exceeding the bounds, randomly select a subset. - let recent = request_disputes(sender, RequestType::Recent).await; - - gum::trace!( - target: LOG_TARGET, - relay_paent = ?_leaf.hash, - "Received recent disputes" - ); - - gum::trace!( - target: LOG_TARGET, - relay_paent = ?_leaf.hash, - "Request on chain disputes" - ); - - // On chain disputes are fetched from the runtime. We want to prioritise the inclusion of unknown - // disputes in the inherent data. The call relies on staging Runtime API. If the staging API is not - // enabled in the binary an empty set is generated which doesn't affect the rest of the logic. - let onchain = match onchain_disputes::get_onchain_disputes(sender, _leaf.hash.clone()).await { - Ok(r) => r, - Err(e) => { - gum::debug!( + false + }, + Result::Ok(Err(RuntimeApiError::NotSupported { .. })) => { + gum::trace!( target: LOG_TARGET, - ?e, - "Can't fetch onchain disputes. Will continue with empty onchain disputes set.", + ?relay_parent, + "NotSupported error while fetching ParachainHost runtime api version" ); - HashMap::new() + false }, - }; - - gum::trace!( - target: LOG_TARGET, - relay_paent = ?_leaf.hash, - "Received on chain disputes" - ); - - gum::trace!( - target: LOG_TARGET, - relay_paent = ?_leaf.hash, - "Filtering disputes" - ); - - let disputes = if recent.len() > MAX_DISPUTES_FORWARDED_TO_RUNTIME { - gum::warn!( - target: LOG_TARGET, - "Recent disputes are excessive ({} > {}), reduce to active ones, and selected", - recent.len(), - MAX_DISPUTES_FORWARDED_TO_RUNTIME - ); - let active = request_disputes(sender, RequestType::Active).await; - if active.len() > MAX_DISPUTES_FORWARDED_TO_RUNTIME { - generate_unseen_active_subset(active, onchain) - } else { - generate_active_and_unseen_recent_subset(recent, active, onchain) - } - } else { - recent - }; - - gum::trace!( - target: LOG_TARGET, - relay_paent = ?_leaf.hash, - "Calling `request_votes`" - ); - - // Load all votes for all disputes from the coordinator. - let dispute_candidate_votes = request_votes(sender, disputes).await; - - gum::trace!( - target: LOG_TARGET, - relay_paent = ?_leaf.hash, - "Finished `request_votes`" - ); - - // Transform all `CandidateVotes` into `MultiDisputeStatementSet`. - Ok(dispute_candidate_votes - .into_iter() - .map(|(session_index, candidate_hash, votes)| { - let valid_statements = votes - .valid - .into_iter() - .map(|(i, (s, sig))| (DisputeStatement::Valid(s), i, sig)); - - let invalid_statements = votes - .invalid - .into_iter() - .map(|(i, (s, sig))| (DisputeStatement::Invalid(s), i, sig)); - - metrics.inc_valid_statements_by(valid_statements.len()); - metrics.inc_invalid_statements_by(invalid_statements.len()); - metrics.inc_dispute_statement_sets_by(1); - - DisputeStatementSet { - candidate_hash, - session: session_index, - statements: valid_statements.chain(invalid_statements).collect(), - } - }) - .collect()) + Result::Err(_) => { + gum::trace!( + target: LOG_TARGET, + ?relay_parent, + "Cancelled error while fetching ParachainHost runtime api version" + ); + false + }, + } } diff --git a/node/core/provisioner/src/metrics.rs b/node/core/provisioner/src/metrics.rs index 829293dfed53..b8032a32f89d 100644 --- a/node/core/provisioner/src/metrics.rs +++ b/node/core/provisioner/src/metrics.rs @@ -14,6 +14,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . +use crate::disputes::prioritized_selection::PartitionedDisputes; use polkadot_node_subsystem_util::metrics::{self, prometheus}; #[derive(Clone)] @@ -28,6 +29,9 @@ struct MetricsInner { /// 4 hours on Polkadot. The metrics are updated only when the node authors a block, so values vary across nodes. inherent_data_dispute_statement_sets: prometheus::Counter, inherent_data_dispute_statements: prometheus::CounterVec, + + /// The disputes received from `disputes-coordinator` by partition + partitioned_disputes: prometheus::CounterVec, } /// Provisioner metrics. @@ -95,6 +99,44 @@ impl Metrics { .inc_by(disputes.try_into().unwrap_or(0)); } } + + pub(crate) fn on_partition_recent_disputes(&self, disputes: &PartitionedDisputes) { + if let Some(metrics) = &self.0 { + let PartitionedDisputes { + inactive_unknown_onchain, + inactive_unconcluded_onchain: inactive_unconcluded_known_onchain, + active_unknown_onchain, + active_unconcluded_onchain, + active_concluded_onchain, + inactive_concluded_onchain: inactive_concluded_known_onchain, + } = disputes; + + metrics + .partitioned_disputes + .with_label_values(&["inactive_unknown_onchain"]) + .inc_by(inactive_unknown_onchain.len().try_into().unwrap_or(0)); + metrics + .partitioned_disputes + .with_label_values(&["inactive_unconcluded_known_onchain"]) + .inc_by(inactive_unconcluded_known_onchain.len().try_into().unwrap_or(0)); + metrics + .partitioned_disputes + .with_label_values(&["active_unknown_onchain"]) + .inc_by(active_unknown_onchain.len().try_into().unwrap_or(0)); + metrics + .partitioned_disputes + .with_label_values(&["active_unconcluded_onchain"]) + .inc_by(active_unconcluded_onchain.len().try_into().unwrap_or(0)); + metrics + .partitioned_disputes + .with_label_values(&["active_concluded_onchain"]) + .inc_by(active_concluded_onchain.len().try_into().unwrap_or(0)); + metrics + .partitioned_disputes + .with_label_values(&["inactive_concluded_known_onchain"]) + .inc_by(inactive_concluded_known_onchain.len().try_into().unwrap_or(0)); + } + } } impl metrics::Metrics for Metrics { @@ -150,6 +192,16 @@ impl metrics::Metrics for Metrics { )?, registry, )?, + partitioned_disputes: prometheus::register( + prometheus::CounterVec::new( + prometheus::Opts::new( + "polkadot_parachain_provisioner_partitioned_disputes", + "some fancy description", + ), + &["partition"], + )?, + ®istry, + )?, }; Ok(Metrics(Some(metrics))) } diff --git a/node/core/provisioner/src/onchain_disputes.rs b/node/core/provisioner/src/onchain_disputes.rs deleted file mode 100644 index 6810f512173f..000000000000 --- a/node/core/provisioner/src/onchain_disputes.rs +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright 2017-2022 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -use crate::error::GetOnchainDisputesError; -use polkadot_node_subsystem::overseer; -use polkadot_primitives::v2::{CandidateHash, DisputeState, Hash, SessionIndex}; -use std::collections::HashMap; - -pub async fn get_onchain_disputes( - _sender: &mut Sender, - _relay_parent: Hash, -) -> Result, GetOnchainDisputesError> -where - Sender: overseer::ProvisionerSenderTrait, -{ - let _onchain = Result::< - HashMap<(SessionIndex, CandidateHash), DisputeState>, - GetOnchainDisputesError, - >::Ok(HashMap::new()); - #[cfg(feature = "staging-client")] - let _onchain = self::staging_impl::get_onchain_disputes(_sender, _relay_parent).await; - - _onchain -} - -// Merge this module with the outer (current one) when promoting to stable -#[cfg(feature = "staging-client")] -mod staging_impl { - use super::*; // remove this when promoting to stable - use crate::LOG_TARGET; - use futures::channel::oneshot; - use polkadot_node_subsystem::{ - errors::RuntimeApiError, - messages::{RuntimeApiMessage, RuntimeApiRequest}, - SubsystemSender, - }; - - /// Gets the on-chain disputes at a given block number and returns them as a `HashSet` so that searching in them is cheap. - pub async fn get_onchain_disputes( - sender: &mut impl SubsystemSender, - relay_parent: Hash, - ) -> Result, GetOnchainDisputesError> { - gum::trace!(target: LOG_TARGET, ?relay_parent, "Fetching on-chain disputes"); - let (tx, rx) = oneshot::channel(); - sender - .send_message( - RuntimeApiMessage::Request(relay_parent, RuntimeApiRequest::StagingDisputes(tx)) - .into(), - ) - .await; - - rx.await - .map_err(|_| GetOnchainDisputesError::Channel) - .and_then(|res| { - res.map_err(|e| match e { - RuntimeApiError::Execution { .. } => - GetOnchainDisputesError::Execution(e, relay_parent), - RuntimeApiError::NotSupported { .. } => - GetOnchainDisputesError::NotSupported(e, relay_parent), - }) - }) - .map(|v| v.into_iter().map(|e| ((e.0, e.1), e.2)).collect()) - } -} diff --git a/node/core/provisioner/src/tests.rs b/node/core/provisioner/src/tests.rs index d0ca425210ed..08eba8eabe80 100644 --- a/node/core/provisioner/src/tests.rs +++ b/node/core/provisioner/src/tests.rs @@ -195,7 +195,7 @@ mod select_availability_bitfields { } } -mod common { +pub(crate) mod common { use super::super::*; use futures::channel::mpsc; use polkadot_node_subsystem::messages::AllMessages; @@ -497,403 +497,3 @@ mod select_candidates { ) } } - -mod select_disputes { - use super::{super::*, common::test_harness}; - use futures::channel::mpsc; - use polkadot_node_subsystem::{ - messages::{AllMessages, DisputeCoordinatorMessage, RuntimeApiMessage, RuntimeApiRequest}, - RuntimeApiError, - }; - use polkadot_node_subsystem_test_helpers::TestSubsystemSender; - use polkadot_primitives::v2::DisputeState; - use std::sync::Arc; - use test_helpers; - - // Global Test Data - fn recent_disputes(len: usize) -> Vec<(SessionIndex, CandidateHash)> { - let mut res = Vec::with_capacity(len); - for _ in 0..len { - res.push((0, CandidateHash(Hash::random()))); - } - - res - } - - // same as recent_disputes() but with SessionIndex set to 1 - fn active_disputes(len: usize) -> Vec<(SessionIndex, CandidateHash)> { - let mut res = Vec::with_capacity(len); - for _ in 0..len { - res.push((1, CandidateHash(Hash::random()))); - } - - res - } - - fn leaf() -> ActivatedLeaf { - ActivatedLeaf { - hash: Hash::repeat_byte(0xAA), - number: 0xAA, - status: LeafStatus::Fresh, - span: Arc::new(jaeger::Span::Disabled), - } - } - - async fn mock_overseer( - leaf: ActivatedLeaf, - mut receiver: mpsc::UnboundedReceiver, - onchain_disputes: Result, RuntimeApiError>, - recent_disputes: Vec<(SessionIndex, CandidateHash)>, - active_disputes: Vec<(SessionIndex, CandidateHash)>, - ) { - while let Some(from_job) = receiver.next().await { - match from_job { - AllMessages::RuntimeApi(RuntimeApiMessage::Request( - _, - RuntimeApiRequest::StagingDisputes(sender), - )) => { - let _ = sender.send(onchain_disputes.clone()); - }, - AllMessages::RuntimeApi(_) => panic!("Unexpected RuntimeApi request"), - AllMessages::DisputeCoordinator(DisputeCoordinatorMessage::RecentDisputes( - sender, - )) => { - let _ = sender.send(recent_disputes.clone()); - }, - AllMessages::DisputeCoordinator(DisputeCoordinatorMessage::ActiveDisputes( - sender, - )) => { - let _ = sender.send(active_disputes.clone()); - }, - AllMessages::DisputeCoordinator( - DisputeCoordinatorMessage::QueryCandidateVotes(disputes, sender), - ) => { - let mut res = Vec::new(); - let v = CandidateVotes { - candidate_receipt: test_helpers::dummy_candidate_receipt(leaf.hash.clone()), - valid: BTreeMap::new(), - invalid: BTreeMap::new(), - }; - for r in disputes.iter() { - res.push((r.0, r.1, v.clone())); - } - - let _ = sender.send(res); - }, - _ => panic!("Unexpected message: {:?}", from_job), - } - } - } - - #[test] - fn recent_disputes_are_withing_onchain_limit() { - const RECENT_DISPUTES_SIZE: usize = 10; - let metrics = metrics::Metrics::new_dummy(); - let onchain_disputes = Ok(Vec::new()); - let active_disputes = Vec::new(); - let recent_disputes = recent_disputes(RECENT_DISPUTES_SIZE); - - let recent_disputes_overseer = recent_disputes.clone(); - test_harness( - |r| { - mock_overseer( - leaf(), - r, - onchain_disputes, - recent_disputes_overseer, - active_disputes, - ) - }, - |mut tx: TestSubsystemSender| async move { - let lf = leaf(); - let disputes = select_disputes(&mut tx, &metrics, &lf).await.unwrap(); - - assert!(!disputes.is_empty()); - - let result = disputes.iter().zip(recent_disputes.iter()); - // We should get all recent disputes. - for (d, r) in result { - assert_eq!(d.session, r.0); - assert_eq!(d.candidate_hash, r.1); - } - }, - ) - } - - #[test] - fn recent_disputes_are_too_much_but_active_are_within_limit() { - const RECENT_DISPUTES_SIZE: usize = MAX_DISPUTES_FORWARDED_TO_RUNTIME + 10; - const ACTIVE_DISPUTES_SIZE: usize = MAX_DISPUTES_FORWARDED_TO_RUNTIME; - let metrics = metrics::Metrics::new_dummy(); - let onchain_disputes = Ok(Vec::new()); - let recent_disputes = recent_disputes(RECENT_DISPUTES_SIZE); - let active_disputes = active_disputes(ACTIVE_DISPUTES_SIZE); - - let active_disputes_overseer = active_disputes.clone(); - test_harness( - |r| { - mock_overseer( - leaf(), - r, - onchain_disputes, - recent_disputes, - active_disputes_overseer, - ) - }, - |mut tx: TestSubsystemSender| async move { - let lf = leaf(); - let disputes = select_disputes(&mut tx, &metrics, &lf).await.unwrap(); - - assert!(!disputes.is_empty()); - - let result = disputes.iter().zip(active_disputes.iter()); - // We should get all active disputes. - for (d, r) in result { - assert_eq!(d.session, r.0); - assert_eq!(d.candidate_hash, r.1); - } - }, - ) - } - - #[test] - fn recent_disputes_are_too_much_but_active_are_less_than_the_limit() { - // In this case all active disputes + a random set of recent disputes should be returned - const RECENT_DISPUTES_SIZE: usize = MAX_DISPUTES_FORWARDED_TO_RUNTIME + 10; - const ACTIVE_DISPUTES_SIZE: usize = MAX_DISPUTES_FORWARDED_TO_RUNTIME - 10; - let metrics = metrics::Metrics::new_dummy(); - let onchain_disputes = Ok(Vec::new()); - let recent_disputes = recent_disputes(RECENT_DISPUTES_SIZE); - let active_disputes = active_disputes(ACTIVE_DISPUTES_SIZE); - - let active_disputes_overseer = active_disputes.clone(); - test_harness( - |r| { - mock_overseer( - leaf(), - r, - onchain_disputes, - recent_disputes, - active_disputes_overseer, - ) - }, - |mut tx: TestSubsystemSender| async move { - let lf = leaf(); - let disputes = select_disputes(&mut tx, &metrics, &lf).await.unwrap(); - - assert!(!disputes.is_empty()); - - // Recent disputes are generated with `SessionIndex` = 0 - let (res_recent, res_active): (Vec, Vec) = - disputes.into_iter().partition(|d| d.session == 0); - - // It should be good enough the count the number of active disputes and not compare them one by one. Checking the exact values is already covered by the previous tests. - assert_eq!(res_active.len(), active_disputes.len()); // We have got all active disputes - assert_eq!(res_active.len() + res_recent.len(), MAX_DISPUTES_FORWARDED_TO_RUNTIME); - // And some recent ones. - }, - ) - } - - //These tests rely on staging Runtime functions so they are separated and compiled conditionally. - #[cfg(feature = "staging-client")] - mod staging_tests { - use super::*; - - fn dummy_dispute_state() -> DisputeState { - DisputeState { - validators_for: BitVec::new(), - validators_against: BitVec::new(), - start: 0, - concluded_at: None, - } - } - - #[test] - fn recent_disputes_are_too_much_active_fits_test_recent_prioritisation() { - // In this case recent disputes are above `MAX_DISPUTES_FORWARDED_TO_RUNTIME` limit and the active ones are below it. - // The expected behaviour is to send all active disputes and extend the set with recent ones. During the extension the disputes unknown for the Runtime are added with priority. - const RECENT_DISPUTES_SIZE: usize = MAX_DISPUTES_FORWARDED_TO_RUNTIME + 10; - const ACTIVE_DISPUTES_SIZE: usize = MAX_DISPUTES_FORWARDED_TO_RUNTIME - 10; - const ONCHAIN_DISPUTE_SIZE: usize = RECENT_DISPUTES_SIZE - 9; - let metrics = metrics::Metrics::new_dummy(); - let recent_disputes = recent_disputes(RECENT_DISPUTES_SIZE); - let active_disputes = active_disputes(ACTIVE_DISPUTES_SIZE); - let onchain_disputes: Result< - Vec<(SessionIndex, CandidateHash, DisputeState)>, - RuntimeApiError, - > = Ok(Vec::from(&recent_disputes[0..ONCHAIN_DISPUTE_SIZE]) - .iter() - .map(|(session_index, candidate_hash)| { - (*session_index, candidate_hash.clone(), dummy_dispute_state()) - }) - .collect()); - let active_disputes_overseer = active_disputes.clone(); - let recent_disputes_overseer = recent_disputes.clone(); - test_harness( - |r| { - mock_overseer( - leaf(), - r, - onchain_disputes, - recent_disputes_overseer, - active_disputes_overseer, - ) - }, - |mut tx: TestSubsystemSender| async move { - let lf = leaf(); - let disputes = select_disputes(&mut tx, &metrics, &lf).await.unwrap(); - - assert!(!disputes.is_empty()); - - // Recent disputes are generated with `SessionIndex` = 0 - let (res_recent, res_active): ( - Vec, - Vec, - ) = disputes.into_iter().partition(|d| d.session == 0); - - // It should be good enough the count the number of the disputes and not compare them one by one as this was already covered in other tests. - assert_eq!(res_active.len(), active_disputes.len()); // We've got all active disputes. - assert_eq!( - res_recent.len(), - MAX_DISPUTES_FORWARDED_TO_RUNTIME - active_disputes.len() - ); // And some recent ones. - - // Check if the recent disputes were unknown for the Runtime. - let expected_recent_disputes = - Vec::from(&recent_disputes[ONCHAIN_DISPUTE_SIZE..]); - let res_recent_set: HashSet<(SessionIndex, CandidateHash)> = HashSet::from_iter( - res_recent.iter().map(|d| (d.session, d.candidate_hash)), - ); - - // Explicitly check that all unseen disputes are sent to the Runtime. - for d in &expected_recent_disputes { - assert_eq!(res_recent_set.contains(d), true); - } - }, - ) - } - - #[test] - fn active_disputes_are_too_much_test_active_prioritisation() { - // In this case the active disputes are above the `MAX_DISPUTES_FORWARDED_TO_RUNTIME` limit so the unseen ones should be prioritised. - const RECENT_DISPUTES_SIZE: usize = MAX_DISPUTES_FORWARDED_TO_RUNTIME + 10; - const ACTIVE_DISPUTES_SIZE: usize = MAX_DISPUTES_FORWARDED_TO_RUNTIME + 10; - const ONCHAIN_DISPUTE_SIZE: usize = ACTIVE_DISPUTES_SIZE - 9; - - let metrics = metrics::Metrics::new_dummy(); - let recent_disputes = recent_disputes(RECENT_DISPUTES_SIZE); - let active_disputes = active_disputes(ACTIVE_DISPUTES_SIZE); - let onchain_disputes: Result< - Vec<(SessionIndex, CandidateHash, DisputeState)>, - RuntimeApiError, - > = Ok(Vec::from(&active_disputes[0..ONCHAIN_DISPUTE_SIZE]) - .iter() - .map(|(session_index, candidate_hash)| { - (*session_index, candidate_hash.clone(), dummy_dispute_state()) - }) - .collect()); - let active_disputes_overseer = active_disputes.clone(); - let recent_disputes_overseer = recent_disputes.clone(); - test_harness( - |r| { - mock_overseer( - leaf(), - r, - onchain_disputes, - recent_disputes_overseer, - active_disputes_overseer, - ) - }, - |mut tx: TestSubsystemSender| async move { - let lf = leaf(); - let disputes = select_disputes(&mut tx, &metrics, &lf).await.unwrap(); - - assert!(!disputes.is_empty()); - - // Recent disputes are generated with `SessionIndex` = 0 - let (res_recent, res_active): ( - Vec, - Vec, - ) = disputes.into_iter().partition(|d| d.session == 0); - - // It should be good enough the count the number of the disputes and not compare them one by one - assert_eq!(res_recent.len(), 0); // We expect no recent disputes - assert_eq!(res_active.len(), MAX_DISPUTES_FORWARDED_TO_RUNTIME); - - let expected_active_disputes = - Vec::from(&active_disputes[ONCHAIN_DISPUTE_SIZE..]); - let res_active_set: HashSet<(SessionIndex, CandidateHash)> = HashSet::from_iter( - res_active.iter().map(|d| (d.session, d.candidate_hash)), - ); - - // Explicitly check that the unseen disputes are delivered to the Runtime. - for d in &expected_active_disputes { - assert_eq!(res_active_set.contains(d), true); - } - }, - ) - } - - #[test] - fn active_disputes_are_too_much_and_are_all_unseen() { - // In this case there are a lot of active disputes unseen by the Runtime. The focus of the test is to verify that in such cases known disputes are NOT sent to the Runtime. - const RECENT_DISPUTES_SIZE: usize = MAX_DISPUTES_FORWARDED_TO_RUNTIME + 10; - const ACTIVE_DISPUTES_SIZE: usize = MAX_DISPUTES_FORWARDED_TO_RUNTIME + 5; - const ONCHAIN_DISPUTE_SIZE: usize = 5; - - let metrics = metrics::Metrics::new_dummy(); - let recent_disputes = recent_disputes(RECENT_DISPUTES_SIZE); - let active_disputes = active_disputes(ACTIVE_DISPUTES_SIZE); - let onchain_disputes: Result< - Vec<(SessionIndex, CandidateHash, DisputeState)>, - RuntimeApiError, - > = Ok(Vec::from(&active_disputes[0..ONCHAIN_DISPUTE_SIZE]) - .iter() - .map(|(session_index, candidate_hash)| { - (*session_index, candidate_hash.clone(), dummy_dispute_state()) - }) - .collect()); - let active_disputes_overseer = active_disputes.clone(); - let recent_disputes_overseer = recent_disputes.clone(); - test_harness( - |r| { - mock_overseer( - leaf(), - r, - onchain_disputes, - recent_disputes_overseer, - active_disputes_overseer, - ) - }, - |mut tx: TestSubsystemSender| async move { - let lf = leaf(); - let disputes = select_disputes(&mut tx, &metrics, &lf).await.unwrap(); - assert!(!disputes.is_empty()); - - // Recent disputes are generated with `SessionIndex` = 0 - let (res_recent, res_active): ( - Vec, - Vec, - ) = disputes.into_iter().partition(|d| d.session == 0); - - // It should be good enough the count the number of the disputes and not compare them one by one - assert_eq!(res_recent.len(), 0); - assert_eq!(res_active.len(), MAX_DISPUTES_FORWARDED_TO_RUNTIME); - - // For sure we don't want to see any of this disputes in the result - let unexpected_active_disputes = - Vec::from(&active_disputes[0..ONCHAIN_DISPUTE_SIZE]); - let res_active_set: HashSet<(SessionIndex, CandidateHash)> = HashSet::from_iter( - res_active.iter().map(|d| (d.session, d.candidate_hash)), - ); - - // Verify that the result DOESN'T contain known disputes (because there is an excessive number of unknown onces). - for d in &unexpected_active_disputes { - assert_eq!(res_active_set.contains(d), false); - } - }, - ) - } - } -} diff --git a/node/core/runtime-api/src/cache.rs b/node/core/runtime-api/src/cache.rs index 6f5fdc5d4657..0fe9b74dc86d 100644 --- a/node/core/runtime-api/src/cache.rs +++ b/node/core/runtime-api/src/cache.rs @@ -463,5 +463,5 @@ pub(crate) enum RequestResult { SubmitPvfCheckStatement(Hash, PvfCheckStatement, ValidatorSignature, ()), ValidationCodeHash(Hash, ParaId, OccupiedCoreAssumption, Option), Version(Hash, u32), - StagingDisputes(Hash, Vec<(SessionIndex, CandidateHash, DisputeState)>), + Disputes(Hash, Vec<(SessionIndex, CandidateHash, DisputeState)>), } diff --git a/node/core/runtime-api/src/lib.rs b/node/core/runtime-api/src/lib.rs index a815b76a8d7c..36355b5759e6 100644 --- a/node/core/runtime-api/src/lib.rs +++ b/node/core/runtime-api/src/lib.rs @@ -153,7 +153,7 @@ where .cache_validation_code_hash((relay_parent, para_id, assumption), hash), Version(relay_parent, version) => self.requests_cache.cache_version(relay_parent, version), - StagingDisputes(relay_parent, disputes) => + Disputes(relay_parent, disputes) => self.requests_cache.cache_disputes(relay_parent, disputes), } } @@ -256,8 +256,8 @@ where Request::ValidationCodeHash(para, assumption, sender) => query!(validation_code_hash(para, assumption), sender) .map(|sender| Request::ValidationCodeHash(para, assumption, sender)), - Request::StagingDisputes(sender) => - query!(disputes(), sender).map(|sender| Request::StagingDisputes(sender)), + Request::Disputes(sender) => + query!(disputes(), sender).map(|sender| Request::Disputes(sender)), } } @@ -351,8 +351,9 @@ where let _timer = metrics.time_make_runtime_api_request(); macro_rules! query { - ($req_variant:ident, $api_name:ident ($($param:expr),*), ver = $version:literal, $sender:expr) => {{ + ($req_variant:ident, $api_name:ident ($($param:expr),*), ver = $version:expr, $sender:expr) => {{ let sender = $sender; + let version: u32 = $version; // enforce type for the version expression let runtime_version = client.api_version_parachain_host(relay_parent).await .unwrap_or_else(|e| { gum::warn!( @@ -370,7 +371,7 @@ where 0 }); - let res = if runtime_version >= $version { + let res = if runtime_version >= version { client.$api_name(relay_parent $(, $param.clone() )*).await .map_err(|e| RuntimeApiError::Execution { runtime_api_name: stringify!($api_name), @@ -499,7 +500,7 @@ where }, Request::ValidationCodeHash(para, assumption, sender) => query!(ValidationCodeHash, validation_code_hash(para, assumption), ver = 2, sender), - Request::StagingDisputes(sender) => - query!(StagingDisputes, staging_get_disputes(), ver = 2, sender), + Request::Disputes(sender) => + query!(Disputes, disputes(), ver = Request::DISPUTES_RUNTIME_REQUIREMENT, sender), } } diff --git a/node/core/runtime-api/src/tests.rs b/node/core/runtime-api/src/tests.rs index b1a1bba73769..9c14561b1f48 100644 --- a/node/core/runtime-api/src/tests.rs +++ b/node/core/runtime-api/src/tests.rs @@ -23,11 +23,11 @@ use polkadot_node_subsystem_test_helpers::make_subsystem_context; use polkadot_primitives::{ runtime_api::ParachainHost, v2::{ - AuthorityDiscoveryId, Block, BlockNumber, CandidateEvent, CandidateHash, - CommittedCandidateReceipt, CoreState, DisputeState, GroupRotationInfo, Id as ParaId, - InboundDownwardMessage, InboundHrmpMessage, OccupiedCoreAssumption, - PersistedValidationData, PvfCheckStatement, ScrapedOnChainVotes, SessionIndex, SessionInfo, - ValidationCode, ValidationCodeHash, ValidatorId, ValidatorIndex, ValidatorSignature, + AuthorityDiscoveryId, Block, CandidateEvent, CommittedCandidateReceipt, CoreState, + GroupRotationInfo, Id as ParaId, InboundDownwardMessage, InboundHrmpMessage, + OccupiedCoreAssumption, PersistedValidationData, PvfCheckStatement, ScrapedOnChainVotes, + SessionIndex, SessionInfo, ValidationCode, ValidationCodeHash, ValidatorId, ValidatorIndex, + ValidatorSignature, }, }; use sp_api::ProvideRuntimeApi; @@ -196,10 +196,6 @@ sp_api::mock_impl_runtime_apis! { ) -> Option { self.validation_code_hash.get(¶).map(|c| c.clone()) } - - fn staging_get_disputes() -> Vec<(SessionIndex, CandidateHash, DisputeState)> { - unimplemented!() - } } impl BabeApi for MockRuntimeApi { diff --git a/node/network/approval-distribution/Cargo.toml b/node/network/approval-distribution/Cargo.toml index 68fae906c21e..e5c2865e4afe 100644 --- a/node/network/approval-distribution/Cargo.toml +++ b/node/network/approval-distribution/Cargo.toml @@ -21,6 +21,7 @@ sp-core = { git = "https://github.com/paritytech/substrate", branch = "master", polkadot-node-subsystem-util = { path = "../../subsystem-util" } polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" } +polkadot-primitives-test-helpers = { path = "../../../primitives/test-helpers" } assert_matches = "1.4.0" schnorrkel = { version = "0.9.1", default-features = false } diff --git a/node/network/approval-distribution/src/tests.rs b/node/network/approval-distribution/src/tests.rs index b3d44bfe8c1e..a96a89bb58eb 100644 --- a/node/network/approval-distribution/src/tests.rs +++ b/node/network/approval-distribution/src/tests.rs @@ -25,6 +25,7 @@ use polkadot_node_subsystem::messages::{network_bridge_event, AllMessages, Appro use polkadot_node_subsystem_test_helpers as test_helpers; use polkadot_node_subsystem_util::TimeoutExt as _; use polkadot_primitives::v2::{AuthorityDiscoveryId, BlakeTwo256, HashT}; +use polkadot_primitives_test_helpers::dummy_signature; use rand::SeedableRng; use sp_authority_discovery::AuthorityPair as AuthorityDiscoveryPair; use sp_core::crypto::Pair as PairT; @@ -32,10 +33,6 @@ use std::time::Duration; type VirtualOverseer = test_helpers::TestSubsystemContextHandle; -fn dummy_signature() -> polkadot_primitives::v2::ValidatorSignature { - sp_core::crypto::UncheckedFrom::unchecked_from([1u8; 64]) -} - fn test_harness>( mut state: State, test_fn: impl FnOnce(VirtualOverseer) -> T, diff --git a/node/network/dispute-distribution/Cargo.toml b/node/network/dispute-distribution/Cargo.toml index 509ac1545d4c..3e2f4f0a6a6a 100644 --- a/node/network/dispute-distribution/Cargo.toml +++ b/node/network/dispute-distribution/Cargo.toml @@ -6,6 +6,7 @@ edition = "2021" [dependencies] futures = "0.3.21" +futures-timer = "3.0.2" gum = { package = "tracing-gum", path = "../../gum" } derive_more = "0.99.17" parity-scale-codec = { version = "3.1.5", features = ["std"] } @@ -21,6 +22,7 @@ sp-keystore = { git = "https://github.com/paritytech/substrate", branch = "maste thiserror = "1.0.31" fatality = "0.0.6" lru = "0.7.7" +indexmap = "1.9.1" [dev-dependencies] async-trait = "0.1.57" diff --git a/node/network/dispute-distribution/src/lib.rs b/node/network/dispute-distribution/src/lib.rs index aefd66e0ae79..56683d94e0c2 100644 --- a/node/network/dispute-distribution/src/lib.rs +++ b/node/network/dispute-distribution/src/lib.rs @@ -24,6 +24,8 @@ //! The sender is responsible for getting our vote out, see [`sender`]. The receiver handles //! incoming [`DisputeRequest`]s and offers spam protection, see [`receiver`]. +use std::time::Duration; + use futures::{channel::mpsc, FutureExt, StreamExt, TryFutureExt}; use polkadot_node_network_protocol::authority_discovery::AuthorityDiscovery; @@ -64,16 +66,19 @@ use self::sender::{DisputeSender, TaskFinish}; /// via a dedicated channel and forwarding them to the dispute coordinator via /// `DisputeCoordinatorMessage::ImportStatements`. Being the interface to the network and untrusted /// nodes, the reality is not that simple of course. Before importing statements the receiver will -/// make sure as good as it can to filter out malicious/unwanted/spammy requests. For this it does -/// the following: +/// batch up imports as well as possible for efficient imports while maintaining timely dispute +/// resolution and handling of spamming validators: /// /// - Drop all messages from non validator nodes, for this it requires the [`AuthorityDiscovery`] /// service. -/// - Drop messages from a node, if we are already importing a message from that node (flood). -/// - Drop messages from nodes, that provided us messages where the statement import failed. +/// - Drop messages from a node, if it sends at a too high rate. +/// - Filter out duplicate messages (over some period of time). /// - Drop any obviously invalid votes (invalid signatures for example). /// - Ban peers whose votes were deemed invalid. /// +/// In general dispute-distribution works on limiting the work the dispute-coordinator will have to +/// do, while at the same time making it aware of new disputes as fast as possible. +/// /// For successfully imported votes, we will confirm the receipt of the message back to the sender. /// This way a received confirmation guarantees, that the vote has been stored to disk by the /// receiver. @@ -93,6 +98,20 @@ pub use metrics::Metrics; const LOG_TARGET: &'static str = "parachain::dispute-distribution"; +/// Rate limit on the `receiver` side. +/// +/// If messages from one peer come in at a higher rate than every `RECEIVE_RATE_LIMIT` on average, we +/// start dropping messages from that peer to enforce that limit. +pub const RECEIVE_RATE_LIMIT: Duration = Duration::from_millis(100); + +/// Rate limit on the `sender` side. +/// +/// In order to not hit the `RECEIVE_RATE_LIMIT` on the receiving side, we limit out sending rate as +/// well. +/// +/// We add 50ms extra, just to have some save margin to the `RECEIVE_RATE_LIMIT`. +pub const SEND_RATE_LIMIT: Duration = RECEIVE_RATE_LIMIT.saturating_add(Duration::from_millis(50)); + /// The dispute distribution subsystem. pub struct DisputeDistributionSubsystem { /// Easy and efficient runtime access for this subsystem. @@ -172,6 +191,12 @@ where ctx.spawn("disputes-receiver", receiver.run().boxed()) .map_err(FatalError::SpawnTask)?; + // Process messages for sending side. + // + // Note: We want the sender to be rate limited and we are currently taking advantage of the + // fact that the root task of this subsystem is only concerned with sending: Functions of + // `DisputeSender` might back pressure if the rate limit is hit, which will slow down this + // loop. If this fact ever changes, we will likely need another task. loop { let message = MuxedMessage::receive(&mut ctx, &mut self.sender_rx).await; match message { @@ -247,9 +272,10 @@ impl MuxedMessage { // ends. let from_overseer = ctx.recv().fuse(); futures::pin_mut!(from_overseer, from_sender); - futures::select!( - msg = from_overseer => MuxedMessage::Subsystem(msg.map_err(FatalError::SubsystemReceive)), + // We select biased to make sure we finish up loose ends, before starting new work. + futures::select_biased!( msg = from_sender.next() => MuxedMessage::Sender(msg), + msg = from_overseer => MuxedMessage::Subsystem(msg.map_err(FatalError::SubsystemReceive)), ) } } diff --git a/node/network/dispute-distribution/src/metrics.rs b/node/network/dispute-distribution/src/metrics.rs index 3f717bd105c3..aa2feeaad3a0 100644 --- a/node/network/dispute-distribution/src/metrics.rs +++ b/node/network/dispute-distribution/src/metrics.rs @@ -72,9 +72,12 @@ impl Metrics { } /// Statements have been imported. - pub fn on_imported(&self, label: &'static str) { + pub fn on_imported(&self, label: &'static str, num_requests: usize) { if let Some(metrics) = &self.0 { - metrics.imported_requests.with_label_values(&[label]).inc() + metrics + .imported_requests + .with_label_values(&[label]) + .inc_by(num_requests as u64) } } diff --git a/node/network/dispute-distribution/src/receiver/batches/batch.rs b/node/network/dispute-distribution/src/receiver/batches/batch.rs new file mode 100644 index 000000000000..d96de1dfe8c3 --- /dev/null +++ b/node/network/dispute-distribution/src/receiver/batches/batch.rs @@ -0,0 +1,215 @@ +// Copyright 2022 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use std::{collections::HashMap, time::Instant}; + +use gum::CandidateHash; +use polkadot_node_network_protocol::{ + request_response::{incoming::OutgoingResponseSender, v1::DisputeRequest}, + PeerId, +}; +use polkadot_node_primitives::SignedDisputeStatement; +use polkadot_primitives::v2::{CandidateReceipt, ValidatorIndex}; + +use crate::receiver::{BATCH_COLLECTING_INTERVAL, MIN_KEEP_BATCH_ALIVE_VOTES}; + +use super::MAX_BATCH_LIFETIME; + +/// A batch of votes to be imported into the `dispute-coordinator`. +/// +/// Vote imports are way more efficient when performed in batches, hence we batch together incoming +/// votes until the rate of incoming votes falls below a threshold, then we import into the dispute +/// coordinator. +/// +/// A `Batch` keeps track of the votes to be imported and the current incoming rate, on rate update +/// it will "flush" in case the incoming rate dropped too low, preparing the import. +pub struct Batch { + /// The actual candidate this batch is concerned with. + candidate_receipt: CandidateReceipt, + + /// Cache of `CandidateHash` (candidate_receipt.hash()). + candidate_hash: CandidateHash, + + /// All valid votes received in this batch so far. + /// + /// We differentiate between valid and invalid votes, so we can detect (and drop) duplicates, + /// while still allowing validators to equivocate. + /// + /// Detecting and rejecting duplicates is crucial in order to effectively enforce + /// `MIN_KEEP_BATCH_ALIVE_VOTES` per `BATCH_COLLECTING_INTERVAL`. If we would count duplicates + /// here, the mechanism would be broken. + valid_votes: HashMap, + + /// All invalid votes received in this batch so far. + invalid_votes: HashMap, + + /// How many votes have been batched since the last tick/creation. + votes_batched_since_last_tick: u32, + + /// Expiry time for the batch. + /// + /// By this time the latest this batch will get flushed. + best_before: Instant, + + /// Requesters waiting for a response. + requesters: Vec<(PeerId, OutgoingResponseSender)>, +} + +/// Result of checking a batch every `BATCH_COLLECTING_INTERVAL`. +pub(super) enum TickResult { + /// Batch is still alive, please call `tick` again at the given `Instant`. + Alive(Batch, Instant), + /// Batch is done, ready for import! + Done(PreparedImport), +} + +/// Ready for import. +pub struct PreparedImport { + pub candidate_receipt: CandidateReceipt, + pub statements: Vec<(SignedDisputeStatement, ValidatorIndex)>, + /// Information about original requesters. + pub requesters: Vec<(PeerId, OutgoingResponseSender)>, +} + +impl From for PreparedImport { + fn from(batch: Batch) -> Self { + let Batch { + candidate_receipt, + valid_votes, + invalid_votes, + requesters: pending_responses, + .. + } = batch; + + let statements = valid_votes + .into_iter() + .chain(invalid_votes.into_iter()) + .map(|(index, statement)| (statement, index)) + .collect(); + + Self { candidate_receipt, statements, requesters: pending_responses } + } +} + +impl Batch { + /// Create a new empty batch based on the given `CandidateReceipt`. + /// + /// To create a `Batch` use Batches::find_batch`. + /// + /// Arguments: + /// + /// * `candidate_receipt` - The candidate this batch is meant to track votes for. + /// * `now` - current time stamp for calculating the first tick. + /// + /// Returns: A batch and the first `Instant` you are supposed to call `tick`. + pub(super) fn new(candidate_receipt: CandidateReceipt, now: Instant) -> (Self, Instant) { + let s = Self { + candidate_hash: candidate_receipt.hash(), + candidate_receipt, + valid_votes: HashMap::new(), + invalid_votes: HashMap::new(), + votes_batched_since_last_tick: 0, + best_before: Instant::now() + MAX_BATCH_LIFETIME, + requesters: Vec::new(), + }; + let next_tick = s.calculate_next_tick(now); + (s, next_tick) + } + + /// Receipt of the candidate this batch is batching votes for. + pub fn candidate_receipt(&self) -> &CandidateReceipt { + &self.candidate_receipt + } + + /// Add votes from a validator into the batch. + /// + /// The statements are supposed to be the valid and invalid statements received in a + /// `DisputeRequest`. + /// + /// The given `pending_response` is the corresponding response sender for responding to `peer`. + /// If at least one of the votes is new as far as this batch is concerned we record the + /// pending_response, for later use. In case both votes are known already, we return the + /// response sender as an `Err` value. + pub fn add_votes( + &mut self, + valid_vote: (SignedDisputeStatement, ValidatorIndex), + invalid_vote: (SignedDisputeStatement, ValidatorIndex), + peer: PeerId, + pending_response: OutgoingResponseSender, + ) -> Result<(), OutgoingResponseSender> { + debug_assert!(valid_vote.0.candidate_hash() == invalid_vote.0.candidate_hash()); + debug_assert!(valid_vote.0.candidate_hash() == &self.candidate_hash); + + let mut duplicate = true; + + if self.valid_votes.insert(valid_vote.1, valid_vote.0).is_none() { + self.votes_batched_since_last_tick += 1; + duplicate = false; + } + if self.invalid_votes.insert(invalid_vote.1, invalid_vote.0).is_none() { + self.votes_batched_since_last_tick += 1; + duplicate = false; + } + + if duplicate { + Err(pending_response) + } else { + self.requesters.push((peer, pending_response)); + Ok(()) + } + } + + /// Check batch for liveness. + /// + /// This function is supposed to be called at instants given at construction and as returned as + /// part of `TickResult`. + pub(super) fn tick(mut self, now: Instant) -> TickResult { + if self.votes_batched_since_last_tick >= MIN_KEEP_BATCH_ALIVE_VOTES && + now < self.best_before + { + // Still good: + let next_tick = self.calculate_next_tick(now); + // Reset counter: + self.votes_batched_since_last_tick = 0; + TickResult::Alive(self, next_tick) + } else { + TickResult::Done(PreparedImport::from(self)) + } + } + + /// Calculate when the next tick should happen. + /// + /// This will usually return `now + BATCH_COLLECTING_INTERVAL`, except if the lifetime of this batch + /// would exceed `MAX_BATCH_LIFETIME`. + /// + /// # Arguments + /// + /// * `now` - The current time. + fn calculate_next_tick(&self, now: Instant) -> Instant { + let next_tick = now + BATCH_COLLECTING_INTERVAL; + if next_tick < self.best_before { + next_tick + } else { + self.best_before + } + } +} + +// Test tick behaviour: +// - If less than `MIN_KEEP_BATCH_ALIVE_VOTES` trickled in since last tick - batch should become +// done. +// - If batch surpassed its `best_before` it should become done. +// - Batch does not count duplicate votes. diff --git a/node/network/dispute-distribution/src/receiver/batches/mod.rs b/node/network/dispute-distribution/src/receiver/batches/mod.rs new file mode 100644 index 000000000000..d9561adb7b83 --- /dev/null +++ b/node/network/dispute-distribution/src/receiver/batches/mod.rs @@ -0,0 +1,170 @@ +// Copyright 2022 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use std::{ + collections::{hash_map, HashMap}, + time::{Duration, Instant}, +}; + +use futures::future::pending; + +use polkadot_node_network_protocol::request_response::DISPUTE_REQUEST_TIMEOUT; +use polkadot_primitives::v2::{CandidateHash, CandidateReceipt}; + +use crate::{ + receiver::batches::{batch::TickResult, waiting_queue::PendingWake}, + LOG_TARGET, +}; + +pub use self::batch::{Batch, PreparedImport}; +use self::waiting_queue::WaitingQueue; + +use super::{ + error::{JfyiError, JfyiResult}, + BATCH_COLLECTING_INTERVAL, +}; + +/// A single batch (per candidate) as managed by `Batches`. +mod batch; + +/// Queue events in time and wait for them to become ready. +mod waiting_queue; + +/// Safe-guard in case votes trickle in real slow. +/// +/// If the batch life time exceeded the time the sender is willing to wait for a confirmation, we +/// would trigger pointless re-sends. +const MAX_BATCH_LIFETIME: Duration = DISPUTE_REQUEST_TIMEOUT.saturating_sub(Duration::from_secs(2)); + +/// Limit the number of batches that can be alive at any given time. +/// +/// Reasoning for this number, see guide. +pub const MAX_BATCHES: usize = 1000; + +/// Manage batches. +/// +/// - Batches can be found via `find_batch()` in order to add votes to them/check they exist. +/// - Batches can be checked for being ready for flushing in order to import contained votes. +pub struct Batches { + /// The batches we manage. + /// + /// Kept invariants: + /// For each entry in `batches`, there exists an entry in `waiting_queue` as well - we wait on + /// all batches! + batches: HashMap, + /// Waiting queue for waiting for batches to become ready for `tick`. + /// + /// Kept invariants by `Batches`: + /// For each entry in the waiting_queue there exists a corresponding entry in `batches`. + waiting_queue: WaitingQueue, +} + +/// A found batch is either really found or got created so it can be found. +pub enum FoundBatch<'a> { + /// Batch just got created. + Created(&'a mut Batch), + /// Batch already existed. + Found(&'a mut Batch), +} + +impl Batches { + /// Create new empty `Batches`. + pub fn new() -> Self { + debug_assert!( + MAX_BATCH_LIFETIME > BATCH_COLLECTING_INTERVAL, + "Unexpectedly low `MAX_BATCH_LIFETIME`, please check parameters." + ); + Self { batches: HashMap::new(), waiting_queue: WaitingQueue::new() } + } + + /// Find a particular batch. + /// + /// That is either find it, or we create it as reflected by the result `FoundBatch`. + pub fn find_batch( + &mut self, + candidate_hash: CandidateHash, + candidate_receipt: CandidateReceipt, + ) -> JfyiResult { + if self.batches.len() >= MAX_BATCHES { + return Err(JfyiError::MaxBatchLimitReached) + } + debug_assert!(candidate_hash == candidate_receipt.hash()); + let result = match self.batches.entry(candidate_hash) { + hash_map::Entry::Vacant(vacant) => { + let now = Instant::now(); + let (created, ready_at) = Batch::new(candidate_receipt, now); + let pending_wake = PendingWake { payload: candidate_hash, ready_at }; + self.waiting_queue.push(pending_wake); + FoundBatch::Created(vacant.insert(created)) + }, + hash_map::Entry::Occupied(occupied) => FoundBatch::Found(occupied.into_mut()), + }; + Ok(result) + } + + /// Wait for the next `tick` to check for ready batches. + /// + /// This function blocks (returns `Poll::Pending`) until at least one batch can be + /// checked for readiness meaning that `BATCH_COLLECTING_INTERVAL` has passed since the last + /// check for that batch or it reached end of life. + /// + /// If this `Batches` instance is empty (does not actually contain any batches), then this + /// function will always return `Poll::Pending`. + /// + /// Returns: A `Vec` of all `PreparedImport`s from batches that became ready. + pub async fn check_batches(&mut self) -> Vec { + let now = Instant::now(); + + let mut imports = Vec::new(); + + // Wait for at least one batch to become ready: + self.waiting_queue.wait_ready(now).await; + + // Process all ready entries: + while let Some(wake) = self.waiting_queue.pop_ready(now) { + let batch = self.batches.remove(&wake.payload); + debug_assert!( + batch.is_some(), + "Entries referenced in `waiting_queue` are supposed to exist!" + ); + let batch = match batch { + None => return pending().await, + Some(batch) => batch, + }; + match batch.tick(now) { + TickResult::Done(import) => { + gum::trace!( + target: LOG_TARGET, + candidate_hash = ?wake.payload, + "Batch became ready." + ); + imports.push(import); + }, + TickResult::Alive(old_batch, next_tick) => { + gum::trace!( + target: LOG_TARGET, + candidate_hash = ?wake.payload, + "Batch found to be still alive on check." + ); + let pending_wake = PendingWake { payload: wake.payload, ready_at: next_tick }; + self.waiting_queue.push(pending_wake); + self.batches.insert(wake.payload, old_batch); + }, + } + } + imports + } +} diff --git a/node/network/dispute-distribution/src/receiver/batches/waiting_queue.rs b/node/network/dispute-distribution/src/receiver/batches/waiting_queue.rs new file mode 100644 index 000000000000..3bbaa6376da1 --- /dev/null +++ b/node/network/dispute-distribution/src/receiver/batches/waiting_queue.rs @@ -0,0 +1,204 @@ +// Copyright 2022 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use std::{cmp::Ordering, collections::BinaryHeap, time::Instant}; + +use futures::future::pending; +use futures_timer::Delay; + +/// Wait asynchronously for given `Instant`s one after the other. +/// +/// `PendingWake`s can be inserted and `WaitingQueue` makes `wait_ready()` to always wait for the +/// next `Instant` in the queue. +pub struct WaitingQueue { + /// All pending wakes we are supposed to wait on in order. + pending_wakes: BinaryHeap>, + /// Wait for next `PendingWake`. + waker: Option, +} + +/// Represents some event waiting to be processed at `ready_at`. +/// +/// This is an event in `WaitingQueue`. It provides an `Ord` instance, that sorts descending with +/// regard to `Instant` (so we get a `min-heap` with the earliest `Instant` at the top). +#[derive(Eq, PartialEq)] +pub struct PendingWake { + pub payload: Payload, + pub ready_at: Instant, +} + +impl WaitingQueue { + /// Get a new empty `WaitingQueue`. + /// + /// If you call `pop` on this queue immediately, it will always return `Poll::Pending`. + pub fn new() -> Self { + Self { pending_wakes: BinaryHeap::new(), waker: None } + } + + /// Push a `PendingWake`. + /// + /// The next call to `wait_ready` will make sure to wake soon enough to process that new event in a + /// timely manner. + pub fn push(&mut self, wake: PendingWake) { + self.pending_wakes.push(wake); + // Reset waker as it is potentially obsolete now: + self.waker = None; + } + + /// Pop the next ready item. + /// + /// This function does not wait, if nothing is ready right now as determined by the passed + /// `now` time stamp, this function simply returns `None`. + pub fn pop_ready(&mut self, now: Instant) -> Option> { + let is_ready = self.pending_wakes.peek().map_or(false, |p| p.ready_at <= now); + if is_ready { + Some(self.pending_wakes.pop().expect("We just peeked. qed.")) + } else { + None + } + } + + /// Don't pop, just wait until something is ready. + /// + /// Once this function returns `Poll::Ready(())` `pop_ready()` will return `Some`, if passed + /// the same `Instant`. + /// + /// Whether ready or not is determined based on the passed time stamp `now` which should be the + /// current time as returned by `Instant::now()` + /// + /// This function waits asynchronously for an item to become ready. If there is no more item, + /// this call will wait forever (return Poll::Pending without scheduling a wake). + pub async fn wait_ready(&mut self, now: Instant) { + if let Some(waker) = &mut self.waker { + // Previous timer was not done yet. + waker.await + } + + let next_waiting = self.pending_wakes.peek(); + let is_ready = next_waiting.map_or(false, |p| p.ready_at <= now); + if is_ready { + return + } + + self.waker = next_waiting.map(|p| Delay::new(p.ready_at.duration_since(now))); + match &mut self.waker { + None => return pending().await, + Some(waker) => waker.await, + } + } +} + +impl PartialOrd> for PendingWake { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for PendingWake { + fn cmp(&self, other: &Self) -> Ordering { + // Reverse order for min-heap: + match other.ready_at.cmp(&self.ready_at) { + Ordering::Equal => other.payload.cmp(&self.payload), + o => o, + } + } +} +#[cfg(test)] +mod tests { + use std::{ + task::Poll, + time::{Duration, Instant}, + }; + + use assert_matches::assert_matches; + use futures::{future::poll_fn, pin_mut, Future}; + + use crate::LOG_TARGET; + + use super::{PendingWake, WaitingQueue}; + + #[test] + fn wait_ready_waits_for_earliest_event_always() { + sp_tracing::try_init_simple(); + let mut queue = WaitingQueue::new(); + let now = Instant::now(); + let start = now; + queue.push(PendingWake { payload: 1u32, ready_at: now + Duration::from_millis(3) }); + // Push another one in order: + queue.push(PendingWake { payload: 2u32, ready_at: now + Duration::from_millis(5) }); + // Push one out of order: + queue.push(PendingWake { payload: 0u32, ready_at: now + Duration::from_millis(1) }); + // Push another one at same timestamp (should become ready at the same time) + queue.push(PendingWake { payload: 10u32, ready_at: now + Duration::from_millis(1) }); + + futures::executor::block_on(async move { + // No time passed yet - nothing should be ready. + assert!(queue.pop_ready(now).is_none(), "No time has passed, nothing should be ready"); + + // Receive them in order at expected times: + queue.wait_ready(now).await; + gum::trace!(target: LOG_TARGET, "After first wait."); + + let now = start + Duration::from_millis(1); + assert!(Instant::now() - start >= Duration::from_millis(1)); + assert_eq!(queue.pop_ready(now).map(|p| p.payload), Some(0u32)); + // One more should be ready: + assert_eq!(queue.pop_ready(now).map(|p| p.payload), Some(10u32)); + assert!(queue.pop_ready(now).is_none(), "No more entry expected to be ready."); + + queue.wait_ready(now).await; + gum::trace!(target: LOG_TARGET, "After second wait."); + let now = start + Duration::from_millis(3); + assert!(Instant::now() - start >= Duration::from_millis(3)); + assert_eq!(queue.pop_ready(now).map(|p| p.payload), Some(1u32)); + assert!(queue.pop_ready(now).is_none(), "No more entry expected to be ready."); + + // Push in between wait: + poll_fn(|cx| { + let fut = queue.wait_ready(now); + pin_mut!(fut); + assert_matches!(fut.poll(cx), Poll::Pending); + Poll::Ready(()) + }) + .await; + queue.push(PendingWake { payload: 3u32, ready_at: start + Duration::from_millis(4) }); + + queue.wait_ready(now).await; + // Newly pushed element should have become ready: + gum::trace!(target: LOG_TARGET, "After third wait."); + let now = start + Duration::from_millis(4); + assert!(Instant::now() - start >= Duration::from_millis(4)); + assert_eq!(queue.pop_ready(now).map(|p| p.payload), Some(3u32)); + assert!(queue.pop_ready(now).is_none(), "No more entry expected to be ready."); + + queue.wait_ready(now).await; + gum::trace!(target: LOG_TARGET, "After fourth wait."); + let now = start + Duration::from_millis(5); + assert!(Instant::now() - start >= Duration::from_millis(5)); + assert_eq!(queue.pop_ready(now).map(|p| p.payload), Some(2u32)); + assert!(queue.pop_ready(now).is_none(), "No more entry expected to be ready."); + + // queue empty - should wait forever now: + poll_fn(|cx| { + let fut = queue.wait_ready(now); + pin_mut!(fut); + assert_matches!(fut.poll(cx), Poll::Pending); + Poll::Ready(()) + }) + .await; + }); + } +} diff --git a/node/network/dispute-distribution/src/receiver/error.rs b/node/network/dispute-distribution/src/receiver/error.rs index ce578cc8e0f9..4477335440d0 100644 --- a/node/network/dispute-distribution/src/receiver/error.rs +++ b/node/network/dispute-distribution/src/receiver/error.rs @@ -19,8 +19,10 @@ use fatality::Nested; +use gum::CandidateHash; use polkadot_node_network_protocol::{request_response::incoming, PeerId}; use polkadot_node_subsystem_util::runtime; +use polkadot_primitives::v2::AuthorityDiscoveryId; use crate::LOG_TARGET; @@ -35,8 +37,8 @@ pub enum Error { #[error("Retrieving next incoming request failed.")] IncomingRequest(#[from] incoming::Error), - #[error("Sending back response to peer {0} failed.")] - SendResponse(PeerId), + #[error("Sending back response to peers {0:#?} failed.")] + SendResponses(Vec), #[error("Changing peer's ({0}) reputation failed.")] SetPeerReputation(PeerId), @@ -44,16 +46,29 @@ pub enum Error { #[error("Dispute request with invalid signatures, from peer {0}.")] InvalidSignature(PeerId), - #[error("Import of dispute got canceled for peer {0} - import failed for some reason.")] - ImportCanceled(PeerId), + #[error("Received votes from peer {0} have been completely redundant.")] + RedundantMessage(PeerId), + + #[error("Import of dispute got canceled for candidate {0} - import failed for some reason.")] + ImportCanceled(CandidateHash), #[error("Peer {0} attempted to participate in dispute and is not a validator.")] NotAValidator(PeerId), + + #[error("Force flush for batch that could not be found attempted, candidate hash: {0}")] + ForceFlushBatchDoesNotExist(CandidateHash), + + // Should never happen in practice: + #[error("We needed to drop messages, because we reached limit on concurrent batches.")] + MaxBatchLimitReached, + + #[error("Authority {0} sent messages at a too high rate.")] + AuthorityFlooding(AuthorityDiscoveryId), } pub type Result = std::result::Result; -pub type JfyiErrorResult = std::result::Result; +pub type JfyiResult = std::result::Result; /// Utility for eating top level errors and log them. /// diff --git a/node/network/dispute-distribution/src/receiver/mod.rs b/node/network/dispute-distribution/src/receiver/mod.rs index 9193947e78d1..ee92100d190f 100644 --- a/node/network/dispute-distribution/src/receiver/mod.rs +++ b/node/network/dispute-distribution/src/receiver/mod.rs @@ -15,20 +15,20 @@ // along with Polkadot. If not, see . use std::{ - collections::HashSet, pin::Pin, task::{Context, Poll}, + time::Duration, }; use futures::{ channel::oneshot, - future::{poll_fn, BoxFuture}, + future::poll_fn, pin_mut, - stream::{FusedStream, FuturesUnordered, StreamExt}, - Future, FutureExt, Stream, + stream::{FuturesUnordered, StreamExt}, + Future, }; -use lru::LruCache; +use gum::CandidateHash; use polkadot_node_network_protocol::{ authority_discovery::AuthorityDiscovery, request_response::{ @@ -51,20 +51,47 @@ use crate::{ }; mod error; -use self::error::{log_error, JfyiError, JfyiErrorResult, Result}; + +/// Rate limiting queues for incoming requests by peers. +mod peer_queues; + +/// Batch imports together. +mod batches; + +use self::{ + batches::{Batches, FoundBatch, PreparedImport}, + error::{log_error, JfyiError, JfyiResult, Result}, + peer_queues::PeerQueues, +}; const COST_INVALID_REQUEST: Rep = Rep::CostMajor("Received message could not be decoded."); const COST_INVALID_SIGNATURE: Rep = Rep::Malicious("Signatures were invalid."); -const COST_INVALID_CANDIDATE: Rep = Rep::Malicious("Reported candidate was not available."); +const COST_INVALID_IMPORT: Rep = + Rep::Malicious("Import was deemed invalid by dispute-coordinator."); const COST_NOT_A_VALIDATOR: Rep = Rep::CostMajor("Reporting peer was not a validator."); +/// Mildly punish peers exceeding their rate limit. +/// +/// For honest peers this should rarely happen, but if it happens we would not want to disconnect +/// too quickly. Minor cost should suffice for disconnecting any real flooder. +const COST_APPARENT_FLOOD: Rep = Rep::CostMinor("Peer exceeded the rate limit."); -/// How many statement imports we want to issue in parallel: -pub const MAX_PARALLEL_IMPORTS: usize = 10; +/// How many votes must have arrived in the last `BATCH_COLLECTING_INTERVAL` +/// +/// in order for a batch to stay alive and not get flushed/imported to the dispute-coordinator. +/// +/// This ensures a timely import of batches. +#[cfg(not(test))] +pub const MIN_KEEP_BATCH_ALIVE_VOTES: u32 = 10; +#[cfg(test)] +pub const MIN_KEEP_BATCH_ALIVE_VOTES: u32 = 2; -/// State for handling incoming `DisputeRequest` messages. +/// Time we allow to pass for new votes to trickle in. /// -/// This is supposed to run as its own task in order to easily impose back pressure on the incoming -/// request channel and at the same time to drop flood messages as fast as possible. +/// See `MIN_KEEP_BATCH_ALIVE_VOTES` above. +/// Should be greater or equal to `RECEIVE_RATE_LIMIT` (there is no point in checking any faster). +pub const BATCH_COLLECTING_INTERVAL: Duration = Duration::from_millis(500); + +/// State for handling incoming `DisputeRequest` messages. pub struct DisputesReceiver { /// Access to session information. runtime: RuntimeInfo, @@ -75,18 +102,17 @@ pub struct DisputesReceiver { /// Channel to retrieve incoming requests from. receiver: IncomingRequestReceiver, + /// Rate limiting queue for each peer (only authorities). + peer_queues: PeerQueues, + + /// Currently active batches of imports per candidate. + batches: Batches, + /// Authority discovery service: authority_discovery: AD, - /// Imports currently being processed. - pending_imports: PendingImports, - - /// We keep record of the last banned peers. - /// - /// This is needed because once we ban a peer, we will very likely still have pending requests - /// in the incoming channel - we should not waste time recovering availability for those, as we - /// already know the peer is malicious. - banned_peers: LruCache, + /// Imports currently being processed by the `dispute-coordinator`. + pending_imports: FuturesUnordered, /// Log received requests. metrics: Metrics, @@ -100,36 +126,24 @@ enum MuxedMessage { /// /// - We need to make sure responses are actually sent (therefore we need to await futures /// promptly). - /// - We need to update `banned_peers` accordingly to the result. - ConfirmedImport(JfyiErrorResult<(PeerId, ImportStatementsResult)>), + /// - We need to punish peers whose import got rejected. + ConfirmedImport(ImportResult), /// A new request has arrived and should be handled. NewRequest(IncomingRequest), -} -impl MuxedMessage { - async fn receive( - pending_imports: &mut PendingImports, - pending_requests: &mut IncomingRequestReceiver, - ) -> Result { - poll_fn(|ctx| { - let next_req = pending_requests.recv(|| vec![COST_INVALID_REQUEST]); - pin_mut!(next_req); - if let Poll::Ready(r) = next_req.poll(ctx) { - return match r { - Err(e) => Poll::Ready(Err(incoming::Error::from(e).into())), - Ok(v) => Poll::Ready(Ok(Self::NewRequest(v))), - } - } - // In case of Ready(None) return `Pending` below - we want to wait for the next request - // in that case. - if let Poll::Ready(Some(v)) = pending_imports.poll_next_unpin(ctx) { - return Poll::Ready(Ok(Self::ConfirmedImport(v))) - } - Poll::Pending - }) - .await - } + /// Rate limit timer hit - is is time to process one row of messages. + /// + /// This is the result of calling self.peer_queues.pop_reqs(). + WakePeerQueuesPopReqs(Vec>), + + /// It is time to check batches. + /// + /// Every `BATCH_COLLECTING_INTERVAL` we check whether less than `MIN_KEEP_BATCH_ALIVE_VOTES` + /// new votes arrived, if so the batch is ready for import. + /// + /// This is the result of calling self.batches.check_batches(). + WakeCheckBatches(Vec), } impl DisputesReceiver @@ -152,11 +166,10 @@ where runtime, sender, receiver, + peer_queues: PeerQueues::new(), + batches: Batches::new(), authority_discovery, - pending_imports: PendingImports::new(), - // Size of MAX_PARALLEL_IMPORTS ensures we are going to immediately get rid of any - // malicious requests still pending in the incoming queue. - banned_peers: LruCache::new(MAX_PARALLEL_IMPORTS), + pending_imports: FuturesUnordered::new(), metrics, } } @@ -180,60 +193,132 @@ where } } - /// Actual work happening here. + /// Actual work happening here in three phases: + /// + /// 1. Receive and queue incoming messages until the rate limit timer hits. + /// 2. Do import/batching for the head of all queues. + /// 3. Check and flush any ready batches. async fn run_inner(&mut self) -> Result<()> { - let msg = MuxedMessage::receive(&mut self.pending_imports, &mut self.receiver).await?; + let msg = self.receive_message().await?; - let incoming = match msg { - // We need to clean up futures, to make sure responses are sent: - MuxedMessage::ConfirmedImport(m_bad) => { - self.ban_bad_peer(m_bad)?; - return Ok(()) + match msg { + MuxedMessage::NewRequest(req) => { + // Phase 1: + self.metrics.on_received_request(); + self.dispatch_to_queues(req).await?; }, - MuxedMessage::NewRequest(req) => req, - }; + MuxedMessage::WakePeerQueuesPopReqs(reqs) => { + // Phase 2: + for req in reqs { + // No early return - we cannot cancel imports of one peer, because the import of + // another failed: + match log_error(self.start_import_or_batch(req).await) { + Ok(()) => {}, + Err(fatal) => return Err(fatal.into()), + } + } + }, + MuxedMessage::WakeCheckBatches(ready_imports) => { + // Phase 3: + self.import_ready_batches(ready_imports).await; + }, + MuxedMessage::ConfirmedImport(import_result) => { + self.update_imported_requests_metrics(&import_result); + // Confirm imports to requesters/punish them on invalid imports: + send_responses_to_requesters(import_result).await?; + }, + } + + Ok(()) + } - self.metrics.on_received_request(); + /// Receive one `MuxedMessage`. + /// + /// + /// Dispatching events to messages as they happen. + async fn receive_message(&mut self) -> Result { + poll_fn(|ctx| { + // In case of Ready(None), we want to wait for pending requests: + if let Poll::Ready(Some(v)) = self.pending_imports.poll_next_unpin(ctx) { + return Poll::Ready(Ok(MuxedMessage::ConfirmedImport(v?))) + } + + let rate_limited = self.peer_queues.pop_reqs(); + pin_mut!(rate_limited); + // We poll rate_limit before batches, so we don't unnecessarily delay importing to + // batches. + if let Poll::Ready(reqs) = rate_limited.poll(ctx) { + return Poll::Ready(Ok(MuxedMessage::WakePeerQueuesPopReqs(reqs))) + } + + let ready_batches = self.batches.check_batches(); + pin_mut!(ready_batches); + if let Poll::Ready(ready_batches) = ready_batches.poll(ctx) { + return Poll::Ready(Ok(MuxedMessage::WakeCheckBatches(ready_batches))) + } - let peer = incoming.peer; + let next_req = self.receiver.recv(|| vec![COST_INVALID_REQUEST]); + pin_mut!(next_req); + if let Poll::Ready(r) = next_req.poll(ctx) { + return match r { + Err(e) => Poll::Ready(Err(incoming::Error::from(e).into())), + Ok(v) => Poll::Ready(Ok(MuxedMessage::NewRequest(v))), + } + } + Poll::Pending + }) + .await + } - // Only accept messages from validators: - if self.authority_discovery.get_authority_ids_by_peer_id(peer).await.is_none() { - incoming - .send_outgoing_response(OutgoingResponse { + /// Process incoming requests. + /// + /// - Check sender is authority + /// - Dispatch message to corresponding queue in `peer_queues`. + /// - If queue is full, drop message and change reputation of sender. + async fn dispatch_to_queues(&mut self, req: IncomingRequest) -> JfyiResult<()> { + let peer = req.peer; + // Only accept messages from validators, in case there are multiple `AuthorityId`s, we + // just take the first one. On session boundaries this might allow validators to double + // their rate limit for a short period of time, which seems acceptable. + let authority_id = match self + .authority_discovery + .get_authority_ids_by_peer_id(peer) + .await + .and_then(|s| s.into_iter().next()) + { + None => { + req.send_outgoing_response(OutgoingResponse { result: Err(()), reputation_changes: vec![COST_NOT_A_VALIDATOR], sent_feedback: None, }) - .map_err(|_| JfyiError::SendResponse(peer))?; - - return Err(JfyiError::NotAValidator(peer).into()) - } - - // Immediately drop requests from peers that already have requests in flight or have - // been banned recently (flood protection): - if self.pending_imports.peer_is_pending(&peer) || self.banned_peers.contains(&peer) { - gum::trace!( - target: LOG_TARGET, - ?peer, - "Dropping message from peer (banned/pending import)" - ); - return Ok(()) - } + .map_err(|_| JfyiError::SendResponses(vec![peer]))?; + return Err(JfyiError::NotAValidator(peer).into()) + }, + Some(auth_id) => auth_id, + }; - // Wait for a free slot: - if self.pending_imports.len() >= MAX_PARALLEL_IMPORTS as usize { - // Wait for one to finish: - let r = self.pending_imports.next().await; - self.ban_bad_peer(r.expect("pending_imports.len() is greater 0. qed."))?; + // Queue request: + if let Err((authority_id, req)) = self.peer_queues.push_req(authority_id, req) { + req.send_outgoing_response(OutgoingResponse { + result: Err(()), + reputation_changes: vec![COST_APPARENT_FLOOD], + sent_feedback: None, + }) + .map_err(|_| JfyiError::SendResponses(vec![peer]))?; + return Err(JfyiError::AuthorityFlooding(authority_id)) } - - // All good - initiate import. - self.start_import(incoming).await + Ok(()) } - /// Start importing votes for the given request. - async fn start_import(&mut self, incoming: IncomingRequest) -> Result<()> { + /// Start importing votes for the given request or batch. + /// + /// Signature check and in case we already have an existing batch we import to that batch, + /// otherwise import to `dispute-coordinator` directly and open a batch. + async fn start_import_or_batch( + &mut self, + incoming: IncomingRequest, + ) -> Result<()> { let IncomingRequest { peer, payload, pending_response } = incoming; let info = self @@ -263,128 +348,170 @@ where Ok(votes) => votes, }; - let (pending_confirmation, confirmation_rx) = oneshot::channel(); - self.sender - .send_message(DisputeCoordinatorMessage::ImportStatements { - candidate_receipt, - session: valid_vote.0.session_index(), - statements: vec![valid_vote, invalid_vote], - pending_confirmation: Some(pending_confirmation), - }) - .await; + let candidate_hash = *valid_vote.0.candidate_hash(); + + match self.batches.find_batch(candidate_hash, candidate_receipt)? { + FoundBatch::Created(batch) => { + // There was no entry yet - start import immediately: + gum::trace!( + target: LOG_TARGET, + ?candidate_hash, + "No batch yet - triggering immediate import" + ); + let import = PreparedImport { + candidate_receipt: batch.candidate_receipt().clone(), + statements: vec![valid_vote, invalid_vote], + requesters: vec![(peer, pending_response)], + }; + self.start_import(import).await; + }, + FoundBatch::Found(batch) => { + gum::trace!(target: LOG_TARGET, ?candidate_hash, "Batch exists - batching request"); + let batch_result = + batch.add_votes(valid_vote, invalid_vote, peer, pending_response); + + if let Err(pending_response) = batch_result { + // We don't expect honest peers to send redundant votes within a single batch, + // as the timeout for retry is much higher. Still we don't want to punish the + // node as it might not be the node's fault. Some other (malicious) node could have been + // faster sending the same votes in order to harm the reputation of that honest + // node. Given that we already have a rate limit, if a validator chooses to + // waste available rate with redundant votes - so be it. The actual dispute + // resolution is unaffected. + gum::debug!( + target: LOG_TARGET, + "Peer sent completely redundant votes within a single batch - that looks fishy!", + ); + pending_response + .send_outgoing_response(OutgoingResponse { + // While we have seen duplicate votes, we cannot confirm as we don't + // know yet whether the batch is going to be confirmed, so we assume + // the worst. We don't want to push the pending response to the batch + // either as that would be unbounded, only limited by the rate limit. + result: Err(()), + reputation_changes: Vec::new(), + sent_feedback: None, + }) + .map_err(|_| JfyiError::SendResponses(vec![peer]))?; + return Err(From::from(JfyiError::RedundantMessage(peer))) + } + }, + } - self.pending_imports.push(peer, confirmation_rx, pending_response); Ok(()) } - /// Await an import and ban any misbehaving peers. - /// - /// In addition we report import metrics. - fn ban_bad_peer( - &mut self, - result: JfyiErrorResult<(PeerId, ImportStatementsResult)>, - ) -> JfyiErrorResult<()> { - match result? { - (_, ImportStatementsResult::ValidImport) => { - self.metrics.on_imported(SUCCEEDED); - }, - (bad_peer, ImportStatementsResult::InvalidImport) => { - self.metrics.on_imported(FAILED); - self.banned_peers.put(bad_peer, ()); - }, + /// Trigger import into the dispute-coordinator of ready batches (`PreparedImport`s). + async fn import_ready_batches(&mut self, ready_imports: Vec) { + for import in ready_imports { + self.start_import(import).await; } - Ok(()) } -} -/// Manage pending imports in a way that preserves invariants. -struct PendingImports { - /// Futures in flight. - futures: - FuturesUnordered)>>, - /// Peers whose requests are currently in flight. - peers: HashSet, -} + /// Start import and add response receiver to `pending_imports`. + async fn start_import(&mut self, import: PreparedImport) { + let PreparedImport { candidate_receipt, statements, requesters } = import; + let (session_index, candidate_hash) = match statements.iter().next() { + None => { + gum::debug!( + target: LOG_TARGET, + candidate_hash = ?candidate_receipt.hash(), + "Not importing empty batch" + ); + return + }, + Some(vote) => (vote.0.session_index(), vote.0.candidate_hash().clone()), + }; -impl PendingImports { - pub fn new() -> Self { - Self { futures: FuturesUnordered::new(), peers: HashSet::new() } - } + let (pending_confirmation, confirmation_rx) = oneshot::channel(); + self.sender + .send_message(DisputeCoordinatorMessage::ImportStatements { + candidate_receipt, + session: session_index, + statements, + pending_confirmation: Some(pending_confirmation), + }) + .await; - pub fn push( - &mut self, - peer: PeerId, - handled: oneshot::Receiver, - pending_response: OutgoingResponseSender, - ) { - self.peers.insert(peer); - self.futures.push( - async move { - let r = respond_to_request(peer, handled, pending_response).await; - (peer, r) - } - .boxed(), - ) - } + let pending = + PendingImport { candidate_hash, requesters, pending_response: confirmation_rx }; - /// Returns the number of contained futures. - pub fn len(&self) -> usize { - self.futures.len() + self.pending_imports.push(pending); } - /// Check whether a peer has a pending import. - pub fn peer_is_pending(&self, peer: &PeerId) -> bool { - self.peers.contains(peer) + fn update_imported_requests_metrics(&self, result: &ImportResult) { + let label = match result.result { + ImportStatementsResult::ValidImport => SUCCEEDED, + ImportStatementsResult::InvalidImport => FAILED, + }; + self.metrics.on_imported(label, result.requesters.len()); } } -impl Stream for PendingImports { - type Item = JfyiErrorResult<(PeerId, ImportStatementsResult)>; - fn poll_next(mut self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { - match Pin::new(&mut self.futures).poll_next(ctx) { - Poll::Pending => Poll::Pending, - Poll::Ready(None) => Poll::Ready(None), - Poll::Ready(Some((peer, result))) => { - self.peers.remove(&peer); - Poll::Ready(Some(result.map(|r| (peer, r)))) - }, - } - } -} -impl FusedStream for PendingImports { - fn is_terminated(&self) -> bool { - self.futures.is_terminated() - } -} +async fn send_responses_to_requesters(import_result: ImportResult) -> JfyiResult<()> { + let ImportResult { requesters, result } = import_result; -// Future for `PendingImports` -// -// - Wait for import -// - Punish peer -// - Deliver result -async fn respond_to_request( - peer: PeerId, - handled: oneshot::Receiver, - pending_response: OutgoingResponseSender, -) -> JfyiErrorResult { - let result = handled.await.map_err(|_| JfyiError::ImportCanceled(peer))?; - - let response = match result { - ImportStatementsResult::ValidImport => OutgoingResponse { + let mk_response = match result { + ImportStatementsResult::ValidImport => || OutgoingResponse { result: Ok(DisputeResponse::Confirmed), reputation_changes: Vec::new(), sent_feedback: None, }, - ImportStatementsResult::InvalidImport => OutgoingResponse { + ImportStatementsResult::InvalidImport => || OutgoingResponse { result: Err(()), - reputation_changes: vec![COST_INVALID_CANDIDATE], + reputation_changes: vec![COST_INVALID_IMPORT], sent_feedback: None, }, }; - pending_response - .send_outgoing_response(response) - .map_err(|_| JfyiError::SendResponse(peer))?; + let mut sending_failed_for = Vec::new(); + for (peer, pending_response) in requesters { + if let Err(()) = pending_response.send_outgoing_response(mk_response()) { + sending_failed_for.push(peer); + } + } + + if !sending_failed_for.is_empty() { + Err(JfyiError::SendResponses(sending_failed_for)) + } else { + Ok(()) + } +} + +/// A future that resolves into an `ImportResult` when ready. +/// +/// This future is used on `dispute-coordinator` import messages for the oneshot response receiver +/// to: +/// - Keep track of concerned `CandidateHash` for reporting errors. +/// - Keep track of requesting peers so we can confirm the import/punish them on invalid imports. +struct PendingImport { + candidate_hash: CandidateHash, + requesters: Vec<(PeerId, OutgoingResponseSender)>, + pending_response: oneshot::Receiver, +} + +/// A `PendingImport` becomes an `ImportResult` once done. +struct ImportResult { + /// Requesters of that import. + requesters: Vec<(PeerId, OutgoingResponseSender)>, + /// Actual result of the import. + result: ImportStatementsResult, +} - Ok(result) +impl PendingImport { + async fn wait_for_result(&mut self) -> JfyiResult { + let result = (&mut self.pending_response) + .await + .map_err(|_| JfyiError::ImportCanceled(self.candidate_hash))?; + Ok(ImportResult { requesters: std::mem::take(&mut self.requesters), result }) + } +} + +impl Future for PendingImport { + type Output = JfyiResult; + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let fut = self.wait_for_result(); + pin_mut!(fut); + fut.poll(cx) + } } diff --git a/node/network/dispute-distribution/src/receiver/peer_queues.rs b/node/network/dispute-distribution/src/receiver/peer_queues.rs new file mode 100644 index 000000000000..1c53ee271f47 --- /dev/null +++ b/node/network/dispute-distribution/src/receiver/peer_queues.rs @@ -0,0 +1,141 @@ +// Copyright 2022 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use std::collections::{hash_map::Entry, HashMap, VecDeque}; + +use futures::future::pending; +use futures_timer::Delay; +use polkadot_node_network_protocol::request_response::{v1::DisputeRequest, IncomingRequest}; +use polkadot_primitives::v2::AuthorityDiscoveryId; + +use crate::RECEIVE_RATE_LIMIT; + +/// How many messages we are willing to queue per peer (validator). +/// +/// The larger this value is, the larger bursts are allowed to be without us dropping messages. On +/// the flip side this gets allocated per validator, so for a size of 10 this will result +/// in 10_000 * size_of(`IncomingRequest`) in the worst case. +/// +/// `PEER_QUEUE_CAPACITY` must not be 0 for obvious reasons. +#[cfg(not(test))] +pub const PEER_QUEUE_CAPACITY: usize = 10; +#[cfg(test)] +pub const PEER_QUEUE_CAPACITY: usize = 2; + +/// Queues for messages from authority peers for rate limiting. +/// +/// Invariants ensured: +/// +/// 1. No queue will ever have more than `PEER_QUEUE_CAPACITY` elements. +/// 2. There are no empty queues. Whenever a queue gets empty, it is removed. This way checking +/// whether there are any messages queued is cheap. +/// 3. As long as not empty, `pop_reqs` will, if called in sequence, not return `Ready` more often +/// than once for every `RECEIVE_RATE_LIMIT`, but it will always return Ready eventually. +/// 4. If empty `pop_reqs` will never return `Ready`, but will always be `Pending`. +pub struct PeerQueues { + /// Actual queues. + queues: HashMap>>, + + /// Delay timer for establishing the rate limit. + rate_limit_waker: Option, +} + +impl PeerQueues { + /// New empty `PeerQueues`. + pub fn new() -> Self { + Self { queues: HashMap::new(), rate_limit_waker: None } + } + + /// Push an incoming request for a given authority. + /// + /// Returns: `Ok(())` if succeeded, `Err((args))` if capacity is reached. + pub fn push_req( + &mut self, + peer: AuthorityDiscoveryId, + req: IncomingRequest, + ) -> Result<(), (AuthorityDiscoveryId, IncomingRequest)> { + let queue = match self.queues.entry(peer) { + Entry::Vacant(vacant) => vacant.insert(VecDeque::new()), + Entry::Occupied(occupied) => { + if occupied.get().len() >= PEER_QUEUE_CAPACITY { + return Err((occupied.key().clone(), req)) + } + occupied.into_mut() + }, + }; + queue.push_back(req); + + // We have at least one element to process - rate limit `waker` needs to exist now: + self.ensure_waker(); + Ok(()) + } + + /// Pop all heads and return them for processing. + /// + /// This gets one message from each peer that has sent at least one. + /// + /// This function is rate limited, if called in sequence it will not return more often than + /// every `RECEIVE_RATE_LIMIT`. + /// + /// NOTE: If empty this function will not return `Ready` at all, but will always be `Pending`. + pub async fn pop_reqs(&mut self) -> Vec> { + self.wait_for_waker().await; + + let mut heads = Vec::with_capacity(self.queues.len()); + let old_queues = std::mem::replace(&mut self.queues, HashMap::new()); + for (k, mut queue) in old_queues.into_iter() { + let front = queue.pop_front(); + debug_assert!(front.is_some(), "Invariant that queues are never empty is broken."); + + if let Some(front) = front { + heads.push(front); + } + if !queue.is_empty() { + self.queues.insert(k, queue); + } + } + + if !self.is_empty() { + // Still not empty - we should get woken at some point. + self.ensure_waker(); + } + + heads + } + + /// Whether or not all queues are empty. + pub fn is_empty(&self) -> bool { + self.queues.is_empty() + } + + /// Ensure there is an active `waker`. + /// + /// Checks whether one exists and if not creates one. + fn ensure_waker(&mut self) -> &mut Delay { + self.rate_limit_waker.get_or_insert(Delay::new(RECEIVE_RATE_LIMIT)) + } + + /// Wait for `waker` if it exists, or be `Pending` forever. + /// + /// Afterwards it gets set back to `None`. + async fn wait_for_waker(&mut self) { + match self.rate_limit_waker.as_mut() { + None => pending().await, + Some(waker) => waker.await, + } + self.rate_limit_waker = None; + } +} diff --git a/node/network/dispute-distribution/src/sender/mod.rs b/node/network/dispute-distribution/src/sender/mod.rs index 5312528b413e..09b902173ede 100644 --- a/node/network/dispute-distribution/src/sender/mod.rs +++ b/node/network/dispute-distribution/src/sender/mod.rs @@ -14,10 +14,21 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -use std::collections::{hash_map::Entry, HashMap, HashSet}; - -use futures::channel::{mpsc, oneshot}; - +use std::{ + collections::{HashMap, HashSet}, + pin::Pin, + task::Poll, + time::Duration, +}; + +use futures::{ + channel::{mpsc, oneshot}, + future::poll_fn, + Future, +}; + +use futures_timer::Delay; +use indexmap::{map::Entry, IndexMap}; use polkadot_node_network_protocol::request_response::v1::DisputeRequest; use polkadot_node_primitives::{CandidateVotes, DisputeMessage, SignedDisputeStatement}; use polkadot_node_subsystem::{messages::DisputeCoordinatorMessage, overseer, ActiveLeavesUpdate}; @@ -28,22 +39,27 @@ use polkadot_primitives::v2::{CandidateHash, DisputeStatement, Hash, SessionInde /// /// It is going to spawn real tasks as it sees fit for getting the votes of the particular dispute /// out. +/// +/// As we assume disputes have a priority, we start sending for disputes in the order +/// `start_sender` got called. mod send_task; use send_task::SendTask; pub use send_task::TaskFinish; -/// Error and [`Result`] type for sender +/// Error and [`Result`] type for sender. mod error; pub use error::{Error, FatalError, JfyiError, Result}; use self::error::JfyiErrorResult; -use crate::{Metrics, LOG_TARGET}; +use crate::{Metrics, LOG_TARGET, SEND_RATE_LIMIT}; /// The `DisputeSender` keeps track of all ongoing disputes we need to send statements out. /// /// For each dispute a `SendTask` is responsible for sending to the concerned validators for that /// particular dispute. The `DisputeSender` keeps track of those tasks, informs them about new /// sessions/validator sets and cleans them up when they become obsolete. +/// +/// The unit of work for the `DisputeSender` is a dispute, represented by `SendTask`s. pub struct DisputeSender { /// All heads we currently consider active. active_heads: Vec, @@ -54,11 +70,16 @@ pub struct DisputeSender { active_sessions: HashMap, /// All ongoing dispute sendings this subsystem is aware of. - disputes: HashMap, + /// + /// Using an `IndexMap` so items can be iterated in the order of insertion. + disputes: IndexMap, /// Sender to be cloned for `SendTask`s. tx: mpsc::Sender, + /// Future for delaying too frequent creation of dispute sending tasks. + rate_limit: RateLimit, + /// Metrics for reporting stats about sent requests. metrics: Metrics, } @@ -70,19 +91,25 @@ impl DisputeSender { Self { active_heads: Vec::new(), active_sessions: HashMap::new(), - disputes: HashMap::new(), + disputes: IndexMap::new(), tx, + rate_limit: RateLimit::new(), metrics, } } /// Create a `SendTask` for a particular new dispute. + /// + /// This function is rate-limited by `SEND_RATE_LIMIT`. It will block if called too frequently + /// in order to maintain the limit. pub async fn start_sender( &mut self, ctx: &mut Context, runtime: &mut RuntimeInfo, msg: DisputeMessage, ) -> Result<()> { + self.rate_limit.limit().await; + let req: DisputeRequest = msg.into(); let candidate_hash = req.0.candidate_receipt.hash(); match self.disputes.entry(candidate_hash) { @@ -112,6 +139,8 @@ impl DisputeSender { /// - Get new authorities to send messages to. /// - Get rid of obsolete tasks and disputes. /// - Get dispute sending started in case we missed one for some reason (e.g. on node startup) + /// + /// This function ensures the `SEND_RATE_LIMIT`, therefore it might block. pub async fn update_leaves( &mut self, ctx: &mut Context, @@ -134,21 +163,38 @@ impl DisputeSender { let active_disputes: HashSet<_> = active_disputes.into_iter().map(|(_, c)| c).collect(); - // Cleanup obsolete senders: + // Cleanup obsolete senders (retain keeps order of remaining elements): self.disputes .retain(|candidate_hash, _| active_disputes.contains(candidate_hash)); + // Iterates in order of insertion: + let mut should_rate_limit = true; for dispute in self.disputes.values_mut() { if have_new_sessions || dispute.has_failed_sends() { - dispute + if should_rate_limit { + self.rate_limit.limit().await; + } + let sends_happened = dispute .refresh_sends(ctx, runtime, &self.active_sessions, &self.metrics) .await?; + // Only rate limit if we actually sent something out _and_ it was not just because + // of errors on previous sends. + // + // Reasoning: It would not be acceptable to slow down the whole subsystem, just + // because of a few bad peers having problems. It is actually better to risk + // running into their rate limit in that case and accept a minor reputation change. + should_rate_limit = sends_happened && have_new_sessions; } } - // This should only be non-empty on startup, but if not - we got you covered: + // This should only be non-empty on startup, but if not - we got you covered. + // + // Initial order will not be maintained in that case, but that should be fine as disputes + // recovered at startup will be relatively "old" anyway and we assume that no more than a + // third of the validators will go offline at any point in time anyway. for dispute in unknown_disputes { - self.start_send_for_dispute(ctx, runtime, dispute).await? + self.rate_limit.limit().await; + self.start_send_for_dispute(ctx, runtime, dispute).await?; } Ok(()) } @@ -317,6 +363,46 @@ impl DisputeSender { } } +/// Rate limiting logic. +/// +/// Suitable for the sending side. +struct RateLimit { + limit: Delay, +} + +impl RateLimit { + /// Create new `RateLimit` that is immediately ready. + fn new() -> Self { + // Start with an empty duration, as there has not been any previous call. + Self { limit: Delay::new(Duration::new(0, 0)) } + } + + /// Initialized with actual `SEND_RATE_LIMIT` duration. + fn new_limit() -> Self { + Self { limit: Delay::new(SEND_RATE_LIMIT) } + } + + /// Wait until ready and prepare for next call. + async fn limit(&mut self) { + // Wait for rate limit and add some logging: + poll_fn(|cx| { + let old_limit = Pin::new(&mut self.limit); + match old_limit.poll(cx) { + Poll::Pending => { + gum::debug!( + target: LOG_TARGET, + "Sending rate limit hit, slowing down requests" + ); + Poll::Pending + }, + Poll::Ready(()) => Poll::Ready(()), + } + }) + .await; + *self = Self::new_limit(); + } +} + /// Retrieve the currently active sessions. /// /// List is all indices of all active sessions together with the head that was used for the query. diff --git a/node/network/dispute-distribution/src/sender/send_task.rs b/node/network/dispute-distribution/src/sender/send_task.rs index a2b8cdcf7441..89b5c099bde9 100644 --- a/node/network/dispute-distribution/src/sender/send_task.rs +++ b/node/network/dispute-distribution/src/sender/send_task.rs @@ -42,13 +42,15 @@ use crate::{ /// Delivery status for a particular dispute. /// /// Keeps track of all the validators that have to be reached for a dispute. +/// +/// The unit of work for a `SendTask` is an authority/validator. pub struct SendTask { - /// The request we are supposed to get out to all parachain validators of the dispute's session + /// The request we are supposed to get out to all `parachain` validators of the dispute's session /// and to all current authorities. request: DisputeRequest, /// The set of authorities we need to send our messages to. This set will change at session - /// boundaries. It will always be at least the parachain validators of the session where the + /// boundaries. It will always be at least the `parachain` validators of the session where the /// dispute happened and the authorities of the current sessions as determined by active heads. deliveries: HashMap, @@ -100,6 +102,10 @@ impl TaskResult { #[overseer::contextbounds(DisputeDistribution, prefix = self::overseer)] impl SendTask { /// Initiates sending a dispute message to peers. + /// + /// Creation of new `SendTask`s is subject to rate limiting. As each `SendTask` will trigger + /// sending a message to each validator, hence for employing a per-peer rate limit, we need to + /// limit the construction of new `SendTask`s. pub async fn new( ctx: &mut Context, runtime: &mut RuntimeInfo, @@ -118,15 +124,22 @@ impl SendTask { /// /// This function is called at construction and should also be called whenever a session change /// happens and on a regular basis to ensure we are retrying failed attempts. + /// + /// This might resend to validators and is thus subject to any rate limiting we might want. + /// Calls to this function for different instances should be rate limited according to + /// `SEND_RATE_LIMIT`. + /// + /// Returns: `True` if this call resulted in new requests. pub async fn refresh_sends( &mut self, ctx: &mut Context, runtime: &mut RuntimeInfo, active_sessions: &HashMap, metrics: &Metrics, - ) -> Result<()> { + ) -> Result { let new_authorities = self.get_relevant_validators(ctx, runtime, active_sessions).await?; + // Note this will also contain all authorities for which sending failed previously: let add_authorities = new_authorities .iter() .filter(|a| !self.deliveries.contains_key(a)) @@ -141,12 +154,14 @@ impl SendTask { send_requests(ctx, self.tx.clone(), add_authorities, self.request.clone(), metrics) .await?; + let was_empty = new_statuses.is_empty(); + self.has_failed_sends = false; self.deliveries.extend(new_statuses.into_iter()); - Ok(()) + Ok(!was_empty) } - /// Whether any sends have failed since the last refreshed. + /// Whether any sends have failed since the last refresh. pub fn has_failed_sends(&self) -> bool { self.has_failed_sends } @@ -193,9 +208,8 @@ impl SendTask { /// Determine all validators that should receive the given dispute requests. /// - /// This is all parachain validators of the session the candidate occurred and all authorities + /// This is all `parachain` validators of the session the candidate occurred and all authorities /// of all currently active sessions, determined by currently active heads. - async fn get_relevant_validators( &self, ctx: &mut Context, @@ -293,7 +307,7 @@ async fn wait_response_task( gum::debug!( target: LOG_TARGET, %err, - "Failed to notify susystem about dispute sending result." + "Failed to notify subsystem about dispute sending result." ); } } diff --git a/node/network/dispute-distribution/src/tests/mock.rs b/node/network/dispute-distribution/src/tests/mock.rs index 08428d5852cc..aa2a4485d480 100644 --- a/node/network/dispute-distribution/src/tests/mock.rs +++ b/node/network/dispute-distribution/src/tests/mock.rs @@ -20,6 +20,7 @@ use std::{ collections::{HashMap, HashSet}, sync::Arc, + time::Instant, }; use async_trait::async_trait; @@ -38,6 +39,8 @@ use polkadot_primitives::v2::{ }; use polkadot_primitives_test_helpers::dummy_candidate_descriptor; +use crate::LOG_TARGET; + pub const MOCK_SESSION_INDEX: SessionIndex = 1; pub const MOCK_NEXT_SESSION_INDEX: SessionIndex = 2; pub const MOCK_VALIDATORS: [Sr25519Keyring; 6] = [ @@ -54,6 +57,8 @@ pub const MOCK_AUTHORITIES_NEXT_SESSION: [Sr25519Keyring; 2] = pub const FERDIE_INDEX: ValidatorIndex = ValidatorIndex(0); pub const ALICE_INDEX: ValidatorIndex = ValidatorIndex(1); +pub const BOB_INDEX: ValidatorIndex = ValidatorIndex(2); +pub const CHARLIE_INDEX: ValidatorIndex = ValidatorIndex(3); lazy_static! { @@ -148,12 +153,22 @@ pub async fn make_dispute_message( invalid_validator: ValidatorIndex, ) -> DisputeMessage { let candidate_hash = candidate.hash(); + let before_request = Instant::now(); let valid_vote = make_explicit_signed(MOCK_VALIDATORS[valid_validator.0 as usize], candidate_hash, true) .await; + gum::trace!( + "Passed time for valid vote: {:#?}", + Instant::now().saturating_duration_since(before_request) + ); + let before_request = Instant::now(); let invalid_vote = make_explicit_signed(MOCK_VALIDATORS[invalid_validator.0 as usize], candidate_hash, false) .await; + gum::trace!( + "Passed time for invald vote: {:#?}", + Instant::now().saturating_duration_since(before_request) + ); DisputeMessage::from_signed_statements( valid_vote, valid_validator, @@ -206,10 +221,15 @@ impl AuthorityDiscovery for MockAuthorityDiscovery { ) -> Option> { for (a, p) in self.peer_ids.iter() { if p == &peer_id { - return Some(HashSet::from([MOCK_VALIDATORS_DISCOVERY_KEYS - .get(&a) - .unwrap() - .clone()])) + let result = + HashSet::from([MOCK_VALIDATORS_DISCOVERY_KEYS.get(&a).unwrap().clone()]); + gum::trace!( + target: LOG_TARGET, + %peer_id, + ?result, + "Returning authority ids for peer id" + ); + return Some(result) } } diff --git a/node/network/dispute-distribution/src/tests/mod.rs b/node/network/dispute-distribution/src/tests/mod.rs index 8ef8286ea197..56cdd467fd62 100644 --- a/node/network/dispute-distribution/src/tests/mod.rs +++ b/node/network/dispute-distribution/src/tests/mod.rs @@ -17,12 +17,17 @@ //! Subsystem unit tests -use std::{collections::HashSet, sync::Arc, task::Poll, time::Duration}; +use std::{ + collections::HashSet, + sync::Arc, + task::Poll, + time::{Duration, Instant}, +}; use assert_matches::assert_matches; use futures::{ channel::{mpsc, oneshot}, - future::poll_fn, + future::{poll_fn, ready}, pin_mut, Future, SinkExt, }; use futures_timer::Delay; @@ -52,7 +57,7 @@ use polkadot_node_subsystem_test_helpers::{ mock::make_ferdie_keystore, subsystem_test_harness, TestSubsystemContextHandle, }; use polkadot_primitives::v2::{ - AuthorityDiscoveryId, CandidateHash, Hash, SessionIndex, SessionInfo, + AuthorityDiscoveryId, CandidateHash, CandidateReceipt, Hash, SessionIndex, SessionInfo, }; use self::mock::{ @@ -60,7 +65,11 @@ use self::mock::{ MOCK_AUTHORITY_DISCOVERY, MOCK_NEXT_SESSION_INDEX, MOCK_NEXT_SESSION_INFO, MOCK_SESSION_INDEX, MOCK_SESSION_INFO, }; -use crate::{DisputeDistributionSubsystem, Metrics, LOG_TARGET}; +use crate::{ + receiver::BATCH_COLLECTING_INTERVAL, + tests::mock::{BOB_INDEX, CHARLIE_INDEX}, + DisputeDistributionSubsystem, Metrics, LOG_TARGET, SEND_RATE_LIMIT, +}; /// Useful mock providers. pub mod mock; @@ -72,49 +81,108 @@ fn send_dispute_sends_dispute() { let relay_parent = Hash::random(); let candidate = make_candidate_receipt(relay_parent); - let message = make_dispute_message(candidate.clone(), ALICE_INDEX, FERDIE_INDEX).await; - handle - .send(FromOrchestra::Communication { - msg: DisputeDistributionMessage::SendDispute(message.clone()), - }) - .await; + send_dispute(&mut handle, candidate, true).await; + conclude(&mut handle).await; + }; + test_harness(test); +} + +#[test] +fn send_honors_rate_limit() { + sp_tracing::try_init_simple(); + let test = |mut handle: TestSubsystemContextHandle, _req_cfg| async move { + let _ = handle_subsystem_startup(&mut handle, None).await; + + let relay_parent = Hash::random(); + let candidate = make_candidate_receipt(relay_parent); + let before_request = Instant::now(); + send_dispute(&mut handle, candidate, true).await; + // First send should not be rate limited: + gum::trace!("Passed time: {:#?}", Instant::now().saturating_duration_since(before_request)); + // This test would likely be flaky on CI: + //assert!(Instant::now().saturating_duration_since(before_request) < SEND_RATE_LIMIT); + + let relay_parent = Hash::random(); + let candidate = make_candidate_receipt(relay_parent); + send_dispute(&mut handle, candidate, false).await; + // Second send should be rate limited: + gum::trace!( + "Passed time for send_dispute: {:#?}", + Instant::now().saturating_duration_since(before_request) + ); + assert!(Instant::now() - before_request >= SEND_RATE_LIMIT); + conclude(&mut handle).await; + }; + test_harness(test); +} + +/// Helper for sending a new dispute to dispute-distribution sender and handling resulting messages. +async fn send_dispute( + handle: &mut TestSubsystemContextHandle, + candidate: CandidateReceipt, + needs_session_info: bool, +) { + let before_request = Instant::now(); + let message = make_dispute_message(candidate.clone(), ALICE_INDEX, FERDIE_INDEX).await; + gum::trace!( + "Passed time for making message: {:#?}", + Instant::now().saturating_duration_since(before_request) + ); + let before_request = Instant::now(); + handle + .send(FromOrchestra::Communication { + msg: DisputeDistributionMessage::SendDispute(message.clone()), + }) + .await; + gum::trace!( + "Passed time for sending message: {:#?}", + Instant::now().saturating_duration_since(before_request) + ); + if needs_session_info { // Requests needed session info: assert_matches!( - handle.recv().await, - AllMessages::RuntimeApi( - RuntimeApiMessage::Request( - hash, - RuntimeApiRequest::SessionInfo(session_index, tx) + handle.recv().await, + AllMessages::RuntimeApi( + RuntimeApiMessage::Request( + hash, + RuntimeApiRequest::SessionInfo(session_index, tx) ) ) => { - assert_eq!(session_index, MOCK_SESSION_INDEX); - assert_eq!( - hash, - message.candidate_receipt().descriptor.relay_parent + assert_eq!(session_index, MOCK_SESSION_INDEX); + assert_eq!( + hash, + message.candidate_receipt().descriptor.relay_parent ); - tx.send(Ok(Some(MOCK_SESSION_INFO.clone()))).expect("Receiver should stay alive."); - } + tx.send(Ok(Some(MOCK_SESSION_INFO.clone()))).expect("Receiver should stay alive."); + } ); + } - let expected_receivers = { - let info = &MOCK_SESSION_INFO; - info.discovery_keys - .clone() - .into_iter() - .filter(|a| a != &Sr25519Keyring::Ferdie.public().into()) - .collect() - // All validators are also authorities in the first session, so we are - // done here. - }; - check_sent_requests(&mut handle, expected_receivers, true).await; - - conclude(&mut handle).await; + let expected_receivers = { + let info = &MOCK_SESSION_INFO; + info.discovery_keys + .clone() + .into_iter() + .filter(|a| a != &Sr25519Keyring::Ferdie.public().into()) + .collect() + // All validators are also authorities in the first session, so we are + // done here. }; - test_harness(test); + check_sent_requests(handle, expected_receivers, true).await; } +// Things to test: +// x Request triggers import +// x Subsequent imports get batched +// x Batch gets flushed. +// x Batch gets renewed. +// x Non authority requests get dropped. +// x Sending rate limit is honored. +// x Receiving rate limit is honored. +// x Duplicate requests on batch are dropped + #[test] -fn received_request_triggers_import() { +fn received_non_authorities_are_dropped() { let test = |mut handle: TestSubsystemContextHandle, mut req_cfg: RequestResponseConfig| async move { let req_tx = req_cfg.inbound_queue.as_mut().unwrap(); @@ -140,110 +208,271 @@ fn received_request_triggers_import() { assert_eq!(reputation_changes.len(), 1); } ); + conclude(&mut handle).await; + }; + test_harness(test); +} + +#[test] +fn received_request_triggers_import() { + let test = |mut handle: TestSubsystemContextHandle, + mut req_cfg: RequestResponseConfig| async move { + let req_tx = req_cfg.inbound_queue.as_mut().unwrap(); + let _ = handle_subsystem_startup(&mut handle, None).await; + + let relay_parent = Hash::random(); + let candidate = make_candidate_receipt(relay_parent); + let message = make_dispute_message(candidate.clone(), ALICE_INDEX, FERDIE_INDEX).await; - // Nested valid and invalid import. - // - // Nested requests from same peer should get dropped. For the invalid request even - // subsequent requests should get dropped. nested_network_dispute_request( &mut handle, req_tx, MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Alice), message.clone().into(), - ImportStatementsResult::InvalidImport, + ImportStatementsResult::ValidImport, true, - move |handle, req_tx, message| { - nested_network_dispute_request( - handle, - req_tx, - MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Bob), - message.clone().into(), - ImportStatementsResult::ValidImport, - false, - move |_, req_tx, message| async move { - // Another request from Alice should get dropped (request already in - // flight): - { - let rx_response = send_network_dispute_request( - req_tx, - MOCK_AUTHORITY_DISCOVERY - .get_peer_id_by_authority(Sr25519Keyring::Alice), - message.clone(), - ) - .await; - - assert_matches!( - rx_response.await, - Err(err) => { - gum::trace!( - target: LOG_TARGET, - ?err, - "Request got dropped - other request already in flight" - ); - } - ); - } - // Another request from Bob should get dropped (request already in - // flight): - { - let rx_response = send_network_dispute_request( - req_tx, - MOCK_AUTHORITY_DISCOVERY - .get_peer_id_by_authority(Sr25519Keyring::Bob), - message.clone(), - ) - .await; - - assert_matches!( - rx_response.await, - Err(err) => { - gum::trace!( - target: LOG_TARGET, - ?err, - "Request got dropped - other request already in flight" - ); - } - ); - } - }, - ) - }, + move |_handle, _req_tx, _message| ready(()), ) .await; - // Subsequent sends from Alice should fail (peer is banned): - { - let rx_response = send_network_dispute_request( - req_tx, - MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Alice), - message.clone().into(), - ) - .await; + gum::trace!(target: LOG_TARGET, "Concluding."); + conclude(&mut handle).await; + }; + test_harness(test); +} + +#[test] +fn batching_works() { + let test = |mut handle: TestSubsystemContextHandle, + mut req_cfg: RequestResponseConfig| async move { + let req_tx = req_cfg.inbound_queue.as_mut().unwrap(); + let _ = handle_subsystem_startup(&mut handle, None).await; + + let relay_parent = Hash::random(); + let candidate = make_candidate_receipt(relay_parent); + let message = make_dispute_message(candidate.clone(), ALICE_INDEX, FERDIE_INDEX).await; + + // Initial request should get forwarded immediately: + nested_network_dispute_request( + &mut handle, + req_tx, + MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Alice), + message.clone().into(), + ImportStatementsResult::ValidImport, + true, + move |_handle, _req_tx, _message| ready(()), + ) + .await; + let mut rx_responses = Vec::new(); + + let message = make_dispute_message(candidate.clone(), BOB_INDEX, FERDIE_INDEX).await; + let peer = MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Bob); + rx_responses.push(send_network_dispute_request(req_tx, peer, message.clone().into()).await); + + let message = make_dispute_message(candidate.clone(), CHARLIE_INDEX, FERDIE_INDEX).await; + let peer = MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Charlie); + rx_responses.push(send_network_dispute_request(req_tx, peer, message.clone().into()).await); + gum::trace!("Imported 3 votes into batch"); + + Delay::new(BATCH_COLLECTING_INTERVAL).await; + gum::trace!("Batch should still be alive"); + // Batch should still be alive (2 new votes): + // Let's import two more votes, but fully duplicates - should not extend batch live. + gum::trace!("Importing duplicate votes"); + let mut rx_responses_duplicate = Vec::new(); + let message = make_dispute_message(candidate.clone(), BOB_INDEX, FERDIE_INDEX).await; + let peer = MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Bob); + rx_responses_duplicate + .push(send_network_dispute_request(req_tx, peer, message.clone().into()).await); + + let message = make_dispute_message(candidate.clone(), CHARLIE_INDEX, FERDIE_INDEX).await; + let peer = MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Charlie); + rx_responses_duplicate + .push(send_network_dispute_request(req_tx, peer, message.clone().into()).await); + + for rx_response in rx_responses_duplicate { assert_matches!( rx_response.await, - Err(err) => { + Ok(resp) => { + let sc_network::config::OutgoingResponse { + result, + reputation_changes, + sent_feedback: _, + } = resp; gum::trace!( target: LOG_TARGET, - ?err, - "Request got dropped - peer is banned." + ?reputation_changes, + "Received reputation changes." + ); + // We don't punish on that. + assert_eq!(reputation_changes.len(), 0); + + assert_matches!(result, Err(())); + } + ); + } + + Delay::new(BATCH_COLLECTING_INTERVAL).await; + gum::trace!("Batch should be ready now (only duplicates have been added)"); + + let pending_confirmation = assert_matches!( + handle.recv().await, + AllMessages::DisputeCoordinator( + DisputeCoordinatorMessage::ImportStatements { + candidate_receipt: _, + session, + statements, + pending_confirmation: Some(pending_confirmation), + } + ) => { + assert_eq!(session, MOCK_SESSION_INDEX); + assert_eq!(statements.len(), 3); + pending_confirmation + } + ); + pending_confirmation.send(ImportStatementsResult::ValidImport).unwrap(); + + for rx_response in rx_responses { + assert_matches!( + rx_response.await, + Ok(resp) => { + let sc_network::config::OutgoingResponse { + result, + reputation_changes: _, + sent_feedback, + } = resp; + + let result = result.unwrap(); + let decoded = + ::decode(&mut result.as_slice()).unwrap(); + + assert!(decoded == DisputeResponse::Confirmed); + if let Some(sent_feedback) = sent_feedback { + sent_feedback.send(()).unwrap(); + } + gum::trace!( + target: LOG_TARGET, + "Valid import happened." ); + } ); } - // But should work fine for Bob: + gum::trace!(target: LOG_TARGET, "Concluding."); + conclude(&mut handle).await; + }; + test_harness(test); +} + +#[test] +fn receive_rate_limit_is_enforced() { + let test = |mut handle: TestSubsystemContextHandle, + mut req_cfg: RequestResponseConfig| async move { + let req_tx = req_cfg.inbound_queue.as_mut().unwrap(); + let _ = handle_subsystem_startup(&mut handle, None).await; + + let relay_parent = Hash::random(); + let candidate = make_candidate_receipt(relay_parent); + let message = make_dispute_message(candidate.clone(), ALICE_INDEX, FERDIE_INDEX).await; + + // Initial request should get forwarded immediately: nested_network_dispute_request( &mut handle, req_tx, - MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Bob), + MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Alice), message.clone().into(), ImportStatementsResult::ValidImport, - false, - |_, _, _| async {}, + true, + move |_handle, _req_tx, _message| ready(()), ) .await; + let mut rx_responses = Vec::new(); + + let peer = MOCK_AUTHORITY_DISCOVERY.get_peer_id_by_authority(Sr25519Keyring::Bob); + + let message = make_dispute_message(candidate.clone(), BOB_INDEX, FERDIE_INDEX).await; + rx_responses.push(send_network_dispute_request(req_tx, peer, message.clone().into()).await); + + let message = make_dispute_message(candidate.clone(), CHARLIE_INDEX, FERDIE_INDEX).await; + rx_responses.push(send_network_dispute_request(req_tx, peer, message.clone().into()).await); + + gum::trace!("Import one too much:"); + + let message = make_dispute_message(candidate.clone(), CHARLIE_INDEX, ALICE_INDEX).await; + let rx_response_flood = + send_network_dispute_request(req_tx, peer, message.clone().into()).await; + + assert_matches!( + rx_response_flood.await, + Ok(resp) => { + let sc_network::config::OutgoingResponse { + result: _, + reputation_changes, + sent_feedback: _, + } = resp; + gum::trace!( + target: LOG_TARGET, + ?reputation_changes, + "Received reputation changes." + ); + // Received punishment for flood: + assert_eq!(reputation_changes.len(), 1); + } + ); + gum::trace!("Need to wait 2 patch intervals:"); + Delay::new(BATCH_COLLECTING_INTERVAL).await; + Delay::new(BATCH_COLLECTING_INTERVAL).await; + + gum::trace!("Batch should be ready now"); + + let pending_confirmation = assert_matches!( + handle.recv().await, + AllMessages::DisputeCoordinator( + DisputeCoordinatorMessage::ImportStatements { + candidate_receipt: _, + session, + statements, + pending_confirmation: Some(pending_confirmation), + } + ) => { + assert_eq!(session, MOCK_SESSION_INDEX); + // Only 3 as fourth was flood: + assert_eq!(statements.len(), 3); + pending_confirmation + } + ); + pending_confirmation.send(ImportStatementsResult::ValidImport).unwrap(); + + for rx_response in rx_responses { + assert_matches!( + rx_response.await, + Ok(resp) => { + let sc_network::config::OutgoingResponse { + result, + reputation_changes: _, + sent_feedback, + } = resp; + + let result = result.unwrap(); + let decoded = + ::decode(&mut result.as_slice()).unwrap(); + + assert!(decoded == DisputeResponse::Confirmed); + if let Some(sent_feedback) = sent_feedback { + sent_feedback.send(()).unwrap(); + } + gum::trace!( + target: LOG_TARGET, + "Valid import happened." + ); + + } + ); + } + gum::trace!(target: LOG_TARGET, "Concluding."); conclude(&mut handle).await; }; diff --git a/node/network/protocol/src/request_response/mod.rs b/node/network/protocol/src/request_response/mod.rs index fb955286990e..f4e0dd11f479 100644 --- a/node/network/protocol/src/request_response/mod.rs +++ b/node/network/protocol/src/request_response/mod.rs @@ -121,6 +121,10 @@ const POV_RESPONSE_SIZE: u64 = MAX_POV_SIZE as u64 + 10_000; /// This is `MAX_CODE_SIZE` plus some additional space for protocol overhead. const STATEMENT_RESPONSE_SIZE: u64 = MAX_CODE_SIZE as u64 + 10_000; +/// We can have relative large timeouts here, there is no value of hitting a +/// timeout as we want to get statements through to each node in any case. +pub const DISPUTE_REQUEST_TIMEOUT: Duration = Duration::from_secs(12); + impl Protocol { /// Get a configuration for a given Request response protocol. /// @@ -194,9 +198,7 @@ impl Protocol { /// Responses are just confirmation, in essence not even a bit. So 100 seems /// plenty. max_response_size: 100, - /// We can have relative large timeouts here, there is no value of hitting a - /// timeout as we want to get statements through to each node in any case. - request_timeout: Duration::from_secs(12), + request_timeout: DISPUTE_REQUEST_TIMEOUT, inbound_queue: Some(tx), }, }; diff --git a/node/primitives/src/disputes/mod.rs b/node/primitives/src/disputes/mod.rs index ec7bb6abc3b7..ee047c7bcc22 100644 --- a/node/primitives/src/disputes/mod.rs +++ b/node/primitives/src/disputes/mod.rs @@ -30,6 +30,8 @@ use polkadot_primitives::v2::{ /// `DisputeMessage` and related types. mod message; pub use message::{DisputeMessage, Error as DisputeMessageCheckError, UncheckedDisputeMessage}; +mod status; +pub use status::{dispute_is_inactive, DisputeStatus, Timestamp, ACTIVE_DURATION_SECS}; /// A checked dispute statement from an associated validator. #[derive(Debug, Clone)] diff --git a/node/primitives/src/disputes/status.rs b/node/primitives/src/disputes/status.rs new file mode 100644 index 000000000000..44aed9b78e20 --- /dev/null +++ b/node/primitives/src/disputes/status.rs @@ -0,0 +1,125 @@ +// Copyright 2017-2022 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use parity_scale_codec::{Decode, Encode}; + +/// Timestamp based on the 1 Jan 1970 UNIX base, which is persistent across node restarts and OS reboots. +pub type Timestamp = u64; + +/// The status of dispute. This is a state machine which can be altered by the +/// helper methods. +#[derive(Debug, Clone, Copy, Encode, Decode, PartialEq)] +pub enum DisputeStatus { + /// The dispute is active and unconcluded. + #[codec(index = 0)] + Active, + /// The dispute has been concluded in favor of the candidate + /// since the given timestamp. + #[codec(index = 1)] + ConcludedFor(Timestamp), + /// The dispute has been concluded against the candidate + /// since the given timestamp. + /// + /// This takes precedence over `ConcludedFor` in the case that + /// both are true, which is impossible unless a large amount of + /// validators are participating on both sides. + #[codec(index = 2)] + ConcludedAgainst(Timestamp), + /// Dispute has been confirmed (more than `byzantine_threshold` have already participated/ or + /// we have seen the candidate included already/participated successfully ourselves). + #[codec(index = 3)] + Confirmed, +} + +impl DisputeStatus { + /// Initialize the status to the active state. + pub fn active() -> DisputeStatus { + DisputeStatus::Active + } + + /// Move status to confirmed status, if not yet concluded/confirmed already. + pub fn confirm(self) -> DisputeStatus { + match self { + DisputeStatus::Active => DisputeStatus::Confirmed, + DisputeStatus::Confirmed => DisputeStatus::Confirmed, + DisputeStatus::ConcludedFor(_) | DisputeStatus::ConcludedAgainst(_) => self, + } + } + + /// Check whether the dispute is not a spam dispute. + pub fn is_confirmed_concluded(&self) -> bool { + match self { + &DisputeStatus::Confirmed | + &DisputeStatus::ConcludedFor(_) | + DisputeStatus::ConcludedAgainst(_) => true, + &DisputeStatus::Active => false, + } + } + + /// Transition the status to a new status after observing the dispute has concluded for the candidate. + /// This may be a no-op if the status was already concluded. + pub fn concluded_for(self, now: Timestamp) -> DisputeStatus { + match self { + DisputeStatus::Active | DisputeStatus::Confirmed => DisputeStatus::ConcludedFor(now), + DisputeStatus::ConcludedFor(at) => DisputeStatus::ConcludedFor(std::cmp::min(at, now)), + against => against, + } + } + + /// Transition the status to a new status after observing the dispute has concluded against the candidate. + /// This may be a no-op if the status was already concluded. + pub fn concluded_against(self, now: Timestamp) -> DisputeStatus { + match self { + DisputeStatus::Active | DisputeStatus::Confirmed => + DisputeStatus::ConcludedAgainst(now), + DisputeStatus::ConcludedFor(at) => + DisputeStatus::ConcludedAgainst(std::cmp::min(at, now)), + DisputeStatus::ConcludedAgainst(at) => + DisputeStatus::ConcludedAgainst(std::cmp::min(at, now)), + } + } + + /// Whether the disputed candidate is possibly invalid. + pub fn is_possibly_invalid(&self) -> bool { + match self { + DisputeStatus::Active | + DisputeStatus::Confirmed | + DisputeStatus::ConcludedAgainst(_) => true, + DisputeStatus::ConcludedFor(_) => false, + } + } + + /// Yields the timestamp this dispute concluded at, if any. + pub fn concluded_at(&self) -> Option { + match self { + DisputeStatus::Active | DisputeStatus::Confirmed => None, + DisputeStatus::ConcludedFor(at) | DisputeStatus::ConcludedAgainst(at) => Some(*at), + } + } +} + +/// The choice here is fairly arbitrary. But any dispute that concluded more than a few minutes ago +/// is not worth considering anymore. Changing this value has little to no bearing on consensus, +/// and really only affects the work that the node might do on startup during periods of many +/// disputes. +pub const ACTIVE_DURATION_SECS: Timestamp = 180; + +/// Returns true if the dispute has concluded for longer than ACTIVE_DURATION_SECS +pub fn dispute_is_inactive(status: &DisputeStatus, now: &Timestamp) -> bool { + let at = status.concluded_at(); + + at.is_some() && at.unwrap() + ACTIVE_DURATION_SECS < *now +} diff --git a/node/primitives/src/lib.rs b/node/primitives/src/lib.rs index 882b75a0e81f..895ed2732024 100644 --- a/node/primitives/src/lib.rs +++ b/node/primitives/src/lib.rs @@ -46,8 +46,9 @@ pub mod approval; /// Disputes related types. pub mod disputes; pub use disputes::{ - CandidateVotes, DisputeMessage, DisputeMessageCheckError, InvalidDisputeVote, - SignedDisputeStatement, UncheckedDisputeMessage, ValidDisputeVote, + dispute_is_inactive, CandidateVotes, DisputeMessage, DisputeMessageCheckError, DisputeStatus, + InvalidDisputeVote, SignedDisputeStatement, Timestamp, UncheckedDisputeMessage, + ValidDisputeVote, ACTIVE_DURATION_SECS, }; // For a 16-ary Merkle Prefix Trie, we can expect at most 16 32-byte hashes per node diff --git a/node/service/Cargo.toml b/node/service/Cargo.toml index a3873d9cf2da..43c246675762 100644 --- a/node/service/Cargo.toml +++ b/node/service/Cargo.toml @@ -200,5 +200,3 @@ runtime-metrics = [ "polkadot-runtime/runtime-metrics", "polkadot-runtime-parachains/runtime-metrics" ] - -staging-client = ["polkadot-node-core-provisioner/staging-client"] diff --git a/node/subsystem-types/src/messages.rs b/node/subsystem-types/src/messages.rs index 10a5201cc524..c37f773b3839 100644 --- a/node/subsystem-types/src/messages.rs +++ b/node/subsystem-types/src/messages.rs @@ -35,8 +35,8 @@ use polkadot_node_network_protocol::{ use polkadot_node_primitives::{ approval::{BlockApprovalMeta, IndirectAssignmentCert, IndirectSignedApprovalVote}, AvailableData, BabeEpoch, BlockWeight, CandidateVotes, CollationGenerationConfig, - CollationSecondedSignal, DisputeMessage, ErasureChunk, PoV, SignedDisputeStatement, - SignedFullStatement, ValidationResult, + CollationSecondedSignal, DisputeMessage, DisputeStatus, ErasureChunk, PoV, + SignedDisputeStatement, SignedFullStatement, ValidationResult, }; use polkadot_primitives::v2::{ AuthorityDiscoveryId, BackedCandidate, BlockNumber, CandidateEvent, CandidateHash, @@ -271,7 +271,7 @@ pub enum DisputeCoordinatorMessage { /// Fetch a list of all recent disputes the co-ordinator is aware of. /// These are disputes which have occurred any time in recent sessions, /// and which may have already concluded. - RecentDisputes(oneshot::Sender>), + RecentDisputes(oneshot::Sender>), /// Fetch a list of all active disputes that the coordinator is aware of. /// These disputes are either not yet concluded or recently concluded. ActiveDisputes(oneshot::Sender>), @@ -699,10 +699,15 @@ pub enum RuntimeApiRequest { OccupiedCoreAssumption, RuntimeApiSender>, ), - /// Returns all on-chain disputes at given block number. - StagingDisputes( - RuntimeApiSender)>>, - ), + /// Returns all on-chain disputes at given block number. Available in v3. + Disputes(RuntimeApiSender)>>), +} + +impl RuntimeApiRequest { + /// Runtime version requirements for each message + + /// `Disputes` + pub const DISPUTES_RUNTIME_REQUIREMENT: u32 = 3; } /// A message to the Runtime API subsystem. diff --git a/node/subsystem-types/src/runtime_client.rs b/node/subsystem-types/src/runtime_client.rs index 2aa9e2bffb82..259c94fd4e51 100644 --- a/node/subsystem-types/src/runtime_client.rs +++ b/node/subsystem-types/src/runtime_client.rs @@ -186,7 +186,7 @@ pub trait RuntimeApiSubsystemClient { /// Returns all onchain disputes. /// This is a staging method! Do not use on production runtimes! - async fn staging_get_disputes( + async fn disputes( &self, at: Hash, ) -> Result)>, ApiError>; @@ -375,10 +375,10 @@ where self.runtime_api().session_info_before_version_2(&BlockId::Hash(at), index) } - async fn staging_get_disputes( + async fn disputes( &self, at: Hash, ) -> Result)>, ApiError> { - self.runtime_api().staging_get_disputes(&BlockId::Hash(at)) + self.runtime_api().disputes(&BlockId::Hash(at)) } } diff --git a/primitives/src/lib.rs b/primitives/src/lib.rs index 121f7cb40d23..168b5795b040 100644 --- a/primitives/src/lib.rs +++ b/primitives/src/lib.rs @@ -22,9 +22,9 @@ // `v2` is currently the latest stable version of the runtime API. pub mod v2; -// The 'staging' version is special - while other versions are set in stone, -// the staging version is malleable. Once it's released, it gets the next -// version number. +// The 'staging' version is special - it contains primitives which are +// still in development. Once they are considered stable, they will be +// moved to a new versioned module. pub mod vstaging; // `runtime_api` contains the actual API implementation. It contains stable and diff --git a/primitives/src/runtime_api.rs b/primitives/src/runtime_api.rs index 84d2cf0ec4ca..d0d0b7220bb9 100644 --- a/primitives/src/runtime_api.rs +++ b/primitives/src/runtime_api.rs @@ -18,31 +18,97 @@ //! of the Runtime API exposed from the Runtime to the Host. //! //! The functions in trait ParachainHost` can be part of the stable API -//! (which is versioned) or they can be staging (aka unstable functions). +//! (which is versioned) or they can be staging (aka unstable/testing +//! functions). //! -//! All stable API functions should use primitives from the latest version. -//! In the time of writing of this document - this is v2. So for example: -//! ```ignore -//! fn validators() -> Vec; -//! ``` -//! indicates a function from the stable v2 API. +//! The separation outlined above is achieved with the versioned api feature +//! of `decl_runtime_apis!` and `impl_runtime_apis!`. Before moving on let's +//! see a quick example about how api versioning works. //! -//! On the other hand a staging function's name should be prefixed with -//! `staging_` like this: -//! ```ignore -//! fn staging_get_disputes() -> Vec<(vstaging::SessionIndex, vstaging::CandidateHash, vstaging::DisputeState)>; +//! # Runtime api versioning crash course +//! +//! The versioning is achieved with the `api_version` attribute. It can be +//! placed on: +//! * trait declaration - represents the base version of the api. +//! * method declaration (inside a trait declaration) - represents a versioned +//! method, which is not available in the base version. +//! * trait implementation - represents which version of the api is being +//! implemented. +//! +//! Let's see a quick example: +//! +//! ```rust(ignore) +//! sp_api::decl_runtime_apis! { +//! #[api_version(2)] +//! pub trait MyApi { +//! fn fn1(); +//! fn fn2(); +//! #[api_version(3)] +//! fn fn3(); +//! #[api_version(4)] +//! fn fn4(); +//! } +//! } +//! +//! struct Runtime {} +//! +//! sp_api::impl_runtime_apis! { +//! #[api_version(3)] +//! impl self::MyApi for Runtime { +//! fn fn1() {} +//! fn fn2() {} +//! fn fn3() {} +//! } +//! } //! ``` +//! A new api named `MyApi` is declared with `decl_runtime_apis!`. The trait declaration +//! has got an `api_version` attribute which represents its base version - 2 in this case. +//! +//! The api has got three methods - `fn1`, `fn2`, `fn3` and `fn4`. `fn3` and `fn4` has got +//! an `api_version` attribute which makes them versioned methods. These methods do not exist +//! in the base version of the api. Behind the scenes the declaration above creates three +//! runtime apis: +//! * MyApiV2 with `fn1` and `fn2` +//! * MyApiV3 with `fn1`, `fn2` and `fn3`. +//! * MyApiV4 with `fn1`, `fn2`, `fn3` and `fn4`. //! -//! How a staging function becomes stable? +//! Please note that v4 contains all methods from v3, v3 all methods from v2 and so on. //! -//! Once a staging function is ready to be versioned the `renamed` macro -//! should be used to rename it and version it. For the example above: +//! Back to our example. At the end runtime api is implemented for `struct Runtime` with +//! `impl_runtime_apis` macro. `api_version` attribute is attached to the impl block which +//! means that a version different from the base one is being implemented - in our case this +//! is v3. +//! +//! This version of the api contains three methods so the `impl` block has got definitions +//! for them. Note that `fn4` is not implemented as it is not part of this version of the api. +//! `impl_runtime_apis` generates a default implementation for it calling `unimplemented!()`. +//! +//! Hopefully this should be all you need to know in order to use versioned methods in the node. +//! For more details about how the api versioning works refer to `spi_api` +//! documentation [here](https://docs.substrate.io/rustdocs/latest/sp_api/macro.decl_runtime_apis.html). +//! +//! # How versioned methods are used for `ParachainHost` +//! +//! Let's introduce two types of `ParachainHost` api implementation: +//! * stable - used on stable production networks like Polkadot and Kusama. There is only one +//! stable api at a single point in time. +//! * staging - used on test networks like Westend or Rococo. Depending on the development needs +//! there can be zero, one or multiple staging apis. +//! +//! The stable version of `ParachainHost` is indicated by the base version of the api. Any staging +//! method must use `api_version` attribute so that it is assigned to a specific version of a +//! staging api. This way in a single declaration one can see what's the stable version of +//! `ParachainHost` and what staging versions/functions are available. +//! +//! All stable api functions should use primitives from the latest version. +//! In the time of writing of this document - this is v2. So for example: //! ```ignore -//! #[renamed("staging_get_session_disputes", 3)] -//! fn get_session_disputes() -> Vec<(v3::SessionIndex, v3::CandidateHash, v3::DisputeState)>; +//! fn validators() -> Vec; //! ``` -//! For more details about how the API versioning works refer to `spi_api` -//! documentation [here](https://docs.substrate.io/rustdocs/latest/sp_api/macro.decl_runtime_apis.html). +//! indicates a function from the stable v2 API. +//! +//! All staging api functions should use primitives from vstaging. They should be clearly separated +//! from the stable primitives. use crate::v2; use parity_scale_codec::{Decode, Encode}; @@ -153,7 +219,7 @@ sp_api::decl_runtime_apis! { /***** STAGING *****/ /// Returns all onchain disputes. - /// This is a staging method! Do not use on production runtimes! - fn staging_get_disputes() -> Vec<(v2::SessionIndex, v2::CandidateHash, v2::DisputeState)>; + #[api_version(3)] + fn disputes() -> Vec<(v2::SessionIndex, v2::CandidateHash, v2::DisputeState)>; } } diff --git a/primitives/src/vstaging/mod.rs b/primitives/src/vstaging/mod.rs index 2f29ffbe60b7..64671bd48a60 100644 --- a/primitives/src/vstaging/mod.rs +++ b/primitives/src/vstaging/mod.rs @@ -16,4 +16,4 @@ //! Staging Primitives. -// Put any primitives used by staging API functions here +// Put any primitives used by staging APIs functions here diff --git a/primitives/test-helpers/Cargo.toml b/primitives/test-helpers/Cargo.toml index dd5e2ded6ae0..944e32333fd9 100644 --- a/primitives/test-helpers/Cargo.toml +++ b/primitives/test-helpers/Cargo.toml @@ -8,5 +8,6 @@ edition = "2021" sp-keyring = { git = "https://github.com/paritytech/substrate", branch = "master" } sp-application-crypto = { package = "sp-application-crypto", git = "https://github.com/paritytech/substrate", branch = "master", default-features = false } sp-runtime = { git = "https://github.com/paritytech/substrate", branch = "master" } +sp-core = { git = "https://github.com/paritytech/substrate", branch = "master", features = ["std"] } polkadot-primitives = { path = "../" } rand = "0.8.5" diff --git a/primitives/test-helpers/src/lib.rs b/primitives/test-helpers/src/lib.rs index 02ba009b13cc..8873d69cdb2f 100644 --- a/primitives/test-helpers/src/lib.rs +++ b/primitives/test-helpers/src/lib.rs @@ -255,3 +255,7 @@ impl rand::RngCore for AlwaysZeroRng { Ok(()) } } + +pub fn dummy_signature() -> polkadot_primitives::v2::ValidatorSignature { + sp_core::crypto::UncheckedFrom::unchecked_from([1u8; 64]) +} diff --git a/roadmap/implementers-guide/src/node/disputes/dispute-distribution.md b/roadmap/implementers-guide/src/node/disputes/dispute-distribution.md index b63ea2bdcbf0..0bd204934ee3 100644 --- a/roadmap/implementers-guide/src/node/disputes/dispute-distribution.md +++ b/roadmap/implementers-guide/src/node/disputes/dispute-distribution.md @@ -15,6 +15,13 @@ This design should result in a protocol that is: ## Protocol +Distributing disputes needs to be a reliable protocol. We would like to make as +sure as possible that our vote got properly delivered to all concerned +validators. For this to work, this subsystem won't be gossip based, but instead +will use a request/response protocol for application level confirmations. The +request will be the payload (the actual votes/statements), the response will +be the confirmation. See [below][#wire-format]. + ### Input [`DisputeDistributionMessage`][DisputeDistributionMessage] @@ -107,16 +114,7 @@ struct VotesResponse { } ``` -## Functionality - -Distributing disputes needs to be a reliable protocol. We would like to make as -sure as possible that our vote got properly delivered to all concerned -validators. For this to work, this subsystem won't be gossip based, but instead -will use a request/response protocol for application level confirmations. The -request will be the payload (the actual votes/statements), the response will -be the confirmation. See [above][#wire-format]. - -### Starting a Dispute +## Starting a Dispute A dispute is initiated once a node sends the first `DisputeRequest` wire message, which must contain an "invalid" vote and a "valid" vote. @@ -132,7 +130,7 @@ conflicting votes available, hence we have a valid dispute. Nodes will still need to check whether the disputing votes are somewhat current and not some stale ones. -### Participating in a Dispute +## Participating in a Dispute Upon receiving a `DisputeRequest` message, a dispute distribution will trigger the import of the received votes via the dispute coordinator @@ -144,13 +142,13 @@ except that if the local node deemed the candidate valid, the `SendDispute` message will contain a valid vote signed by our node and will contain the initially received `Invalid` vote. -Note, that we rely on the coordinator to check availability for spam protection -(see below). +Note, that we rely on `dispute-coordinator` to check validity of a dispute for spam +protection (see below). -### Sending of messages +## Sending of messages Starting and participating in a dispute are pretty similar from the perspective -of dispute distribution. Once we receive a `SendDispute` message we try to make +of dispute distribution. Once we receive a `SendDispute` message, we try to make sure to get the data out. We keep track of all the parachain validators that should see the message, which are all the parachain validators of the session where the dispute happened as they will want to participate in the dispute. In @@ -159,114 +157,185 @@ session (which might be the same or not and may change during the dispute). Those authorities will not participate in the dispute, but need to see the statements so they can include them in blocks. -We keep track of connected parachain validators and authorities and will issue -warnings in the logs if connected nodes are less than two thirds of the -corresponding sets. We also only consider a message transmitted, once we -received a confirmation message. If not, we will keep retrying getting that -message out as long as the dispute is deemed alive. To determine whether a -dispute is still alive we will issue a +### Reliability + +We only consider a message transmitted, once we received a confirmation message. +If not, we will keep retrying getting that message out as long as the dispute is +deemed alive. To determine whether a dispute is still alive we will ask the +`dispute-coordinator` for a list of all still active disputes via a `DisputeCoordinatorMessage::ActiveDisputes` message before each retry run. Once a dispute is no longer live, we will clean up the state accordingly. -### Reception & Spam Considerations - -Because we are not forwarding foreign statements, spam is less of an issue in -comparison to gossip based systems. Rate limiting should be implemented at the -substrate level, see -[#7750](https://github.com/paritytech/substrate/issues/7750). Still we should -make sure that it is not possible via spamming to prevent a dispute concluding -or worse from getting noticed. - -Considered attack vectors: - -1. Invalid disputes (candidate does not exist) could make us - run out of resources. E.g. if we recorded every statement, we could run out - of disk space eventually. -2. An attacker can just flood us with notifications on any notification - protocol, assuming flood protection is not effective enough, our unbounded - buffers can fill up and we will run out of memory eventually. -3. An attacker could participate in a valid dispute, but send its votes multiple - times. -4. Attackers could spam us at a high rate with invalid disputes. Our incoming - queue of requests could get monopolized by those malicious requests and we - won't be able to import any valid disputes and we could run out of resources, - if we tried to process them all in parallel. - -For tackling 1, we make sure to not occupy resources before we don't know a -candidate is available. So we will not record statements to disk until we -recovered availability for the candidate or know by some other means that the -dispute is legit. - -For 2, we will pick up on any dispute on restart, so assuming that any realistic -memory filling attack will take some time, we should be able to participate in a -dispute under such attacks. - -Importing/discarding redundant votes should be pretty quick, so measures with -regards to 4 should suffice to prevent 3, from doing any real harm. - -For 4, full monopolization of the incoming queue should not be possible assuming -substrate handles incoming requests in a somewhat fair way. Still we want some -defense mechanisms, at the very least we need to make sure to not exhaust -resources. - -The dispute coordinator will notify us on import about unavailable candidates or -otherwise invalid imports and we can disconnect from such peers/decrease their -reputation drastically. This alone should get us quite far with regards to queue -monopolization, as availability recovery is expected to fail relatively quickly -for unavailable data. - -Still if those spam messages come at a very high rate, we might still run out of -resources if we immediately call `DisputeCoordinatorMessage::ImportStatements` -on each one of them. Secondly with our assumption of 1/3 dishonest validators, -getting rid of all of them will take some time, depending on reputation timeouts -some of them might even be able to reconnect eventually. - -To mitigate those issues we will process dispute messages with a maximum -parallelism `N`. We initiate import processes for up to `N` candidates in -parallel. Once we reached `N` parallel requests we will start back pressuring on -the incoming requests. This saves us from resource exhaustion. - -To reduce impact of malicious nodes further, we can keep track from which nodes the -currently importing statements came from and will drop requests from nodes that -already have imports in flight. - -Honest nodes are not expected to send dispute statements at a high rate, but -even if they did: - -- we will import at least the first one and if it is valid it will trigger a - dispute, preventing finality. -- Chances are good that the first sent candidate from a peer is indeed the - oldest one (if they differ in age at all). -- for the dropped request any honest node will retry sending. -- there will be other nodes notifying us about that dispute as well. -- honest votes have a speed advantage on average. Apart from the very first - dispute statement for a candidate, which might cause the availability recovery - process, imports of honest votes will be super fast, while for spam imports - they will always take some time as we have to wait for availability to fail. - -So this general rate limit, that we drop requests from same peers if they come -faster than we can import the statements should not cause any problems for -honest nodes and is in their favor. - -Size of `N`: The larger `N` the better we can handle distributed flood attacks -(see previous paragraph), but we also get potentially more availability recovery -processes happening at the same time, which slows down the individual processes. -And we rather want to have one finish quickly than lots slowly at the same time. -On the other hand, valid disputes are expected to be rare, so if we ever exhaust -`N` it is very likely that this is caused by spam and spam recoveries don't cost -too much bandwidth due to empty responses. - -Considering that an attacker would need to attack many nodes in parallel to have -any effect, an `N` of 10 seems to be a good compromise. For honest requests, most -of those imports will likely concern the same candidate, and for dishonest ones -we get to disconnect from up to ten colluding adversaries at a time. - -For the size of the channel for incoming requests: Due to dropping of repeated -requests from same nodes we can make the channel relatively large without fear -of lots of spam requests sitting there wasting our time, even after we already -blocked a peer. For valid disputes, incoming requests can become bursty. On the -other hand we will also be very quick in processing them. A channel size of 100 -requests seems plenty and should be able to handle bursts adequately. +### Order + +We assume `SendDispute` messages are coming in an order of importance, hence +`dispute-distribution` will make sure to send out network messages in the same +order, even on retry. + +### Rate Limit + +For spam protection (see below), we employ an artificial rate limiting on sending +out messages in order to not hit the rate limit at the receiving side, which +would result in our messages getting dropped and our reputation getting reduced. + +## Reception + +As we shall see the receiving side is mostly about handling spam and ensuring +the dispute-coordinator learns about disputes as fast as possible. + +Goals for the receiving side: + +1. Get new disputes to the dispute-coordinator as fast as possible, so + prioritization can happen properly. +2. Batch votes per disputes as much as possible for good import performance. +3. Prevent malicious nodes exhausting node resources by sending lots of messages. +4. Prevent malicious nodes from sending so many messages/(fake) disputes, + preventing us from concluding good ones. +5. Limit ability of malicious nodes of delaying the vote import due to batching + logic. + +Goal 1 and 2 seem to be conflicting, but an easy compromise is possible: When +learning about a new dispute, we will import the vote immediately, making the +dispute coordinator aware and also getting immediate feedback on the validity. +Then if valid we can batch further incoming votes, with less time constraints as +the dispute-coordinator already knows about the dispute. + +Goal 3 and 4 are obviously very related and both can easily be solved via rate +limiting as we shall see below. Rate limits should already be implemented at the +substrate level, but [are not](https://github.com/paritytech/substrate/issues/7750) +at the time of writing. But even if they were, the enforced substrate limits would +likely not be configurable and thus would still be to high for our needs as we can +rely on the following observations: + +1. Each honest validator will only send one message (apart from duplicates on + timeout) per candidate/dispute. +2. An honest validator needs to fully recover availability and validate the + candidate for casting a vote. + +With these two observations, we can conclude that honest validators will usually +not send messages at a high rate. We can therefore enforce conservative rate +limits and thus minimize harm spamming malicious nodes can have. + +Before we dive into how rate limiting solves all spam issues elegantly, let's +discuss that honest behaviour further: + +What about session changes? Here we might have to inform a new validator set of +lots of already existing disputes at once. + +With observation 1) and a rate limit that is per peer, we are still good: + +Let's assume a rate limit of one message per 200ms per sender. This means 5 +messages from each validator per second. 5 messages means 5 disputes! +Conclusively, we will be able to conclude 5 disputes per second - no matter what +malicious actors are doing. This is assuming dispute messages are sent ordered, +but even if not perfectly ordered: On average it will be 5 disputes per second. + +This is good enough! All those disputes are valid ones and will result in +slashing and disabling of validators. Let's assume all of them conclude `valid`, +and we disable validators only after 100 raised concluding valid disputes, we +would still start disabling misbehaving validators in only 20 seconds. + +One could also think that in addition participation is expected to take longer, +which means on average we can import/conclude disputes faster than they are +generated - regardless of dispute spam. Unfortunately this is not necessarily +true: There might be parachains with very light load where recovery and +validation can be accomplished very quickly - maybe faster than we can import +those disputes. + +This is probably an argument for not imposing a too low rate limit, although the +issue is more general: Even without any rate limit, if an attacker generates +disputes at a very high rate, nodes will be having trouble keeping participation +up, hence the problem should be mitigated at a [more fundamental +layer](https://github.com/paritytech/polkadot/issues/5898). + +For nodes that have been offline for a while, the same argument as for session +changes holds, but matters even less: We assume 2/3 of nodes to be online, so +even if the worst case 1/3 offline happens and they could not import votes fast +enough (as argued above, they in fact can) it would not matter for consensus. + +### Rate Limiting + +As suggested previously, rate limiting allows to mitigate all threats that come +from malicious actors trying to overwhelm the system in order to get away without +a slash, when it comes to dispute-distribution. In this section we will explain +how in greater detail. + +The idea is to open a queue with limited size for each peer. We will process +incoming messages as fast as we can by doing the following: + +1. Check that the sending peer is actually a valid authority - otherwise drop + message and decrease reputation/disconnect. +2. Put message on the peer's queue, if queue is full - drop it. + +Every `RATE_LIMIT` seconds (or rather milliseconds), we pause processing +incoming requests to go a full circle and process one message from each queue. +Processing means `Batching` as explained in the next section. + +### Batching + +To achieve goal 2 we will batch incoming votes/messages together before passing +them on as a single batch to the `dispute-coordinator`. To adhere to goal 1 as +well, we will do the following: + +1. For an incoming message, we check whether we have an existing batch for that + candidate, if not we import directly to the dispute-coordinator, as we have + to assume this is concerning a new dispute. +2. We open a batch and start collecting incoming messages for that candidate, + instead of immediately forwarding. +4. We keep collecting votes in the batch until we received less than + `MIN_KEEP_BATCH_ALIVE_VOTES` unique votes in the last `BATCH_COLLECTING_INTERVAL`. This is + important to accommodate for goal 5 and also 3. +5. We send the whole batch to the dispute-coordinator. + +This together with rate limiting explained above ensures we will be able to +process valid disputes: We can limit the number of simultaneous existing batches +to some high value, but can be rather certain that this limit will never be +reached - hence we won't drop valid disputes: + +Let's assume `MIN_KEEP_BATCH_ALIVE_VOTES` is 10, `BATCH_COLLECTING_INTERVAL` +is `500ms` and above `RATE_LIMIT` is `100ms`. 1/3 of validators are malicious, +so for 1000 this means around 330 malicious actors worst case. + +All those actors can send a message every `100ms`, that is 10 per second. This +means at the begining of an attack they can open up around 3300 batches. Each +containing two votes. So memory usage is still negligible. In reality it is even +less, as we also demand 10 new votes to trickle in per batch in order to keep it +alive, every `500ms`. Hence for the first second, each batch requires 20 votes +each. Each message is 2 votes, so this means 10 messages per batch. Hence to +keep those batches alive 10 attackers are needed for each batch. This reduces +the number of opened batches by a factor of 10: So we only have 330 batches in 1 +second - each containing 20 votes. + +The next second: In order to further grow memory usage, attackers have to +maintain 10 messages per batch and second. Number of batches equals the number +of attackers, each has 10 messages per second, all are needed to maintain the +batches in memory. Therefore we have a hard cap of around 330 (number of +malicious nodes) open batches. Each can be filled with number of malicious +actor's votes. So 330 batches with each 330 votes: Let's assume approximately 100 +bytes per signature/vote. This results in a worst case memory usage of 330 * 330 +* 100 ~= 10 MiB. + +For 10_000 validators, we are already in the Gigabyte range, which means that +with a validator set that large we might want to be more strict with the rate limit or +require a larger rate of incoming votes per batch to keep them alive. + +For a thousand validators a limit on batches of around 1000 should never be +reached in practice. Hence due to rate limiting we have a very good chance to +not ever having to drop a potential valid dispute due to some resource limit. + +Further safe guards are possible: The dispute-coordinator actually +confirms/denies imports. So once we receive a denial by the dispute-coordinator +for the initial imported votes, we can opt into flushing the batch immediately +and importing the votes. This swaps memory usage for more CPU usage, but if that +import is deemed invalid again we can immediately decrease the reputation of the +sending peers, so this should be a net win. For the time being we punt on this +for simplicity. + +Instead of filling batches to maximize memory usage, attackers could also try to +overwhelm the dispute coordinator by only sending votes for new candidates all +the time. This attack vector is mitigated also by above rate limit and +decreasing the peer's reputation on denial of the invalid imports by the +coordinator. ### Node Startup diff --git a/runtime/kusama/src/lib.rs b/runtime/kusama/src/lib.rs index 250dcfa6868a..330040b343f1 100644 --- a/runtime/kusama/src/lib.rs +++ b/runtime/kusama/src/lib.rs @@ -23,11 +23,10 @@ use pallet_transaction_payment::CurrencyAdapter; use parity_scale_codec::{Decode, Encode, MaxEncodedLen}; use primitives::v2::{ - AccountId, AccountIndex, Balance, BlockNumber, CandidateEvent, CandidateHash, - CommittedCandidateReceipt, CoreState, DisputeState, GroupRotationInfo, Hash, Id as ParaId, - InboundDownwardMessage, InboundHrmpMessage, Moment, Nonce, OccupiedCoreAssumption, - PersistedValidationData, ScrapedOnChainVotes, SessionInfo, Signature, ValidationCode, - ValidationCodeHash, ValidatorId, ValidatorIndex, + AccountId, AccountIndex, Balance, BlockNumber, CandidateEvent, CommittedCandidateReceipt, + CoreState, GroupRotationInfo, Hash, Id as ParaId, InboundDownwardMessage, InboundHrmpMessage, + Moment, Nonce, OccupiedCoreAssumption, PersistedValidationData, ScrapedOnChainVotes, + SessionInfo, Signature, ValidationCode, ValidationCodeHash, ValidatorId, ValidatorIndex, }; use runtime_common::{ auctions, claims, crowdloan, impl_runtime_weights, impls::DealWithFees, paras_registrar, @@ -1670,10 +1669,6 @@ sp_api::impl_runtime_apis! { { parachains_runtime_api_impl::validation_code_hash::(para_id, assumption) } - - fn staging_get_disputes() -> Vec<(SessionIndex, CandidateHash, DisputeState)> { - unimplemented!() - } } impl beefy_primitives::BeefyApi for Runtime { diff --git a/runtime/parachains/Cargo.toml b/runtime/parachains/Cargo.toml index b913adcf1639..becbf26e68e0 100644 --- a/runtime/parachains/Cargo.toml +++ b/runtime/parachains/Cargo.toml @@ -109,4 +109,3 @@ try-runtime = [ "pallet-vesting/try-runtime", ] runtime-metrics = ["sp-tracing/with-tracing", "polkadot-runtime-metrics/runtime-metrics"] -vstaging = [] diff --git a/runtime/parachains/src/runtime_api_impl/mod.rs b/runtime/parachains/src/runtime_api_impl/mod.rs index 603b6c4cb385..c045b4747868 100644 --- a/runtime/parachains/src/runtime_api_impl/mod.rs +++ b/runtime/parachains/src/runtime_api_impl/mod.rs @@ -17,9 +17,14 @@ //! Runtime API implementations for Parachains. //! //! These are exposed as different modules using different sets of primitives. -//! At the moment there is only a v2 module and it is not completely clear how migration -//! to a v2 would be done. - +//! At the moment there is a v2 module for the current stable api and +//! vstaging module for all staging methods. +//! When new version of the stable api is released it will be based on v2 and +//! will contain methods from vstaging. +//! The promotion consists of the following steps: +//! 1. Bump the version of the stable module (e.g. v2 becomes v3) +//! 2. Move methods from vstaging to v3. The new stable version should include +//! all methods from vstaging tagged with the new version number (e.g. all +//! v3 methods). pub mod v2; -#[cfg(feature = "vstaging")] pub mod vstaging; diff --git a/runtime/parachains/src/runtime_api_impl/vstaging.rs b/runtime/parachains/src/runtime_api_impl/vstaging.rs index 8715cdc53121..7ae235c8133a 100644 --- a/runtime/parachains/src/runtime_api_impl/vstaging.rs +++ b/runtime/parachains/src/runtime_api_impl/vstaging.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -// Put implementations of functions from staging API here. +//! Put implementations of functions from staging APIs here. use crate::disputes; use primitives::v2::{CandidateHash, DisputeState, SessionIndex}; diff --git a/runtime/polkadot/src/lib.rs b/runtime/polkadot/src/lib.rs index d429ea849915..c0fc2ced9f23 100644 --- a/runtime/polkadot/src/lib.rs +++ b/runtime/polkadot/src/lib.rs @@ -54,11 +54,10 @@ use pallet_session::historical as session_historical; use pallet_transaction_payment::{FeeDetails, RuntimeDispatchInfo}; use parity_scale_codec::{Decode, Encode, MaxEncodedLen}; use primitives::v2::{ - AccountId, AccountIndex, Balance, BlockNumber, CandidateEvent, CandidateHash, - CommittedCandidateReceipt, CoreState, DisputeState, GroupRotationInfo, Hash, Id as ParaId, - InboundDownwardMessage, InboundHrmpMessage, Moment, Nonce, OccupiedCoreAssumption, - PersistedValidationData, ScrapedOnChainVotes, SessionInfo, Signature, ValidationCode, - ValidationCodeHash, ValidatorId, ValidatorIndex, + AccountId, AccountIndex, Balance, BlockNumber, CandidateEvent, CommittedCandidateReceipt, + CoreState, GroupRotationInfo, Hash, Id as ParaId, InboundDownwardMessage, InboundHrmpMessage, + Moment, Nonce, OccupiedCoreAssumption, PersistedValidationData, ScrapedOnChainVotes, + SessionInfo, Signature, ValidationCode, ValidationCodeHash, ValidatorId, ValidatorIndex, }; use sp_core::OpaqueMetadata; use sp_mmr_primitives as mmr; @@ -1799,10 +1798,6 @@ sp_api::impl_runtime_apis! { { parachains_runtime_api_impl::validation_code_hash::(para_id, assumption) } - - fn staging_get_disputes() -> Vec<(SessionIndex, CandidateHash, DisputeState)> { - unimplemented!() - } } impl beefy_primitives::BeefyApi for Runtime { diff --git a/runtime/rococo/src/lib.rs b/runtime/rococo/src/lib.rs index 14f67b162774..64ad3646e6e1 100644 --- a/runtime/rococo/src/lib.rs +++ b/runtime/rococo/src/lib.rs @@ -1004,6 +1004,7 @@ sp_api::impl_runtime_apis! { } } + #[api_version(3)] impl primitives::runtime_api::ParachainHost for Runtime { fn validators() -> Vec { runtime_api_impl::validators::() @@ -1099,8 +1100,8 @@ sp_api::impl_runtime_apis! { runtime_api_impl::validation_code_hash::(para_id, assumption) } - fn staging_get_disputes() -> Vec<(SessionIndex, CandidateHash, DisputeState)> { - unimplemented!() + fn disputes() -> Vec<(SessionIndex, CandidateHash, DisputeState)> { + runtime_parachains::runtime_api_impl::vstaging::get_session_disputes::() } } diff --git a/runtime/test-runtime/Cargo.toml b/runtime/test-runtime/Cargo.toml index 2313be141443..3779390b2365 100644 --- a/runtime/test-runtime/Cargo.toml +++ b/runtime/test-runtime/Cargo.toml @@ -58,7 +58,7 @@ runtime-common = { package = "polkadot-runtime-common", path = "../common", defa primitives = { package = "polkadot-primitives", path = "../../primitives", default-features = false } pallet-xcm = { path = "../../xcm/pallet-xcm", default-features = false } polkadot-parachain = { path = "../../parachain", default-features = false } -polkadot-runtime-parachains = { path = "../parachains", default-features = false, features = ["vstaging"]} +polkadot-runtime-parachains = { path = "../parachains", default-features = false } xcm-builder = { path = "../../xcm/xcm-builder", default-features = false } xcm-executor = { path = "../../xcm/xcm-executor", default-features = false } xcm = { path = "../../xcm", default-features = false } diff --git a/runtime/test-runtime/src/lib.rs b/runtime/test-runtime/src/lib.rs index d18a2c9bb95c..f2b84b5f0616 100644 --- a/runtime/test-runtime/src/lib.rs +++ b/runtime/test-runtime/src/lib.rs @@ -45,12 +45,11 @@ use pallet_session::historical as session_historical; use pallet_transaction_payment::{FeeDetails, RuntimeDispatchInfo}; use polkadot_runtime_parachains::reward_points::RewardValidatorsWithEraPoints; use primitives::v2::{ - AccountId, AccountIndex, Balance, BlockNumber, CandidateEvent, CandidateHash, - CommittedCandidateReceipt, CoreState, DisputeState, GroupRotationInfo, Hash as HashT, - Id as ParaId, InboundDownwardMessage, InboundHrmpMessage, Moment, Nonce, - OccupiedCoreAssumption, PersistedValidationData, ScrapedOnChainVotes, - SessionInfo as SessionInfoData, Signature, ValidationCode, ValidationCodeHash, ValidatorId, - ValidatorIndex, + AccountId, AccountIndex, Balance, BlockNumber, CandidateEvent, CommittedCandidateReceipt, + CoreState, GroupRotationInfo, Hash as HashT, Id as ParaId, InboundDownwardMessage, + InboundHrmpMessage, Moment, Nonce, OccupiedCoreAssumption, PersistedValidationData, + ScrapedOnChainVotes, SessionInfo as SessionInfoData, Signature, ValidationCode, + ValidationCodeHash, ValidatorId, ValidatorIndex, }; use runtime_common::{ claims, impl_runtime_weights, paras_sudo_wrapper, BlockHashCount, BlockLength, @@ -905,10 +904,6 @@ sp_api::impl_runtime_apis! { { runtime_impl::validation_code_hash::(para_id, assumption) } - - fn staging_get_disputes() -> Vec<(SessionIndex, CandidateHash, DisputeState)> { - polkadot_runtime_parachains::runtime_api_impl::vstaging::get_session_disputes::() - } } impl beefy_primitives::BeefyApi for Runtime { diff --git a/runtime/westend/Cargo.toml b/runtime/westend/Cargo.toml index 0e0a6dbce3f7..3509cfaeda1f 100644 --- a/runtime/westend/Cargo.toml +++ b/runtime/westend/Cargo.toml @@ -87,7 +87,7 @@ hex-literal = { version = "0.3.4", optional = true } runtime-common = { package = "polkadot-runtime-common", path = "../common", default-features = false } primitives = { package = "polkadot-primitives", path = "../../primitives", default-features = false } polkadot-parachain = { path = "../../parachain", default-features = false } -runtime-parachains = { package = "polkadot-runtime-parachains", path = "../parachains", default-features = false, features = ["vstaging"] } +runtime-parachains = { package = "polkadot-runtime-parachains", path = "../parachains", default-features = false } xcm = { package = "xcm", path = "../../xcm", default-features = false } xcm-executor = { package = "xcm-executor", path = "../../xcm/xcm-executor", default-features = false } diff --git a/runtime/westend/src/lib.rs b/runtime/westend/src/lib.rs index 9bdde2ea5f38..6d08049910e3 100644 --- a/runtime/westend/src/lib.rs +++ b/runtime/westend/src/lib.rs @@ -1276,6 +1276,7 @@ sp_api::impl_runtime_apis! { } } + #[api_version(3)] impl primitives::runtime_api::ParachainHost for Runtime { fn validators() -> Vec { parachains_runtime_api_impl::validators::() @@ -1374,7 +1375,7 @@ sp_api::impl_runtime_apis! { parachains_runtime_api_impl::validation_code_hash::(para_id, assumption) } - fn staging_get_disputes() -> Vec<(SessionIndex, CandidateHash, DisputeState)> { + fn disputes() -> Vec<(SessionIndex, CandidateHash, DisputeState)> { runtime_parachains::runtime_api_impl::vstaging::get_session_disputes::() } }