From 04481d8fbe069435d5a77dcd29a0e90433f41d2b Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 8 Aug 2022 13:13:33 +0200 Subject: [PATCH] kvserver: also block LEARNER snaps to paused followers We checked whether the snapshot recipient was paused only in the raft log queue path. By pushing the check down into `sendSnapshot`, it is now hit by any snapshot attempt, which includes the replicate queue and store rebalancer. For best results, both of these should avoid moving replicas to paused followers in the first place, which they already do, at least partially, so this change shouldn't have much of an impact in practice. Fixes https://github.com/cockroachdb/cockroach/issues/85479. Release note: None --- pkg/kv/kvserver/raft_snapshot_queue.go | 12 ------------ pkg/kv/kvserver/replica_command.go | 21 +++++++++++++++------ 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/pkg/kv/kvserver/raft_snapshot_queue.go b/pkg/kv/kvserver/raft_snapshot_queue.go index 223747f98e8d..61419699ecec 100644 --- a/pkg/kv/kvserver/raft_snapshot_queue.go +++ b/pkg/kv/kvserver/raft_snapshot_queue.go @@ -139,18 +139,6 @@ func (rq *raftSnapshotQueue) processRaftSnapshot( return false, nil } } - repl.mu.RLock() - _, destPaused := repl.mu.pausedFollowers[id] - repl.mu.RUnlock() - if ioThresh := repl.store.ioOverloadedStores.Load()[repDesc.StoreID]; ioThresh != nil && destPaused { - // If the destination is paused, be more hesitant to send snapshots. The destination being - // paused implies that we have recently checked that it's not required for quorum, and that - // we wish to conserve I/O on that store, which sending a snapshot counteracts. So hold back on - // the snapshot as well. - err := errors.Errorf("skipping snapshot; %s is overloaded: %s", repDesc, ioThresh) - repl.reportSnapshotStatus(ctx, repDesc.ReplicaID, err) - return false, err - } err := repl.sendSnapshot(ctx, repDesc, snapType, kvserverpb.SnapshotRequest_RECOVERY) diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go index 5b65e682e968..f48c2423dd6b 100644 --- a/pkg/kv/kvserver/replica_command.go +++ b/pkg/kv/kvserver/replica_command.go @@ -2592,10 +2592,23 @@ func (r *Replica) sendSnapshot( r.reportSnapshotStatus(ctx, recipient.ReplicaID, retErr) }() - sender, err := r.GetReplicaDescriptor() + r.mu.RLock() + sender, err := r.getReplicaDescriptorRLocked() + _, destPaused := r.mu.pausedFollowers[recipient.ReplicaID] + r.mu.RUnlock() + if err != nil { return err } + + if ioThresh := r.store.ioOverloadedStores.Load()[recipient.StoreID]; ioThresh != nil && destPaused { + // If the destination is paused, be more hesitant to send snapshots. The destination being + // paused implies that we have recently checked that it's not required for quorum, and that + // we wish to conserve I/O on that store, which sending a snapshot counteracts. So hold back on + // the snapshot as well. + return errors.Errorf("skipping snapshot; %s is overloaded: %s", recipient, ioThresh) + } + // Check follower snapshots cluster setting. if followerSnapshotsEnabled.Get(&r.ClusterSettings().SV) { sender, err = r.getSenderReplica(ctx) @@ -2607,10 +2620,6 @@ func (r *Replica) sendSnapshot( log.VEventf( ctx, 2, "delegating snapshot transmission for %v to %v", recipient, sender, ) - desc, err := r.GetReplicaDescriptor() - if err != nil { - return err - } status := r.RaftStatus() if status == nil { // This code path is sometimes hit during scatter for replicas that @@ -2621,7 +2630,7 @@ func (r *Replica) sendSnapshot( // Create new delegate snapshot request with only required metadata. delegateRequest := &kvserverpb.DelegateSnapshotRequest{ RangeID: r.RangeID, - CoordinatorReplica: desc, + CoordinatorReplica: sender, RecipientReplica: recipient, Priority: priority, Type: snapType,