Skip to content

Commit

Permalink
kvserver: also block LEARNER snaps to paused followers
Browse files Browse the repository at this point in the history
We checked whether the snapshot recipient was paused only in the
raft log queue path. By pushing the check down into `sendSnapshot`,
it is now hit by any snapshot attempt, which includes the replicate
queue and store rebalancer. For best results, both of these should
avoid moving replicas to paused followers in the first place, which
they already do, at least partially, so this change shouldn't have
much of an impact in practice.

Fixes #85479.

Release note: None
  • Loading branch information
tbg committed Aug 8, 2022
1 parent ce8bfd4 commit 04481d8
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 18 deletions.
12 changes: 0 additions & 12 deletions pkg/kv/kvserver/raft_snapshot_queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,18 +139,6 @@ func (rq *raftSnapshotQueue) processRaftSnapshot(
return false, nil
}
}
repl.mu.RLock()
_, destPaused := repl.mu.pausedFollowers[id]
repl.mu.RUnlock()
if ioThresh := repl.store.ioOverloadedStores.Load()[repDesc.StoreID]; ioThresh != nil && destPaused {
// If the destination is paused, be more hesitant to send snapshots. The destination being
// paused implies that we have recently checked that it's not required for quorum, and that
// we wish to conserve I/O on that store, which sending a snapshot counteracts. So hold back on
// the snapshot as well.
err := errors.Errorf("skipping snapshot; %s is overloaded: %s", repDesc, ioThresh)
repl.reportSnapshotStatus(ctx, repDesc.ReplicaID, err)
return false, err
}

err := repl.sendSnapshot(ctx, repDesc, snapType, kvserverpb.SnapshotRequest_RECOVERY)

Expand Down
21 changes: 15 additions & 6 deletions pkg/kv/kvserver/replica_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -2592,10 +2592,23 @@ func (r *Replica) sendSnapshot(
r.reportSnapshotStatus(ctx, recipient.ReplicaID, retErr)
}()

sender, err := r.GetReplicaDescriptor()
r.mu.RLock()
sender, err := r.getReplicaDescriptorRLocked()
_, destPaused := r.mu.pausedFollowers[recipient.ReplicaID]
r.mu.RUnlock()

if err != nil {
return err
}

if ioThresh := r.store.ioOverloadedStores.Load()[recipient.StoreID]; ioThresh != nil && destPaused {
// If the destination is paused, be more hesitant to send snapshots. The destination being
// paused implies that we have recently checked that it's not required for quorum, and that
// we wish to conserve I/O on that store, which sending a snapshot counteracts. So hold back on
// the snapshot as well.
return errors.Errorf("skipping snapshot; %s is overloaded: %s", recipient, ioThresh)
}

// Check follower snapshots cluster setting.
if followerSnapshotsEnabled.Get(&r.ClusterSettings().SV) {
sender, err = r.getSenderReplica(ctx)
Expand All @@ -2607,10 +2620,6 @@ func (r *Replica) sendSnapshot(
log.VEventf(
ctx, 2, "delegating snapshot transmission for %v to %v", recipient, sender,
)
desc, err := r.GetReplicaDescriptor()
if err != nil {
return err
}
status := r.RaftStatus()
if status == nil {
// This code path is sometimes hit during scatter for replicas that
Expand All @@ -2621,7 +2630,7 @@ func (r *Replica) sendSnapshot(
// Create new delegate snapshot request with only required metadata.
delegateRequest := &kvserverpb.DelegateSnapshotRequest{
RangeID: r.RangeID,
CoordinatorReplica: desc,
CoordinatorReplica: sender,
RecipientReplica: recipient,
Priority: priority,
Type: snapType,
Expand Down

0 comments on commit 04481d8

Please sign in to comment.