Commit 2ab0f5b
storage: delay manual splits that would result in more snapshots
When a Range has followers that aren't replicating properly, splitting that range results in a right-hand side with followers in a similar state. Certain workloads (restore/import/presplit) can run large numbers of splits against a given range, and this can result in a large number of Raft snapshots that backs up the Raft snapshot queue.

Ideally we'd never have any ranges that require a snapshot, but over the last weeks it has become clear that this is very difficult to achieve since the knowledge required to decide whether a snapshot can efficiently be prevented is distributed across multiple nodes that don't share the necessary information.

This commit is a bit of a nuclear option to prevent the likely last big culprit in large numbers of Raft snapshots in cockroachdb#31409. With this change, we should expect to see Raft snapshots regularly when a split/scatter phase of an import/restore is active, but never large volumes at once (except perhaps for an initial spike).

Splits are delayed only for manual splits. In particular, the split queue is not affected and could in theory cause Raft snapshots. However, at the present juncture, adding delays in the split queue could cause problems as well, so we retain the previous behavior there, which isn't known to have caused problems.

More follow-up work in the area of Raft snapshots will be necessary to add some more sanity to this area of the code.

Release note (bug fix): resolve a cluster degradation scenario that could occur during IMPORT/RESTORE operations, manifested through a high number of pending Raft snapshots.
1 parent 65bcf5b commit 2ab0f5b

6 files changed: +364 -3 lines

pkg/base/config.go (+25)

@@ -490,7 +490,20 @@ type RaftConfig struct {
 	// single raft.Ready operation.
 	RaftMaxInflightMsgs int
 
+	// When a Replica with an empty log (i.e. last index zero), drop rejecting
+	// MsgAppResp for the first few ticks to allow the split trigger to perform
+	// the split.
+	//
+	// -1 to disable.
 	RaftPostSplitSuppressSnapshotTicks int
+	// Splitting a range which has a replica needing a snapshot results in two
+	// ranges in that state. The delay configured here slows down splits when in
+	// that situation (limiting to those splits not run through the split
+	// queue). The most important target here are the splits performed by
+	// backup/restore.
+	//
+	// -1 to disable.
+	RaftDelaySplitToSuppressSnapshotTicks int
 }
 
 // SetDefaults initializes unset fields.
@@ -532,6 +545,18 @@ func (cfg *RaftConfig) SetDefaults() {
 	if cfg.RaftPostSplitSuppressSnapshotTicks == 0 {
 		cfg.RaftPostSplitSuppressSnapshotTicks = defaultRaftPostSplitSuppressSnapshotTicks
 	}
+
+	if cfg.RaftDelaySplitToSuppressSnapshotTicks == 0 {
+		// The Raft Ticks interval defaults to 200ms, and
+		// RaftPostSplitSuppressSnapshotTicks to 20 ticks. A total of 120 ticks is
+		// ~24s which experimentally has been shown to allow the small pile (<100)
+		// of Raft snapshots observed at the beginning of an import/restore to be
+		// resolved.
+		cfg.RaftDelaySplitToSuppressSnapshotTicks = 100
+		if cfg.RaftPostSplitSuppressSnapshotTicks > 0 {
+			cfg.RaftDelaySplitToSuppressSnapshotTicks += cfg.RaftPostSplitSuppressSnapshotTicks
+		}
+	}
 }
 
 // RaftElectionTimeout returns the raft election timeout, as computed from the
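As a quick sanity check of the arithmetic in the SetDefaults comment above, here is a minimal standalone Go sketch; the 200ms tick interval and the 20-tick RaftPostSplitSuppressSnapshotTicks default are the values cited in that comment, not constants defined in this sketch.

package main

import (
	"fmt"
	"time"
)

func main() {
	raftTickInterval := 200 * time.Millisecond // default cited in the comment above
	postSplitSuppressTicks := 20               // default RaftPostSplitSuppressSnapshotTicks cited above

	// Mirrors SetDefaults: 100 base ticks, plus the suppression window when enabled.
	delaySplitTicks := 100
	if postSplitSuppressTicks > 0 {
		delaySplitTicks += postSplitSuppressTicks
	}

	fmt.Println(time.Duration(delaySplitTicks) * raftTickInterval) // prints 24s
}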

pkg/storage/client_split_test.go (+3)

@@ -1417,6 +1417,9 @@ func runSetupSplitSnapshotRace(
 	sc.TestingKnobs.DisableAsyncIntentResolution = true
 	// Avoid fighting with the merge queue while trying to reproduce this race.
 	sc.TestingKnobs.DisableMergeQueue = true
+	// Disable the split delay mechanism, or it'll spend 10s going in circles.
+	// (We can't set it to zero as otherwise the default overrides us).
+	sc.RaftDelaySplitToSuppressSnapshotTicks = -1
 	mtc := &multiTestContext{storeConfig: &sc}
 	defer mtc.Stop()
 	mtc.Start(t, 6)

pkg/storage/replica_command.go (+10 -3)

@@ -170,7 +170,7 @@ func (r *Replica) AdminSplit(
 		return roachpb.AdminSplitResponse{}, pErr
 	}
 
-	reply, lastErr = r.adminSplitWithDescriptor(ctx, args, r.Desc())
+	reply, lastErr = r.adminSplitWithDescriptor(ctx, args, r.Desc(), true /* delayable */)
 	// On seeing a ConditionFailedError or an AmbiguousResultError, retry
 	// the command with the updated descriptor.
 	if retry := causer.Visit(lastErr, func(err error) bool {
@@ -258,7 +258,10 @@ func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) strin
 //
 // See the comment on splitTrigger for details on the complexities.
 func (r *Replica) adminSplitWithDescriptor(
-	ctx context.Context, args roachpb.AdminSplitRequest, desc *roachpb.RangeDescriptor,
+	ctx context.Context,
+	args roachpb.AdminSplitRequest,
+	desc *roachpb.RangeDescriptor,
+	delayable bool,
 ) (roachpb.AdminSplitResponse, error) {
 	var reply roachpb.AdminSplitResponse
 
@@ -337,7 +340,11 @@
 	}
 	leftDesc.EndKey = splitKey
 
-	extra := splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
+	var extra string
+	if delayable {
+		extra += maybeDelaySplitToAvoidSnapshot(ctx, (*splitDelayHelper)(r))
+	}
+	extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
 
 	log.Infof(ctx, "initiating a split of this range at key %s [r%d]%s",
 		splitKey, rightDesc.RangeID, extra)
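Only manual splits opt into the delay. The other side of that choice is sketched below as a hypothetical in-package helper (not part of the diff shown above; the exact split-queue plumbing is an assumption): a caller that must not block simply passes false and keeps the pre-existing, non-delaying behavior.

// Hypothetical helper in package storage, for illustration only.
func (r *Replica) adminSplitWithoutDelay(
	ctx context.Context, args roachpb.AdminSplitRequest,
) (roachpb.AdminSplitResponse, error) {
	// Queue-driven splits pass false so they never wait on lagging followers.
	return r.adminSplitWithDescriptor(ctx, args, r.Desc(), false /* delayable */)
}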

pkg/storage/split_delay_helper.go (new file, +157)

// Copyright 2018 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

package storage

import (
	"context"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"go.etcd.io/etcd/raft"
)

type splitDelayHelperI interface {
	RaftStatus(context.Context) (roachpb.RangeID, *raft.Status)
	ProposeEmptyCommand(ctx context.Context)
	NumAttempts() int
	Sleep(context.Context) time.Duration
}

type splitDelayHelper Replica

func (sdh *splitDelayHelper) RaftStatus(ctx context.Context) (roachpb.RangeID, *raft.Status) {
	r := (*Replica)(sdh)
	r.mu.RLock()
	raftStatus := r.raftStatusRLocked()
	if raftStatus != nil {
		updateRaftProgressFromActivity(
			ctx, raftStatus.Progress, r.descRLocked().Replicas, r.mu.lastUpdateTimes, timeutil.Now(),
		)
	}
	r.mu.RUnlock()
	return r.RangeID, raftStatus
}

func (sdh *splitDelayHelper) ProposeEmptyCommand(ctx context.Context) {
	r := (*Replica)(sdh)
	r.raftMu.Lock()
	_ = r.withRaftGroup(true /* campaignOnWake */, func(rawNode *raft.RawNode) (bool, error) {
		// NB: intentionally ignore the error (which can be ErrProposalDropped
		// when there's an SST inflight).
		_ = rawNode.Propose(encodeRaftCommandV1(makeIDKey(), nil))
		// NB: we need to unquiesce as the group might be quiesced.
		return true /* unquiesceAndWakeLeader */, nil
	})
	r.raftMu.Unlock()
}

func (sdh *splitDelayHelper) NumAttempts() int {
	return (*Replica)(sdh).store.cfg.RaftDelaySplitToSuppressSnapshotTicks
}

func (sdh *splitDelayHelper) Sleep(ctx context.Context) time.Duration {
	tBegin := timeutil.Now()

	r := (*Replica)(sdh)
	select {
	case <-time.After(r.store.cfg.RaftTickInterval):
	case <-ctx.Done():
	}

	return timeutil.Since(tBegin)
}

func maybeDelaySplitToAvoidSnapshot(ctx context.Context, sdh splitDelayHelperI) string {
	// We have an "optimization" to avoid Raft snapshots by dropping some
	// outgoing MsgAppResp (see the _ assignment below) which takes effect for
	// RaftPostSplitSuppressSnapshotTicks ticks after an uninitialized replica
	// is created. This check can err, in which case the snapshot will be
	// delayed for that many ticks, and so we want to delay by at least as much
	// plus a bit of padding to give a snapshot a chance to catch the follower
	// up. If we run out of time, we'll resume the split no matter what.
	_ = (*Replica)(nil).maybeDropMsgAppResp // guru assignment
	maxDelaySplitToAvoidSnapshotTicks := sdh.NumAttempts()

	var slept time.Duration
	var extra string
	var succeeded bool
	for ticks := 0; ticks < maxDelaySplitToAvoidSnapshotTicks; ticks++ {
		succeeded = false
		extra = ""
		rangeID, raftStatus := sdh.RaftStatus(ctx)

		if raftStatus == nil {
			// Don't delay on followers (we don't know when to stop). This case
			// is hit rarely enough to not matter.
			extra += "; not Raft leader"
			succeeded = true
			break
		}

		done := true
		for replicaID, pr := range raftStatus.Progress {
			if replicaID == raftStatus.Lead {
				// TODO(tschottdorf): remove this once we have picked up
				// https://github.com/etcd-io/etcd/pull/10279
				continue
			}

			if pr.State != raft.ProgressStateReplicate {
				if !pr.RecentActive {
					if ticks == 0 {
						// Having set done = false, we make sure we're not exiting early.
						// This is important because we sometimes need that Raft proposal
						// below to make the followers active as there's no chatter on an
						// idle range. (Note that there's a theoretical race in which the
						// follower becomes inactive again during the sleep, but the
						// inactivity interval is much larger than a tick).
						//
						// Don't do this more than once though: if a follower is down,
						// we don't want to delay splits for it.
						done = false
					}
					extra += fmt.Sprintf("; r%d/%d inactive", rangeID, replicaID)
					continue
				}
				done = false
				extra += fmt.Sprintf("; replica r%d/%d not caught up: %+v", rangeID, replicaID, &pr)
			}
		}
		if done {
			succeeded = true
			break
		}
		// Propose an empty command which works around a Raft bug that can
		// leave a follower in ProgressStateProbe even though it has caught
		// up.
		sdh.ProposeEmptyCommand(ctx)
		slept += sdh.Sleep(ctx)

		if ctx.Err() != nil {
			return ""
		}
	}

	if slept != 0 {
		extra += fmt.Sprintf("; delayed split for %.1fs to avoid Raft snapshot", slept.Seconds())
		if !succeeded {
			extra += " (without success)"
		}
	}

	return extra
}
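For illustration, here is a minimal in-package test sketch (hypothetical; the commit's actual test file is not reproduced in this view) that stubs out splitDelayHelperI with canned Raft state and exercises maybeDelaySplitToAvoidSnapshot without a real Replica.

package storage

import (
	"context"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"go.etcd.io/etcd/raft"
)

// testSplitDelayHelper is a hypothetical stub satisfying splitDelayHelperI.
type testSplitDelayHelper struct {
	numAttempts int
	rangeID     roachpb.RangeID
	status      *raft.Status
	sleeps      int
}

func (h *testSplitDelayHelper) RaftStatus(context.Context) (roachpb.RangeID, *raft.Status) {
	return h.rangeID, h.status
}
func (h *testSplitDelayHelper) ProposeEmptyCommand(ctx context.Context) {}
func (h *testSplitDelayHelper) NumAttempts() int                        { return h.numAttempts }
func (h *testSplitDelayHelper) Sleep(context.Context) time.Duration {
	h.sleeps++
	return time.Second // pretend each attempt slept for one second
}

func TestMaybeDelaySplitSketch(t *testing.T) {
	h := &testSplitDelayHelper{
		numAttempts: 5,
		rangeID:     1,
		status: &raft.Status{
			SoftState: raft.SoftState{Lead: 1},
			Progress: map[uint64]raft.Progress{
				1: {State: raft.ProgressStateReplicate},
				// Follower 2 is active but stuck probing, so the helper keeps
				// sleeping until it runs out of attempts.
				2: {State: raft.ProgressStateProbe, RecentActive: true},
			},
		},
	}
	msg := maybeDelaySplitToAvoidSnapshot(context.Background(), h)
	if h.sleeps != 5 {
		t.Fatalf("expected 5 sleeps, got %d", h.sleeps)
	}
	t.Log(msg)
}

Because follower 2 never leaves ProgressStateProbe in this stub, the helper exhausts its attempts and the returned message ends in "delayed split for 5.0s to avoid Raft snapshot (without success)", matching the final branch of maybeDelaySplitToAvoidSnapshot above.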
