
Commit 3ba8f16

craig[bot] and tbg committed
Merge #32594

32594: storage: delay manual splits that would result in more snapshots r=petermattis a=tbg

This is unpolished, but I had used an earlier version of this with what at the time looked like success. At this point I suspect that this is the best way to suppress Raft snapshot growth in IMPORT/RESTORE. (Definitely needs tests.)

----

When a Range has followers that aren't replicating properly, splitting that range results in a right-hand side with followers in a similar state. Certain workloads (restore/import/presplit) can run large numbers of splits against a given range, and this can result in a large number of Raft snapshots that back up the Raft snapshot queue.

Ideally we'd never have any ranges that require a snapshot, but over the last weeks it has become clear that this is very difficult to achieve, since the knowledge required to decide whether a snapshot can efficiently be prevented is distributed across multiple nodes that don't share the necessary information. This is a bit of a nuclear option to prevent the likely last big culprit behind the large numbers of Raft snapshots in #31409.

With this change, we should expect to see Raft snapshots regularly when a split/scatter phase of an import/restore is active, but never large volumes at once.

Release note: None

Co-authored-by: Tobias Schottdorf <[email protected]>
2 parents 3e902ae + 2ab0f5b commit 3ba8f16

6 files changed (+364, -3 lines)

pkg/base/config.go (+25)

@@ -490,7 +490,20 @@ type RaftConfig struct {
 	// single raft.Ready operation.
 	RaftMaxInflightMsgs int

+	// When a Replica with an empty log (i.e. last index zero), drop rejecting
+	// MsgAppResp for the first few ticks to allow the split trigger to perform
+	// the split.
+	//
+	// -1 to disable.
 	RaftPostSplitSuppressSnapshotTicks int
+	// Splitting a range which has a replica needing a snapshot results in two
+	// ranges in that state. The delay configured here slows down splits when in
+	// that situation (limiting to those splits not run through the split
+	// queue). The most important target here are the splits performed by
+	// backup/restore.
+	//
+	// -1 to disable.
+	RaftDelaySplitToSuppressSnapshotTicks int
 }

 // SetDefaults initializes unset fields.
@@ -532,6 +545,18 @@ func (cfg *RaftConfig) SetDefaults() {
 	if cfg.RaftPostSplitSuppressSnapshotTicks == 0 {
 		cfg.RaftPostSplitSuppressSnapshotTicks = defaultRaftPostSplitSuppressSnapshotTicks
 	}
+
+	if cfg.RaftDelaySplitToSuppressSnapshotTicks == 0 {
+		// The Raft Ticks interval defaults to 200ms, and
+		// RaftPostSplitSuppressSnapshotTicks to 20 ticks. A total of 120 ticks is
+		// ~24s which experimentally has been shown to allow the small pile (<100)
+		// of Raft snapshots observed at the beginning of an import/restore to be
+		// resolved.
+		cfg.RaftDelaySplitToSuppressSnapshotTicks = 100
+		if cfg.RaftPostSplitSuppressSnapshotTicks > 0 {
+			cfg.RaftDelaySplitToSuppressSnapshotTicks += cfg.RaftPostSplitSuppressSnapshotTicks
+		}
+	}
 }

 // RaftElectionTimeout returns the raft election timeout, as computed from the
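For reference, here is a minimal standalone sketch (not part of this commit) of the defaulting arithmetic described in the comment above; the 200ms tick interval and the 20-tick post-split suppression default are taken from that comment.

package main

import (
	"fmt"
	"time"
)

func main() {
	const raftTickInterval = 200 * time.Millisecond // default tick interval per the comment above
	postSplitSuppressTicks := 20                    // defaultRaftPostSplitSuppressSnapshotTicks per the comment above
	delaySplitTicks := 100                          // base value assigned by SetDefaults
	if postSplitSuppressTicks > 0 {
		delaySplitTicks += postSplitSuppressTicks
	}
	// 120 ticks * 200ms = 24s, the maximum time a single split may be delayed.
	fmt.Println(time.Duration(delaySplitTicks) * raftTickInterval) // prints 24s
}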

pkg/storage/client_split_test.go (+3)

@@ -1417,6 +1417,9 @@ func runSetupSplitSnapshotRace(
 	sc.TestingKnobs.DisableAsyncIntentResolution = true
 	// Avoid fighting with the merge queue while trying to reproduce this race.
 	sc.TestingKnobs.DisableMergeQueue = true
+	// Disable the split delay mechanism, or it'll spend 10s going in circles.
+	// (We can't set it to zero as otherwise the default overrides us).
+	sc.RaftDelaySplitToSuppressSnapshotTicks = -1
 	mtc := &multiTestContext{storeConfig: &sc}
 	defer mtc.Stop()
 	mtc.Start(t, 6)
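The "-1 rather than 0" subtlety in this test follows from SetDefaults treating zero as "unset". A small hypothetical illustration of that behavior (assuming the 20-tick post-split suppression default mentioned in pkg/base/config.go above):

package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/base"
)

func main() {
	// Zero means "unset": SetDefaults replaces it with 100+20 = 120 ticks.
	cfg := base.RaftConfig{RaftDelaySplitToSuppressSnapshotTicks: 0}
	cfg.SetDefaults()
	fmt.Println(cfg.RaftDelaySplitToSuppressSnapshotTicks) // 120

	// -1 survives SetDefaults, keeping the split delay mechanism disabled.
	cfg = base.RaftConfig{RaftDelaySplitToSuppressSnapshotTicks: -1}
	cfg.SetDefaults()
	fmt.Println(cfg.RaftDelaySplitToSuppressSnapshotTicks) // -1
}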

pkg/storage/replica_command.go (+10, -3)

@@ -170,7 +170,7 @@ func (r *Replica) AdminSplit(
 		return roachpb.AdminSplitResponse{}, pErr
 	}

-	reply, lastErr = r.adminSplitWithDescriptor(ctx, args, r.Desc())
+	reply, lastErr = r.adminSplitWithDescriptor(ctx, args, r.Desc(), true /* delayable */)
 	// On seeing a ConditionFailedError or an AmbiguousResultError, retry
 	// the command with the updated descriptor.
 	if retry := causer.Visit(lastErr, func(err error) bool {
@@ -258,7 +258,10 @@ func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) string
 //
 // See the comment on splitTrigger for details on the complexities.
 func (r *Replica) adminSplitWithDescriptor(
-	ctx context.Context, args roachpb.AdminSplitRequest, desc *roachpb.RangeDescriptor,
+	ctx context.Context,
+	args roachpb.AdminSplitRequest,
+	desc *roachpb.RangeDescriptor,
+	delayable bool,
 ) (roachpb.AdminSplitResponse, error) {
 	var reply roachpb.AdminSplitResponse

@@ -337,7 +340,11 @@ func (r *Replica) adminSplitWithDescriptor(
 	}
 	leftDesc.EndKey = splitKey

-	extra := splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
+	var extra string
+	if delayable {
+		extra += maybeDelaySplitToAvoidSnapshot(ctx, (*splitDelayHelper)(r))
+	}
+	extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())

 	log.Infof(ctx, "initiating a split of this range at key %s [r%d]%s",
 		splitKey, rightDesc.RangeID, extra)

pkg/storage/split_delay_helper.go (new file, +157)

@@ -0,0 +1,157 @@
+// Copyright 2018 The Cockroach Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+
+package storage
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
+	"go.etcd.io/etcd/raft"
+)
+
+type splitDelayHelperI interface {
+	RaftStatus(context.Context) (roachpb.RangeID, *raft.Status)
+	ProposeEmptyCommand(ctx context.Context)
+	NumAttempts() int
+	Sleep(context.Context) time.Duration
+}
+
+type splitDelayHelper Replica
+
+func (sdh *splitDelayHelper) RaftStatus(ctx context.Context) (roachpb.RangeID, *raft.Status) {
+	r := (*Replica)(sdh)
+	r.mu.RLock()
+	raftStatus := r.raftStatusRLocked()
+	if raftStatus != nil {
+		updateRaftProgressFromActivity(
+			ctx, raftStatus.Progress, r.descRLocked().Replicas, r.mu.lastUpdateTimes, timeutil.Now(),
+		)
+	}
+	r.mu.RUnlock()
+	return r.RangeID, raftStatus
+}
+
+func (sdh *splitDelayHelper) ProposeEmptyCommand(ctx context.Context) {
+	r := (*Replica)(sdh)
+	r.raftMu.Lock()
+	_ = r.withRaftGroup(true /* campaignOnWake */, func(rawNode *raft.RawNode) (bool, error) {
+		// NB: intentionally ignore the error (which can be ErrProposalDropped
+		// when there's an SST inflight).
+		_ = rawNode.Propose(encodeRaftCommandV1(makeIDKey(), nil))
+		// NB: we need to unquiesce as the group might be quiesced.
+		return true /* unquiesceAndWakeLeader */, nil
+	})
+	r.raftMu.Unlock()
+}
+
+func (sdh *splitDelayHelper) NumAttempts() int {
+	return (*Replica)(sdh).store.cfg.RaftDelaySplitToSuppressSnapshotTicks
+}
+
+func (sdh *splitDelayHelper) Sleep(ctx context.Context) time.Duration {
+	tBegin := timeutil.Now()
+
+	r := (*Replica)(sdh)
+	select {
+	case <-time.After(r.store.cfg.RaftTickInterval):
+	case <-ctx.Done():
+	}
+
+	return timeutil.Since(tBegin)
+}
+
+func maybeDelaySplitToAvoidSnapshot(ctx context.Context, sdh splitDelayHelperI) string {
+	// We have an "optimization" to avoid Raft snapshots by dropping some
+	// outgoing MsgAppResp (see the _ assignment below) which takes effect for
+	// RaftPostSplitSuppressSnapshotTicks ticks after an uninitialized replica
+	// is created. This check can err, in which case the snapshot will be
+	// delayed for that many ticks, and so we want to delay by at least as much
+	// plus a bit of padding to give a snapshot a chance to catch the follower
+	// up. If we run out of time, we'll resume the split no matter what.
+	_ = (*Replica)(nil).maybeDropMsgAppResp // guru assignment
+	maxDelaySplitToAvoidSnapshotTicks := sdh.NumAttempts()
+
+	var slept time.Duration
+	var extra string
+	var succeeded bool
+	for ticks := 0; ticks < maxDelaySplitToAvoidSnapshotTicks; ticks++ {
+		succeeded = false
+		extra = ""
+		rangeID, raftStatus := sdh.RaftStatus(ctx)
+
+		if raftStatus == nil {
+			// Don't delay on followers (we don't know when to stop). This case
+			// is hit rarely enough to not matter.
+			extra += "; not Raft leader"
+			succeeded = true
+			break
+		}
+
+		done := true
+		for replicaID, pr := range raftStatus.Progress {
+			if replicaID == raftStatus.Lead {
+				// TODO(tschottdorf): remove this once we have picked up
+				// https://github.com/etcd-io/etcd/pull/10279
+				continue
+			}
+
+			if pr.State != raft.ProgressStateReplicate {
+				if !pr.RecentActive {
+					if ticks == 0 {
+						// Having set done = false, we make sure we're not exiting early.
+						// This is important because we sometimes need that Raft proposal
+						// below to make the followers active as there's no chatter on an
+						// idle range. (Note that there's a theoretical race in which the
+						// follower becomes inactive again during the sleep, but the
+						// inactivity interval is much larger than a tick).
+						//
+						// Don't do this more than once though: if a follower is down,
+						// we don't want to delay splits for it.
+						done = false
+					}
+					extra += fmt.Sprintf("; r%d/%d inactive", rangeID, replicaID)
+					continue
+				}
+				done = false
+				extra += fmt.Sprintf("; replica r%d/%d not caught up: %+v", rangeID, replicaID, &pr)
+			}
+		}
+		if done {
+			succeeded = true
+			break
+		}
+		// Propose an empty command which works around a Raft bug that can
+		// leave a follower in ProgressStateProbe even though it has caught
+		// up.
+		sdh.ProposeEmptyCommand(ctx)
+		slept += sdh.Sleep(ctx)
+
+		if ctx.Err() != nil {
+			return ""
+		}
+	}
+
+	if slept != 0 {
+		extra += fmt.Sprintf("; delayed split for %.1fs to avoid Raft snapshot", slept.Seconds())
+		if !succeeded {
+			extra += " (without success)"
+		}
+	}
+
+	return extra
+}
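Because maybeDelaySplitToAvoidSnapshot only talks to the splitDelayHelperI interface, it can be exercised without a real Replica. The following is a hand-written sketch of such a test; it is not the commit's actual test file (which is among the changed files not shown on this page), and the fake type and its field names are invented for illustration. It assumes the etcd raft package version imported above, where raft.Status embeds SoftState and exposes a Progress map.

package storage

import (
	"context"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"go.etcd.io/etcd/raft"
)

// fakeSplitDelayHelper simulates a leader whose single follower starts out
// probing and catches up after a configurable number of "ticks".
type fakeSplitDelayHelper struct {
	attempts      int
	catchUpAfter  int
	ticksObserved int
}

func (h *fakeSplitDelayHelper) RaftStatus(context.Context) (roachpb.RangeID, *raft.Status) {
	followerState := raft.ProgressStateProbe
	if h.ticksObserved >= h.catchUpAfter {
		followerState = raft.ProgressStateReplicate
	}
	st := &raft.Status{SoftState: raft.SoftState{Lead: 1}}
	st.Progress = map[uint64]raft.Progress{
		1: {State: raft.ProgressStateReplicate, RecentActive: true}, // the leader itself
		2: {State: followerState, RecentActive: true},               // the lagging follower
	}
	return 1, st
}

func (h *fakeSplitDelayHelper) ProposeEmptyCommand(context.Context) {}
func (h *fakeSplitDelayHelper) NumAttempts() int                    { return h.attempts }
func (h *fakeSplitDelayHelper) Sleep(context.Context) time.Duration {
	h.ticksObserved++
	return time.Second // pretend each wait took one second
}

func TestMaybeDelaySplitSketch(t *testing.T) {
	h := &fakeSplitDelayHelper{attempts: 5, catchUpAfter: 2}
	// The loop should wait two "ticks", see the follower replicate, and then
	// report the delay it incurred.
	msg := maybeDelaySplitToAvoidSnapshot(context.Background(), h)
	t.Log(msg) // e.g. "; delayed split for 2.0s to avoid Raft snapshot"
}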
