
Commit 0e46399

storage: delay split that would result in more snapshots
When a Range has followers that aren't replicating properly, splitting that range results in a right-hand side with followers in a similar state. Certain workloads (restore/import/presplit) can run large numbers of splits against a given range, and this can result in a large number of Raft snapshots, backing up the Raft snapshot queue.

Ideally we'd never have any ranges that require a snapshot, but over the last weeks it has become clear that this is very difficult to achieve, since the knowledge required to decide whether a snapshot can efficiently be prevented is distributed across multiple nodes that don't share the necessary information.

This is a bit of a nuclear option to prevent the likely last big culprit behind the large numbers of Raft snapshots seen in cockroachdb#31409.

With this change, we should expect to see Raft snapshots regularly when the split/scatter phase of an import/restore is active, but never large volumes at once.

Release note: None
1 parent 5f865f3 commit 0e46399
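
The change below boils down to a bounded wait before the split is carried out: poll follower progress once per Raft tick, proceed early once every recently active follower is replicating, and give up after a fixed number of ticks so a stuck follower can't hold up splits forever. The following is a minimal, self-contained sketch of that pattern; the names waitForFollowers and caughtUp, and the concrete tick values, are illustrative stand-ins rather than CockroachDB APIs.

    package main

    import (
    	"context"
    	"fmt"
    	"time"
    )

    // waitForFollowers polls caughtUp once per tick, at most maxTicks times.
    // It returns early when caughtUp reports true or ctx is canceled, and it
    // reports how long it waited in total.
    func waitForFollowers(ctx context.Context, tick time.Duration, maxTicks int, caughtUp func() bool) time.Duration {
    	start := time.Now()
    	for i := 0; i < maxTicks; i++ {
    		if caughtUp() {
    			break
    		}
    		select {
    		case <-time.After(tick):
    		case <-ctx.Done():
    			return time.Since(start)
    		}
    	}
    	return time.Since(start)
    }

    func main() {
    	// Pretend the followers catch up after roughly 300ms.
    	ready := time.Now().Add(300 * time.Millisecond)
    	waited := waitForFollowers(context.Background(), 100*time.Millisecond, 25, func() bool {
    		return time.Now().After(ready)
    	})
    	fmt.Printf("delayed split for %s to avoid Raft snapshot\n", waited)
    }

The real implementation in the diff additionally inspects raft.Status.Progress to decide whether a follower counts as caught up, and annotates the split's log message with how long it waited.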

1 file changed (+73, -1)

pkg/storage/replica_command.go

@@ -39,6 +39,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
 	"github.com/cockroachdb/cockroach/pkg/util/retry"
+	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/pkg/errors"
 	"go.etcd.io/etcd/raft"
 	"go.etcd.io/etcd/raft/raftpb"
@@ -222,6 +223,76 @@ func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) string
 	return s
 }

+func (r *Replica) maybeDelaySplitToAvoidSnapshot(ctx context.Context) string {
+	// We have an "optimization" to avoid Raft snapshots by dropping some
+	// outgoing MsgAppResp (see the _ assignment below) which takes effect for
+	// RaftPostSplitSuppressSnapshotTicks ticks after an uninitialized replica
+	// is created. This check can err, in which case the snapshot will be
+	// delayed for that many ticks, and so we want to delay by at least as much
+	// plus a bit of padding to give a snapshot a chance to catch the follower
+	// up. If we run out of time, we'll resume the split no matter what.
+	_ = r.maybeDropMsgAppResp
+	maxDelaySplitToAvoidSnapshotTicks := 5 + r.store.cfg.RaftPostSplitSuppressSnapshotTicks
+
+	var extra string
+
+	tPreWait := timeutil.Now()
+	var waited bool
+	for ticks := 0; ticks < maxDelaySplitToAvoidSnapshotTicks; ticks++ {
+		if ticks == 1 {
+			waited = true
+		}
+
+		r.mu.RLock()
+		raftStatus := r.raftStatusRLocked()
+		if raftStatus != nil {
+			updateRaftProgressFromActivity(
+				ctx, raftStatus.Progress, r.descRLocked().Replicas, r.mu.lastUpdateTimes, timeutil.Now(),
+			)
+		}
+		r.mu.RUnlock()
+
+		if raftStatus == nil {
+			// Don't delay followers artificially. This case is hit rarely
+			// enough to not matter.
+			break
+		}
+
+		done := true
+		for replicaID, pr := range raftStatus.Progress {
+			if replicaID == raftStatus.Lead {
+				// TODO(tschottdorf): remove this once we have picked up
+				// https://github.com/etcd-io/etcd/pull/10279
+				continue
+			}
+
+			if !pr.RecentActive {
+				continue
+			}
+
+			if pr.State != raft.ProgressStateReplicate {
+				if ticks == 0 {
+					extra += fmt.Sprintf("delaying split; replica r%d/%d not caught up: %+v", r.RangeID, replicaID, pr)
+				}
+				done = false
+			}
+		}
+		if done {
+			break
+		}
+		select {
+		case <-time.After(r.store.cfg.RaftTickInterval):
+		case <-ctx.Done():
+			return ""
+		}
+	}
+
+	if elapsed := timeutil.Since(tPreWait); waited {
+		extra += fmt.Sprintf("; delayed split for %s to avoid Raft snapshot", elapsed)
+	}
+	return extra
+}
+
 // adminSplitWithDescriptor divides the range into two ranges, using
 // either args.SplitKey (if provided) or an internally computed key that aims
 // to roughly equipartition the range by size. The split is done inside of a
@@ -320,7 +391,8 @@ func (r *Replica) adminSplitWithDescriptor(
 	}
 	leftDesc.EndKey = splitKey

-	extra := splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
+	extra := r.maybeDelaySplitToAvoidSnapshot(ctx)
+	extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())

 	log.Infof(ctx, "initiating a split of this range at key %s [r%d]%s",
 		splitKey, rightDesc.RangeID, extra)
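
The delay is bounded by maxDelaySplitToAvoidSnapshotTicks, i.e. the MsgAppResp suppression window plus five ticks of padding, with each iteration waiting at most one RaftTickInterval. A back-of-the-envelope bound, using placeholder values (the 200ms tick and 20 suppression ticks are assumptions for illustration, not values asserted by this commit):

    package main

    import (
    	"fmt"
    	"time"
    )

    func main() {
    	// Placeholder configuration; the real values come from the store config.
    	const raftPostSplitSuppressSnapshotTicks = 20
    	raftTickInterval := 200 * time.Millisecond

    	// Mirrors maxDelaySplitToAvoidSnapshotTicks in the diff above:
    	// the suppression window plus a bit of padding.
    	maxTicks := 5 + raftPostSplitSuppressSnapshotTicks

    	// Worst case, every iteration waits a full tick before re-checking progress.
    	fmt.Println(time.Duration(maxTicks) * raftTickInterval) // prints 5s with these inputs
    }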
