Skip to content

Commit 21da18d

Browse files
committed
storage: delay split that would result in more snapshots
When a Range has followers that aren't replicating properly, splitting that range results in a right-hand side with followers in a similar state. Certain workloads (restore/import/presplit) can run large numbers of splits against a given range, and this can result in a large number of Raft snapshots that back up the Raft snapshot queue. Ideally we'd never have any ranges that require a snapshot, but over the last weeks it has become clear that this is very difficult to achieve since the knowledge required to decide whether a snapshot can efficiently be prevented is distributed across multiple nodes that don't share the necessary information. This is a bit of a nuclear option to prevent the likely last big culprit in large numbers of Raft snapshots in cockroachdb#31409. With this change, we should expect to see Raft snapshots regularly when a split/scatter phase of an import/restore is active, but never large volumes at once. Release note: None
1 parent 5f865f3 commit 21da18d

File tree

1 file changed

+69
-2
lines changed

1 file changed

+69
-2
lines changed

pkg/storage/replica_command.go

+69-2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import (
3939
"github.com/cockroachdb/cockroach/pkg/util/log"
4040
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
4141
"github.com/cockroachdb/cockroach/pkg/util/retry"
42+
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
4243
"github.com/pkg/errors"
4344
"go.etcd.io/etcd/raft"
4445
"go.etcd.io/etcd/raft/raftpb"
@@ -222,6 +223,71 @@ func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) strin
222223
return s
223224
}
224225

226+
func (r *Replica) maybeDelaySplitToAvoidSnapshot(ctx context.Context) string {
227+
maxDelaySplitToAvoidSnapshotTicks := 5 + r.store.cfg.RaftPostSplitSuppressSnapshotTicks
228+
229+
var extra string
230+
231+
tPreWait := timeutil.Now()
232+
tPostWait := tPreWait
233+
for ticks := 0; ticks < maxDelaySplitToAvoidSnapshotTicks; ticks++ {
234+
if ticks > 0 {
235+
tPostWait = time.Time{}
236+
}
237+
238+
r.mu.RLock()
239+
raftStatus := r.raftStatusRLocked()
240+
if raftStatus != nil {
241+
updateRaftProgressFromActivity(
242+
ctx, raftStatus.Progress, r.descRLocked().Replicas, r.mu.lastUpdateTimes, timeutil.Now(),
243+
)
244+
}
245+
r.mu.RUnlock()
246+
247+
if raftStatus == nil {
248+
// Don't delay followers artificially. This case is hit rarely
249+
// enough to not matter.
250+
break
251+
}
252+
253+
done := true
254+
for replicaID, pr := range raftStatus.Progress {
255+
if replicaID == raftStatus.Lead {
256+
// TODO(tschottdorf): remove this once we have picked up
257+
// https://github.com/etcd-io/etcd/pull/10279
258+
continue
259+
}
260+
261+
if !pr.RecentActive {
262+
continue
263+
}
264+
265+
if pr.State != raft.ProgressStateReplicate {
266+
if ticks == 0 {
267+
extra += fmt.Sprintf("delaying split; replica r%d/%d not caught up: %+v", r.RangeID, replicaID, pr)
268+
}
269+
done = false
270+
}
271+
}
272+
if done {
273+
break
274+
}
275+
select {
276+
case <-time.After(r.store.cfg.RaftTickInterval):
277+
case <-ctx.Done():
278+
return ""
279+
}
280+
}
281+
if tPostWait == (time.Time{}) {
282+
tPostWait = timeutil.Now()
283+
}
284+
285+
if elapsed := tPostWait.Sub(tPreWait); elapsed != 0 {
286+
extra += fmt.Sprintf("; delayed split for %s to avoid Raft snapshot", elapsed)
287+
}
288+
return extra
289+
}
290+
225291
// adminSplitWithDescriptor divides the range into into two ranges, using
226292
// either args.SplitKey (if provided) or an internally computed key that aims
227293
// to roughly equipartition the range by size. The split is done inside of a
@@ -320,10 +386,11 @@ func (r *Replica) adminSplitWithDescriptor(
320386
}
321387
leftDesc.EndKey = splitKey
322388

323-
extra := splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
389+
extra := r.maybeDelaySplitToAvoidSnapshot(ctx)
390+
extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
324391

325392
log.Infof(ctx, "initiating a split of this range at key %s [r%d]%s",
326-
splitKey, rightDesc.RangeID, extra)
393+
extra, splitKey, rightDesc.RangeID)
327394

328395
if err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *client.Txn) error {
329396
log.Event(ctx, "split closure begins")

0 commit comments

Comments
 (0)