
Commit af253b3

storage: delay split that would result in more snapshots
When a Range has followers that aren't replicating properly, splitting that range results in a right-hand side with followers in a similar state. Certain workloads (restore/import/presplit) can run large numbers of splits against a given range, and this can result in a large number of Raft snapshots that back up the Raft snapshot queue.

Ideally we'd never have any ranges that require a snapshot, but over the last few weeks it has become clear that this is very difficult to achieve, since the knowledge required to decide whether a snapshot can efficiently be prevented is distributed across multiple nodes that don't share the necessary information.

This is a bit of a nuclear option to prevent what is likely the last big culprit behind the large numbers of Raft snapshots in cockroachdb#31409. With this change, we should expect to see Raft snapshots regularly when a split/scatter phase of an import/restore is active, but never large volumes at once.

Release note: None
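The shape of the mitigation is easier to see in isolation. Below is a minimal, self-contained sketch of the bounded wait loop the commit adds: poll follower progress once per tick, proceed early as soon as every recently active follower is replicating normally, and give up after a fixed tick budget so a permanently lagging follower can never block the split forever. The followerState type, the poll callback, and the tick values here are simplified stand-ins for illustration, not the actual cockroach APIs; the real implementation is in the diff below.

package main

import (
	"context"
	"fmt"
	"time"
)

// followerState is a simplified stand-in for a Raft follower's progress
// (roughly: RecentActive plus whether it is in ProgressStateReplicate).
type followerState struct {
	replicaID    int
	recentActive bool
	replicating  bool
}

// delaySplitUntilFollowersCaughtUp waits, for up to maxTicks tick intervals,
// until every recently active follower is replicating normally. It returns a
// human-readable note describing the delay, or "" if no delay was needed.
// poll is a hypothetical stand-in for reading the live Raft status.
func delaySplitUntilFollowersCaughtUp(
	ctx context.Context,
	tick time.Duration,
	maxTicks int,
	poll func() []followerState,
) string {
	start := time.Now()
	waited := false
	succeeded := false

	for ticks := 0; ticks < maxTicks; ticks++ {
		if ticks == 1 {
			// We only report a delay if we actually slept at least once.
			waited = true
		}

		caughtUp := true
		for _, f := range poll() {
			if f.recentActive && !f.replicating {
				caughtUp = false
			}
		}
		if caughtUp {
			succeeded = true
			break
		}

		// Sleep for one tick, but bail out if the caller gives up.
		select {
		case <-time.After(tick):
		case <-ctx.Done():
			return ""
		}
	}

	if !waited {
		return ""
	}
	extra := fmt.Sprintf("; delayed split for %s to avoid Raft snapshot", time.Since(start))
	if !succeeded {
		// The tick budget ran out; the split proceeds regardless.
		extra += " (without success)"
	}
	return extra
}

func main() {
	// Toy example: one follower catches up after a few polls.
	polls := 0
	note := delaySplitUntilFollowersCaughtUp(context.Background(), time.Millisecond, 10, func() []followerState {
		polls++
		return []followerState{{replicaID: 2, recentActive: true, replicating: polls > 3}}
	})
	fmt.Println(note)
}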
1 parent 027f231 commit af253b3

File tree

1 file changed (+82 −1 lines)

pkg/storage/replica_command.go

+82-1
@@ -39,6 +39,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
 	"github.com/cockroachdb/cockroach/pkg/util/retry"
+	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/pkg/errors"
 	"go.etcd.io/etcd/raft"
 	"go.etcd.io/etcd/raft/raftpb"
@@ -222,6 +223,85 @@ func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) string
 	return s
 }
 
+func (r *Replica) maybeDelaySplitToAvoidSnapshot(ctx context.Context) string {
+	// We have an "optimization" to avoid Raft snapshots by dropping some
+	// outgoing MsgAppResp (see the _ assignment below) which takes effect for
+	// RaftPostSplitSuppressSnapshotTicks ticks after an uninitialized replica
+	// is created. This check can err, in which case the snapshot will be
+	// delayed for that many ticks, and so we want to delay by at least as much
+	// plus a bit of padding to give a snapshot a chance to catch the follower
+	// up. If we run out of time, we'll resume the split no matter what.
+	_ = r.maybeDropMsgAppResp
+	maxDelaySplitToAvoidSnapshotTicks := 50 + r.store.cfg.RaftPostSplitSuppressSnapshotTicks
+
+	var extra string
+
+	tPreWait := timeutil.Now()
+	var waited bool
+	var succeeded bool
+	for ticks := 0; ticks < maxDelaySplitToAvoidSnapshotTicks; ticks++ {
+		succeeded = false
+		extra = ""
+
+		if ticks == 1 {
+			waited = true
+		}
+
+		r.mu.RLock()
+		raftStatus := r.raftStatusRLocked()
+		if raftStatus != nil {
+			updateRaftProgressFromActivity(
+				ctx, raftStatus.Progress, r.descRLocked().Replicas, r.mu.lastUpdateTimes, timeutil.Now(),
+			)
+		}
+		r.mu.RUnlock()
+
+		if raftStatus == nil {
+			// Don't delay followers artificially. This case is hit rarely
+			// enough to not matter.
+			break
+		}
+
+		done := true
+		for replicaID, pr := range raftStatus.Progress {
+			if replicaID == raftStatus.Lead {
+				// TODO(tschottdorf): remove this once we have picked up
+				// https://github.com/etcd-io/etcd/pull/10279
+				continue
+			}
+
+			if !pr.RecentActive {
+				continue
+			}
+
+			if pr.State != raft.ProgressStateReplicate {
+				extra += fmt.Sprintf("replica r%d/%d not caught up: %+v", r.RangeID, replicaID, pr)
+				done = false
+			}
+		}
+		if done {
+			succeeded = true
+			break
+		}
+		select {
+		case <-time.After(r.store.cfg.RaftTickInterval):
+		case <-ctx.Done():
+			return ""
+		}
+	}
+
+	if !waited {
+		return ""
+	}
+
+	elapsed := timeutil.Since(tPreWait)
+	extra += fmt.Sprintf("; delayed split for %s to avoid Raft snapshot", elapsed)
+	if !succeeded {
+		extra += " (without success)"
+	}
+	return extra
+}
+
 // adminSplitWithDescriptor divides the range into into two ranges, using
 // either args.SplitKey (if provided) or an internally computed key that aims
 // to roughly equipartition the range by size. The split is done inside of a
@@ -320,7 +400,8 @@ func (r *Replica) adminSplitWithDescriptor(
 	}
 	leftDesc.EndKey = splitKey
 
-	extra := splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
+	extra := r.maybeDelaySplitToAvoidSnapshot(ctx)
+	extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
 
 	log.Infof(ctx, "initiating a split of this range at key %s [r%d]%s",
 		splitKey, rightDesc.RangeID, extra)
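Operationally, a delayed split shows up only in this existing range log line, since the delay note returned by maybeDelaySplitToAvoidSnapshot is appended via the %s verb. With a hypothetical split key, range ID, and duration it would read roughly like:

initiating a split of this range at key /Table/53/1/432 [r124]; delayed split for 3.1s to avoid Raft snapshot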
