
Commit af253b3

storage: delay split that would result in more snapshots
When a Range has followers that aren't replicating properly, splitting that range results in a right-hand side with followers in a similar state. Certain workloads (restore/import/presplit) can run large numbers of splits against a given range, and this can result in a large number of Raft snapshots that back up the Raft snapshot queue.

Ideally we'd never have any ranges that require a snapshot, but over the last few weeks it has become clear that this is very difficult to achieve, since the knowledge required to decide whether a snapshot can efficiently be prevented is distributed across multiple nodes that don't share the necessary information.

This is a bit of a nuclear option to prevent what is likely the last big culprit behind the large numbers of Raft snapshots in cockroachdb#31409. With this change, we should expect to see Raft snapshots regularly when a split/scatter phase of an import/restore is active, but never large volumes at once.

Release note: None
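The shape of the mitigation is easier to see in isolation. Below is a minimal, self-contained sketch of the bounded wait loop the commit adds: poll follower progress once per tick, proceed early as soon as every recently active follower is replicating normally, and give up after a fixed tick budget so a permanently lagging follower can never block the split forever. The followerState type, the poll callback, and the tick values here are simplified stand-ins for illustration, not the actual cockroach APIs; the real implementation is in the diff below.

package main

import (
	"context"
	"fmt"
	"time"
)

// followerState is a simplified stand-in for a Raft follower's progress
// (roughly: RecentActive plus whether it is in ProgressStateReplicate).
type followerState struct {
	replicaID    int
	recentActive bool
	replicating  bool
}

// delaySplitUntilFollowersCaughtUp waits, for up to maxTicks tick intervals,
// until every recently active follower is replicating normally. It returns a
// human-readable note describing the delay, or "" if no delay was needed.
// poll is a hypothetical stand-in for reading the live Raft status.
func delaySplitUntilFollowersCaughtUp(
	ctx context.Context,
	tick time.Duration,
	maxTicks int,
	poll func() []followerState,
) string {
	start := time.Now()
	waited := false
	succeeded := false

	for ticks := 0; ticks < maxTicks; ticks++ {
		if ticks == 1 {
			// We only report a delay if we actually slept at least once.
			waited = true
		}

		caughtUp := true
		for _, f := range poll() {
			if f.recentActive && !f.replicating {
				caughtUp = false
			}
		}
		if caughtUp {
			succeeded = true
			break
		}

		// Sleep for one tick, but bail out if the caller gives up.
		select {
		case <-time.After(tick):
		case <-ctx.Done():
			return ""
		}
	}

	if !waited {
		return ""
	}
	extra := fmt.Sprintf("; delayed split for %s to avoid Raft snapshot", time.Since(start))
	if !succeeded {
		// The tick budget ran out; the split proceeds regardless.
		extra += " (without success)"
	}
	return extra
}

func main() {
	// Toy example: one follower catches up after a few polls.
	polls := 0
	note := delaySplitUntilFollowersCaughtUp(context.Background(), time.Millisecond, 10, func() []followerState {
		polls++
		return []followerState{{replicaID: 2, recentActive: true, replicating: polls > 3}}
	})
	fmt.Println(note)
}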
1 parent 027f231 commit af253b3

File tree

1 file changed (+82 −1 lines)

pkg/storage/replica_command.go

+82-1
@@ -39,6 +39,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
 	"github.com/cockroachdb/cockroach/pkg/util/retry"
+	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
 	"github.com/pkg/errors"
 	"go.etcd.io/etcd/raft"
 	"go.etcd.io/etcd/raft/raftpb"
@@ -222,6 +223,85 @@ func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) string
 	return s
 }
 
+func (r *Replica) maybeDelaySplitToAvoidSnapshot(ctx context.Context) string {
+	// We have an "optimization" to avoid Raft snapshots by dropping some
+	// outgoing MsgAppResp (see the _ assignment below) which takes effect for
+	// RaftPostSplitSuppressSnapshotTicks ticks after an uninitialized replica
+	// is created. This check can err, in which case the snapshot will be
+	// delayed for that many ticks, and so we want to delay by at least as much
+	// plus a bit of padding to give a snapshot a chance to catch the follower
+	// up. If we run out of time, we'll resume the split no matter what.
+	_ = r.maybeDropMsgAppResp
+	maxDelaySplitToAvoidSnapshotTicks := 50 + r.store.cfg.RaftPostSplitSuppressSnapshotTicks
+
+	var extra string
+
+	tPreWait := timeutil.Now()
+	var waited bool
+	var succeeded bool
+	for ticks := 0; ticks < maxDelaySplitToAvoidSnapshotTicks; ticks++ {
+		succeeded = false
+		extra = ""
+
+		if ticks == 1 {
+			waited = true
+		}
+
+		r.mu.RLock()
+		raftStatus := r.raftStatusRLocked()
+		if raftStatus != nil {
+			updateRaftProgressFromActivity(
+				ctx, raftStatus.Progress, r.descRLocked().Replicas, r.mu.lastUpdateTimes, timeutil.Now(),
+			)
+		}
+		r.mu.RUnlock()
+
+		if raftStatus == nil {
+			// Don't delay followers artificially. This case is hit rarely
+			// enough to not matter.
+			break
+		}
+
+		done := true
+		for replicaID, pr := range raftStatus.Progress {
+			if replicaID == raftStatus.Lead {
+				// TODO(tschottdorf): remove this once we have picked up
+				// https://github.com/etcd-io/etcd/pull/10279
+				continue
+			}
+
+			if !pr.RecentActive {
+				continue
+			}
+
+			if pr.State != raft.ProgressStateReplicate {
+				extra += fmt.Sprintf("replica r%d/%d not caught up: %+v", r.RangeID, replicaID, pr)
+				done = false
+			}
+		}
+		if done {
+			succeeded = true
+			break
+		}
+		select {
+		case <-time.After(r.store.cfg.RaftTickInterval):
+		case <-ctx.Done():
+			return ""
+		}
+	}
+
+	if !waited {
+		return ""
+	}
+
+	elapsed := timeutil.Since(tPreWait)
+	extra += fmt.Sprintf("; delayed split for %s to avoid Raft snapshot", elapsed)
+	if !succeeded {
+		extra += " (without success)"
+	}
+	return extra
+}
+
 // adminSplitWithDescriptor divides the range into into two ranges, using
 // either args.SplitKey (if provided) or an internally computed key that aims
 // to roughly equipartition the range by size. The split is done inside of a
@@ -320,7 +400,8 @@ func (r *Replica) adminSplitWithDescriptor(
 	}
 	leftDesc.EndKey = splitKey
 
-	extra := splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
+	extra := r.maybeDelaySplitToAvoidSnapshot(ctx)
+	extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
 
 	log.Infof(ctx, "initiating a split of this range at key %s [r%d]%s",
 		splitKey, rightDesc.RangeID, extra)
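Operationally, a delayed split shows up only in this existing range log line, since the delay note returned by maybeDelaySplitToAvoidSnapshot is appended via the %s verb. With a hypothetical split key, range ID, and duration it would read roughly like:

initiating a split of this range at key /Table/53/1/432 [r124]; delayed split for 3.1s to avoid Raft snapshot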
