Commit 2ab0f5b
storage: delay manual splits that would result in more snapshots
When a Range has followers that aren't replicating properly, splitting that range results in a right-hand side with followers in a similar state. Certain workloads (restore/import/presplit) can run large numbers of splits against a given range, and this can result in a large number of Raft snapshots that backs up the Raft snapshot queue.

Ideally we'd never have any ranges that require a snapshot, but over the last weeks it has become clear that this is very difficult to achieve since the knowledge required to decide whether a snapshot can efficiently be prevented is distributed across multiple nodes that don't share the necessary information.

This commit is a bit of a nuclear option to prevent the likely last big culprit in large numbers of Raft snapshots in cockroachdb#31409. With this change, we should expect to see Raft snapshots regularly when a split/scatter phase of an import/restore is active, but never large volumes at once (except perhaps for an initial spike).

Splits are delayed only for manual splits. In particular, the split queue is not affected and could in theory cause Raft snapshots. However, at the present juncture, adding delays in the split queue could cause problems as well, so we retain the previous behavior there, which isn't known to have caused problems.

More follow-up work in the area of Raft snapshots will be necessary to add some more sanity to this area of the code.

Release note (bug fix): resolve a cluster degradation scenario that could occur during IMPORT/RESTORE operations, manifested through a high number of pending Raft snapshots.
1 parent 65bcf5b commit 2ab0f5b

6 files changed: +364 -3 lines

pkg/base/config.go (+25)

@@ -490,7 +490,20 @@ type RaftConfig struct {
 	// single raft.Ready operation.
 	RaftMaxInflightMsgs int
 
+	// When a Replica with an empty log (i.e. last index zero), drop rejecting
+	// MsgAppResp for the first few ticks to allow the split trigger to perform
+	// the split.
+	//
+	// -1 to disable.
 	RaftPostSplitSuppressSnapshotTicks int
+	// Splitting a range which has a replica needing a snapshot results in two
+	// ranges in that state. The delay configured here slows down splits when in
+	// that situation (limiting to those splits not run through the split
+	// queue). The most important target here are the splits performed by
+	// backup/restore.
+	//
+	// -1 to disable.
+	RaftDelaySplitToSuppressSnapshotTicks int
 }
 
 // SetDefaults initializes unset fields.
@@ -532,6 +545,18 @@ func (cfg *RaftConfig) SetDefaults() {
 	if cfg.RaftPostSplitSuppressSnapshotTicks == 0 {
 		cfg.RaftPostSplitSuppressSnapshotTicks = defaultRaftPostSplitSuppressSnapshotTicks
 	}
+
+	if cfg.RaftDelaySplitToSuppressSnapshotTicks == 0 {
+		// The Raft Ticks interval defaults to 200ms, and
+		// RaftPostSplitSuppressSnapshotTicks to 20 ticks. A total of 120 ticks is
+		// ~24s which experimentally has been shown to allow the small pile (<100)
+		// of Raft snapshots observed at the beginning of an import/restore to be
+		// resolved.
+		cfg.RaftDelaySplitToSuppressSnapshotTicks = 100
+		if cfg.RaftPostSplitSuppressSnapshotTicks > 0 {
+			cfg.RaftDelaySplitToSuppressSnapshotTicks += cfg.RaftPostSplitSuppressSnapshotTicks
+		}
+	}
 }
 
 // RaftElectionTimeout returns the raft election timeout, as computed from the
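As a quick sanity check of the arithmetic in the SetDefaults comment above, here is a minimal standalone Go sketch; the 200ms tick interval and the 20-tick RaftPostSplitSuppressSnapshotTicks default are the values cited in that comment, not constants defined in this sketch.

package main

import (
	"fmt"
	"time"
)

func main() {
	raftTickInterval := 200 * time.Millisecond // default cited in the comment above
	postSplitSuppressTicks := 20               // default RaftPostSplitSuppressSnapshotTicks cited above

	// Mirrors SetDefaults: 100 base ticks, plus the suppression window when enabled.
	delaySplitTicks := 100
	if postSplitSuppressTicks > 0 {
		delaySplitTicks += postSplitSuppressTicks
	}

	fmt.Println(time.Duration(delaySplitTicks) * raftTickInterval) // prints 24s
}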

pkg/storage/client_split_test.go (+3)

@@ -1417,6 +1417,9 @@ func runSetupSplitSnapshotRace(
 	sc.TestingKnobs.DisableAsyncIntentResolution = true
 	// Avoid fighting with the merge queue while trying to reproduce this race.
 	sc.TestingKnobs.DisableMergeQueue = true
+	// Disable the split delay mechanism, or it'll spend 10s going in circles.
+	// (We can't set it to zero as otherwise the default overrides us).
+	sc.RaftDelaySplitToSuppressSnapshotTicks = -1
 	mtc := &multiTestContext{storeConfig: &sc}
 	defer mtc.Stop()
 	mtc.Start(t, 6)

pkg/storage/replica_command.go (+10 -3)

@@ -170,7 +170,7 @@ func (r *Replica) AdminSplit(
 		return roachpb.AdminSplitResponse{}, pErr
 	}
 
-	reply, lastErr = r.adminSplitWithDescriptor(ctx, args, r.Desc())
+	reply, lastErr = r.adminSplitWithDescriptor(ctx, args, r.Desc(), true /* delayable */)
 	// On seeing a ConditionFailedError or an AmbiguousResultError, retry
 	// the command with the updated descriptor.
 	if retry := causer.Visit(lastErr, func(err error) bool {
@@ -258,7 +258,10 @@ func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) strin
 //
 // See the comment on splitTrigger for details on the complexities.
 func (r *Replica) adminSplitWithDescriptor(
-	ctx context.Context, args roachpb.AdminSplitRequest, desc *roachpb.RangeDescriptor,
+	ctx context.Context,
+	args roachpb.AdminSplitRequest,
+	desc *roachpb.RangeDescriptor,
+	delayable bool,
 ) (roachpb.AdminSplitResponse, error) {
 	var reply roachpb.AdminSplitResponse
 
@@ -337,7 +340,11 @@
 	}
 	leftDesc.EndKey = splitKey
 
-	extra := splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
+	var extra string
+	if delayable {
+		extra += maybeDelaySplitToAvoidSnapshot(ctx, (*splitDelayHelper)(r))
+	}
+	extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
 
 	log.Infof(ctx, "initiating a split of this range at key %s [r%d]%s",
 		splitKey, rightDesc.RangeID, extra)
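Only manual splits opt into the delay. The other side of that choice is sketched below as a hypothetical in-package helper (not part of the diff shown above; the exact split-queue plumbing is an assumption): a caller that must not block simply passes false and keeps the pre-existing, non-delaying behavior.

// Hypothetical helper in package storage, for illustration only.
func (r *Replica) adminSplitWithoutDelay(
	ctx context.Context, args roachpb.AdminSplitRequest,
) (roachpb.AdminSplitResponse, error) {
	// Queue-driven splits pass false so they never wait on lagging followers.
	return r.adminSplitWithDescriptor(ctx, args, r.Desc(), false /* delayable */)
}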

pkg/storage/split_delay_helper.go (new file, +157)

// Copyright 2018 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

package storage

import (
	"context"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"go.etcd.io/etcd/raft"
)

type splitDelayHelperI interface {
	RaftStatus(context.Context) (roachpb.RangeID, *raft.Status)
	ProposeEmptyCommand(ctx context.Context)
	NumAttempts() int
	Sleep(context.Context) time.Duration
}

type splitDelayHelper Replica

func (sdh *splitDelayHelper) RaftStatus(ctx context.Context) (roachpb.RangeID, *raft.Status) {
	r := (*Replica)(sdh)
	r.mu.RLock()
	raftStatus := r.raftStatusRLocked()
	if raftStatus != nil {
		updateRaftProgressFromActivity(
			ctx, raftStatus.Progress, r.descRLocked().Replicas, r.mu.lastUpdateTimes, timeutil.Now(),
		)
	}
	r.mu.RUnlock()
	return r.RangeID, raftStatus
}

func (sdh *splitDelayHelper) ProposeEmptyCommand(ctx context.Context) {
	r := (*Replica)(sdh)
	r.raftMu.Lock()
	_ = r.withRaftGroup(true /* campaignOnWake */, func(rawNode *raft.RawNode) (bool, error) {
		// NB: intentionally ignore the error (which can be ErrProposalDropped
		// when there's an SST inflight).
		_ = rawNode.Propose(encodeRaftCommandV1(makeIDKey(), nil))
		// NB: we need to unquiesce as the group might be quiesced.
		return true /* unquiesceAndWakeLeader */, nil
	})
	r.raftMu.Unlock()
}

func (sdh *splitDelayHelper) NumAttempts() int {
	return (*Replica)(sdh).store.cfg.RaftDelaySplitToSuppressSnapshotTicks
}

func (sdh *splitDelayHelper) Sleep(ctx context.Context) time.Duration {
	tBegin := timeutil.Now()

	r := (*Replica)(sdh)
	select {
	case <-time.After(r.store.cfg.RaftTickInterval):
	case <-ctx.Done():
	}

	return timeutil.Since(tBegin)
}

func maybeDelaySplitToAvoidSnapshot(ctx context.Context, sdh splitDelayHelperI) string {
	// We have an "optimization" to avoid Raft snapshots by dropping some
	// outgoing MsgAppResp (see the _ assignment below) which takes effect for
	// RaftPostSplitSuppressSnapshotTicks ticks after an uninitialized replica
	// is created. This check can err, in which case the snapshot will be
	// delayed for that many ticks, and so we want to delay by at least as much
	// plus a bit of padding to give a snapshot a chance to catch the follower
	// up. If we run out of time, we'll resume the split no matter what.
	_ = (*Replica)(nil).maybeDropMsgAppResp // guru assignment
	maxDelaySplitToAvoidSnapshotTicks := sdh.NumAttempts()

	var slept time.Duration
	var extra string
	var succeeded bool
	for ticks := 0; ticks < maxDelaySplitToAvoidSnapshotTicks; ticks++ {
		succeeded = false
		extra = ""
		rangeID, raftStatus := sdh.RaftStatus(ctx)

		if raftStatus == nil {
			// Don't delay on followers (we don't know when to stop). This case
			// is hit rarely enough to not matter.
			extra += "; not Raft leader"
			succeeded = true
			break
		}

		done := true
		for replicaID, pr := range raftStatus.Progress {
			if replicaID == raftStatus.Lead {
				// TODO(tschottdorf): remove this once we have picked up
				// https://github.com/etcd-io/etcd/pull/10279
				continue
			}

			if pr.State != raft.ProgressStateReplicate {
				if !pr.RecentActive {
					if ticks == 0 {
						// Having set done = false, we make sure we're not exiting early.
						// This is important because we sometimes need that Raft proposal
						// below to make the followers active as there's no chatter on an
						// idle range. (Note that there's a theoretical race in which the
						// follower becomes inactive again during the sleep, but the
						// inactivity interval is much larger than a tick).
						//
						// Don't do this more than once though: if a follower is down,
						// we don't want to delay splits for it.
						done = false
					}
					extra += fmt.Sprintf("; r%d/%d inactive", rangeID, replicaID)
					continue
				}
				done = false
				extra += fmt.Sprintf("; replica r%d/%d not caught up: %+v", rangeID, replicaID, &pr)
			}
		}
		if done {
			succeeded = true
			break
		}
		// Propose an empty command which works around a Raft bug that can
		// leave a follower in ProgressStateProbe even though it has caught
		// up.
		sdh.ProposeEmptyCommand(ctx)
		slept += sdh.Sleep(ctx)

		if ctx.Err() != nil {
			return ""
		}
	}

	if slept != 0 {
		extra += fmt.Sprintf("; delayed split for %.1fs to avoid Raft snapshot", slept.Seconds())
		if !succeeded {
			extra += " (without success)"
		}
	}

	return extra
}
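For illustration, here is a minimal in-package test sketch (hypothetical; the commit's actual test file is not reproduced in this view) that stubs out splitDelayHelperI with canned Raft state and exercises maybeDelaySplitToAvoidSnapshot without a real Replica.

package storage

import (
	"context"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"go.etcd.io/etcd/raft"
)

// testSplitDelayHelper is a hypothetical stub satisfying splitDelayHelperI.
type testSplitDelayHelper struct {
	numAttempts int
	rangeID     roachpb.RangeID
	status      *raft.Status
	sleeps      int
}

func (h *testSplitDelayHelper) RaftStatus(context.Context) (roachpb.RangeID, *raft.Status) {
	return h.rangeID, h.status
}
func (h *testSplitDelayHelper) ProposeEmptyCommand(ctx context.Context) {}
func (h *testSplitDelayHelper) NumAttempts() int                        { return h.numAttempts }
func (h *testSplitDelayHelper) Sleep(context.Context) time.Duration {
	h.sleeps++
	return time.Second // pretend each attempt slept for one second
}

func TestMaybeDelaySplitSketch(t *testing.T) {
	h := &testSplitDelayHelper{
		numAttempts: 5,
		rangeID:     1,
		status: &raft.Status{
			SoftState: raft.SoftState{Lead: 1},
			Progress: map[uint64]raft.Progress{
				1: {State: raft.ProgressStateReplicate},
				// Follower 2 is active but stuck probing, so the helper keeps
				// sleeping until it runs out of attempts.
				2: {State: raft.ProgressStateProbe, RecentActive: true},
			},
		},
	}
	msg := maybeDelaySplitToAvoidSnapshot(context.Background(), h)
	if h.sleeps != 5 {
		t.Fatalf("expected 5 sleeps, got %d", h.sleeps)
	}
	t.Log(msg)
}

Because follower 2 never leaves ProgressStateProbe in this stub, the helper exhausts its attempts and the returned message ends in "delayed split for 5.0s to avoid Raft snapshot (without success)", matching the final branch of maybeDelaySplitToAvoidSnapshot above.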
