
Commit 3ba8f16

craig[bot] and tbg committed
Merge #32594

32594: storage: delay manual splits that would result in more snapshots r=petermattis a=tbg

This is unpolished, but I had used an earlier version of this with what at the time looked like success. At this point I suspect that this is the best way to suppress Raft snapshot growth in IMPORT/RESTORE. (Definitely needs tests.)

----

When a Range has followers that aren't replicating properly, splitting that range results in a right-hand side with followers in a similar state. Certain workloads (restore/import/presplit) can run large numbers of splits against a given range, and this can result in a large number of Raft snapshots that back up the Raft snapshot queue.

Ideally we'd never have any ranges that require a snapshot, but over the last weeks it has become clear that this is very difficult to achieve, since the knowledge required to decide whether a snapshot can efficiently be prevented is distributed across multiple nodes that don't share the necessary information. This is a bit of a nuclear option to prevent the likely last big culprit behind the large numbers of Raft snapshots in #31409.

With this change, we should expect to see Raft snapshots regularly when a split/scatter phase of an import/restore is active, but never large volumes at once.

Release note: None

Co-authored-by: Tobias Schottdorf <[email protected]>
2 parents 3e902ae + 2ab0f5b commit 3ba8f16

6 files changed (+364, -3 lines)

pkg/base/config.go (+25)

@@ -490,7 +490,20 @@ type RaftConfig struct {
 	// single raft.Ready operation.
 	RaftMaxInflightMsgs int

+	// When a Replica with an empty log (i.e. last index zero), drop rejecting
+	// MsgAppResp for the first few ticks to allow the split trigger to perform
+	// the split.
+	//
+	// -1 to disable.
 	RaftPostSplitSuppressSnapshotTicks int
+	// Splitting a range which has a replica needing a snapshot results in two
+	// ranges in that state. The delay configured here slows down splits when in
+	// that situation (limiting to those splits not run through the split
+	// queue). The most important target here are the splits performed by
+	// backup/restore.
+	//
+	// -1 to disable.
+	RaftDelaySplitToSuppressSnapshotTicks int
 }

 // SetDefaults initializes unset fields.
@@ -532,6 +545,18 @@ func (cfg *RaftConfig) SetDefaults() {
 	if cfg.RaftPostSplitSuppressSnapshotTicks == 0 {
 		cfg.RaftPostSplitSuppressSnapshotTicks = defaultRaftPostSplitSuppressSnapshotTicks
 	}
+
+	if cfg.RaftDelaySplitToSuppressSnapshotTicks == 0 {
+		// The Raft Ticks interval defaults to 200ms, and
+		// RaftPostSplitSuppressSnapshotTicks to 20 ticks. A total of 120 ticks is
+		// ~24s which experimentally has been shown to allow the small pile (<100)
+		// of Raft snapshots observed at the beginning of an import/restore to be
+		// resolved.
+		cfg.RaftDelaySplitToSuppressSnapshotTicks = 100
+		if cfg.RaftPostSplitSuppressSnapshotTicks > 0 {
+			cfg.RaftDelaySplitToSuppressSnapshotTicks += cfg.RaftPostSplitSuppressSnapshotTicks
+		}
+	}
 }

 // RaftElectionTimeout returns the raft election timeout, as computed from the
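For reference, here is a minimal standalone sketch (not part of this commit) of the defaulting arithmetic described in the comment above; the 200ms tick interval and the 20-tick post-split suppression default are taken from that comment.

package main

import (
	"fmt"
	"time"
)

func main() {
	const raftTickInterval = 200 * time.Millisecond // default tick interval per the comment above
	postSplitSuppressTicks := 20                    // defaultRaftPostSplitSuppressSnapshotTicks per the comment above
	delaySplitTicks := 100                          // base value assigned by SetDefaults
	if postSplitSuppressTicks > 0 {
		delaySplitTicks += postSplitSuppressTicks
	}
	// 120 ticks * 200ms = 24s, the maximum time a single split may be delayed.
	fmt.Println(time.Duration(delaySplitTicks) * raftTickInterval) // prints 24s
}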

pkg/storage/client_split_test.go (+3)

@@ -1417,6 +1417,9 @@ func runSetupSplitSnapshotRace(
 	sc.TestingKnobs.DisableAsyncIntentResolution = true
 	// Avoid fighting with the merge queue while trying to reproduce this race.
 	sc.TestingKnobs.DisableMergeQueue = true
+	// Disable the split delay mechanism, or it'll spend 10s going in circles.
+	// (We can't set it to zero as otherwise the default overrides us).
+	sc.RaftDelaySplitToSuppressSnapshotTicks = -1
 	mtc := &multiTestContext{storeConfig: &sc}
 	defer mtc.Stop()
 	mtc.Start(t, 6)
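The "-1 rather than 0" subtlety in this test follows from SetDefaults treating zero as "unset". A small hypothetical illustration of that behavior (assuming the 20-tick post-split suppression default mentioned in pkg/base/config.go above):

package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/base"
)

func main() {
	// Zero means "unset": SetDefaults replaces it with 100+20 = 120 ticks.
	cfg := base.RaftConfig{RaftDelaySplitToSuppressSnapshotTicks: 0}
	cfg.SetDefaults()
	fmt.Println(cfg.RaftDelaySplitToSuppressSnapshotTicks) // 120

	// -1 survives SetDefaults, keeping the split delay mechanism disabled.
	cfg = base.RaftConfig{RaftDelaySplitToSuppressSnapshotTicks: -1}
	cfg.SetDefaults()
	fmt.Println(cfg.RaftDelaySplitToSuppressSnapshotTicks) // -1
}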

pkg/storage/replica_command.go (+10, -3)

@@ -170,7 +170,7 @@ func (r *Replica) AdminSplit(
 		return roachpb.AdminSplitResponse{}, pErr
 	}

-	reply, lastErr = r.adminSplitWithDescriptor(ctx, args, r.Desc())
+	reply, lastErr = r.adminSplitWithDescriptor(ctx, args, r.Desc(), true /* delayable */)
 	// On seeing a ConditionFailedError or an AmbiguousResultError, retry
 	// the command with the updated descriptor.
 	if retry := causer.Visit(lastErr, func(err error) bool {
@@ -258,7 +258,10 @@ func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) string
 //
 // See the comment on splitTrigger for details on the complexities.
 func (r *Replica) adminSplitWithDescriptor(
-	ctx context.Context, args roachpb.AdminSplitRequest, desc *roachpb.RangeDescriptor,
+	ctx context.Context,
+	args roachpb.AdminSplitRequest,
+	desc *roachpb.RangeDescriptor,
+	delayable bool,
 ) (roachpb.AdminSplitResponse, error) {
 	var reply roachpb.AdminSplitResponse

@@ -337,7 +340,11 @@ func (r *Replica) adminSplitWithDescriptor(
 	}
 	leftDesc.EndKey = splitKey

-	extra := splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
+	var extra string
+	if delayable {
+		extra += maybeDelaySplitToAvoidSnapshot(ctx, (*splitDelayHelper)(r))
+	}
+	extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())

 	log.Infof(ctx, "initiating a split of this range at key %s [r%d]%s",
 		splitKey, rightDesc.RangeID, extra)

pkg/storage/split_delay_helper.go (new file, +157)

@@ -0,0 +1,157 @@
+// Copyright 2018 The Cockroach Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+
+package storage
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
+	"go.etcd.io/etcd/raft"
+)
+
+type splitDelayHelperI interface {
+	RaftStatus(context.Context) (roachpb.RangeID, *raft.Status)
+	ProposeEmptyCommand(ctx context.Context)
+	NumAttempts() int
+	Sleep(context.Context) time.Duration
+}
+
+type splitDelayHelper Replica
+
+func (sdh *splitDelayHelper) RaftStatus(ctx context.Context) (roachpb.RangeID, *raft.Status) {
+	r := (*Replica)(sdh)
+	r.mu.RLock()
+	raftStatus := r.raftStatusRLocked()
+	if raftStatus != nil {
+		updateRaftProgressFromActivity(
+			ctx, raftStatus.Progress, r.descRLocked().Replicas, r.mu.lastUpdateTimes, timeutil.Now(),
+		)
+	}
+	r.mu.RUnlock()
+	return r.RangeID, raftStatus
+}
+
+func (sdh *splitDelayHelper) ProposeEmptyCommand(ctx context.Context) {
+	r := (*Replica)(sdh)
+	r.raftMu.Lock()
+	_ = r.withRaftGroup(true /* campaignOnWake */, func(rawNode *raft.RawNode) (bool, error) {
+		// NB: intentionally ignore the error (which can be ErrProposalDropped
+		// when there's an SST inflight).
+		_ = rawNode.Propose(encodeRaftCommandV1(makeIDKey(), nil))
+		// NB: we need to unquiesce as the group might be quiesced.
+		return true /* unquiesceAndWakeLeader */, nil
+	})
+	r.raftMu.Unlock()
+}
+
+func (sdh *splitDelayHelper) NumAttempts() int {
+	return (*Replica)(sdh).store.cfg.RaftDelaySplitToSuppressSnapshotTicks
+}
+
+func (sdh *splitDelayHelper) Sleep(ctx context.Context) time.Duration {
+	tBegin := timeutil.Now()
+
+	r := (*Replica)(sdh)
+	select {
+	case <-time.After(r.store.cfg.RaftTickInterval):
+	case <-ctx.Done():
+	}
+
+	return timeutil.Since(tBegin)
+}
+
+func maybeDelaySplitToAvoidSnapshot(ctx context.Context, sdh splitDelayHelperI) string {
+	// We have an "optimization" to avoid Raft snapshots by dropping some
+	// outgoing MsgAppResp (see the _ assignment below) which takes effect for
+	// RaftPostSplitSuppressSnapshotTicks ticks after an uninitialized replica
+	// is created. This check can err, in which case the snapshot will be
+	// delayed for that many ticks, and so we want to delay by at least as much
+	// plus a bit of padding to give a snapshot a chance to catch the follower
+	// up. If we run out of time, we'll resume the split no matter what.
+	_ = (*Replica)(nil).maybeDropMsgAppResp // guru assignment
+	maxDelaySplitToAvoidSnapshotTicks := sdh.NumAttempts()
+
+	var slept time.Duration
+	var extra string
+	var succeeded bool
+	for ticks := 0; ticks < maxDelaySplitToAvoidSnapshotTicks; ticks++ {
+		succeeded = false
+		extra = ""
+		rangeID, raftStatus := sdh.RaftStatus(ctx)
+
+		if raftStatus == nil {
+			// Don't delay on followers (we don't know when to stop). This case
+			// is hit rarely enough to not matter.
+			extra += "; not Raft leader"
+			succeeded = true
+			break
+		}
+
+		done := true
+		for replicaID, pr := range raftStatus.Progress {
+			if replicaID == raftStatus.Lead {
+				// TODO(tschottdorf): remove this once we have picked up
+				// https://github.com/etcd-io/etcd/pull/10279
+				continue
+			}
+
+			if pr.State != raft.ProgressStateReplicate {
+				if !pr.RecentActive {
+					if ticks == 0 {
+						// Having set done = false, we make sure we're not exiting early.
+						// This is important because we sometimes need that Raft proposal
+						// below to make the followers active as there's no chatter on an
+						// idle range. (Note that there's a theoretical race in which the
+						// follower becomes inactive again during the sleep, but the
+						// inactivity interval is much larger than a tick).
+						//
+						// Don't do this more than once though: if a follower is down,
+						// we don't want to delay splits for it.
+						done = false
+					}
+					extra += fmt.Sprintf("; r%d/%d inactive", rangeID, replicaID)
+					continue
+				}
+				done = false
+				extra += fmt.Sprintf("; replica r%d/%d not caught up: %+v", rangeID, replicaID, &pr)
+			}
+		}
+		if done {
+			succeeded = true
+			break
+		}
+		// Propose an empty command which works around a Raft bug that can
+		// leave a follower in ProgressStateProbe even though it has caught
+		// up.
+		sdh.ProposeEmptyCommand(ctx)
+		slept += sdh.Sleep(ctx)
+
+		if ctx.Err() != nil {
+			return ""
+		}
+	}
+
+	if slept != 0 {
+		extra += fmt.Sprintf("; delayed split for %.1fs to avoid Raft snapshot", slept.Seconds())
+		if !succeeded {
+			extra += " (without success)"
+		}
+	}
+
+	return extra
+}
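Because maybeDelaySplitToAvoidSnapshot only talks to the splitDelayHelperI interface, it can be exercised without a real Replica. The following is a hand-written sketch of such a test; it is not the commit's actual test file (which is among the changed files not shown on this page), and the fake type and its field names are invented for illustration. It assumes the etcd raft package version imported above, where raft.Status embeds SoftState and exposes a Progress map.

package storage

import (
	"context"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"go.etcd.io/etcd/raft"
)

// fakeSplitDelayHelper simulates a leader whose single follower starts out
// probing and catches up after a configurable number of "ticks".
type fakeSplitDelayHelper struct {
	attempts      int
	catchUpAfter  int
	ticksObserved int
}

func (h *fakeSplitDelayHelper) RaftStatus(context.Context) (roachpb.RangeID, *raft.Status) {
	followerState := raft.ProgressStateProbe
	if h.ticksObserved >= h.catchUpAfter {
		followerState = raft.ProgressStateReplicate
	}
	st := &raft.Status{SoftState: raft.SoftState{Lead: 1}}
	st.Progress = map[uint64]raft.Progress{
		1: {State: raft.ProgressStateReplicate, RecentActive: true}, // the leader itself
		2: {State: followerState, RecentActive: true},               // the lagging follower
	}
	return 1, st
}

func (h *fakeSplitDelayHelper) ProposeEmptyCommand(context.Context) {}
func (h *fakeSplitDelayHelper) NumAttempts() int                    { return h.attempts }
func (h *fakeSplitDelayHelper) Sleep(context.Context) time.Duration {
	h.ticksObserved++
	return time.Second // pretend each wait took one second
}

func TestMaybeDelaySplitSketch(t *testing.T) {
	h := &fakeSplitDelayHelper{attempts: 5, catchUpAfter: 2}
	// The loop should wait two "ticks", see the follower replicate, and then
	// report the delay it incurred.
	msg := maybeDelaySplitToAvoidSnapshot(context.Background(), h)
	t.Log(msg) // e.g. "; delayed split for 2.0s to avoid Raft snapshot"
}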
