From 6bf70dc460357f288d41cccda6c02ff3e8bb8606 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Fri, 19 Nov 2021 18:22:05 +0000
Subject: [PATCH 1/2] kvserver: increase Migrate application timeout to 1 minute

This increases the timeout when waiting for application of a `Migrate`
command on all range replicas to 1 minute, up from 5 seconds. It also adds a
cluster setting `kv.migration.migrate_application.timeout` to control this.

When encountering a range that's e.g. undergoing rebalancing, it can take a
long time for a learner replica to receive a snapshot and respond to this
request, which would cause the timeout to trigger. This is especially likely
in clusters with many ranges and frequent rebalancing activity.

Release note (bug fix): The timeout when checking for Raft application of
upgrade migrations has been increased from 5 seconds to 1 minute, and is now
controllable via the cluster setting
`kv.migration.migrate_application.timeout`. This makes upgrade migrations
much less likely to fail in clusters with ongoing rebalancing activity.
---
 pkg/kv/kvserver/replica_command.go | 47 +++++++++++++++++-------------
 pkg/kv/kvserver/replica_write.go   | 28 ++++++++++++++----
 2 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go
index 142bde8cf25f..014b672f5b9f 100644
--- a/pkg/kv/kvserver/replica_command.go
+++ b/pkg/kv/kvserver/replica_command.go
@@ -44,6 +44,11 @@ import (
 	"go.etcd.io/etcd/raft/v3/tracker"
 )
 
+// mergeApplicationTimeout is the timeout when waiting for a merge command to be
+// applied on all range replicas. There doesn't appear to be any strong reason
+// why this value was chosen in particular, but it seems to work.
+const mergeApplicationTimeout = 5 * time.Second
+
 // AdminSplit divides the range into into two ranges using args.SplitKey.
 func (r *Replica) AdminSplit(
 	ctx context.Context, args roachpb.AdminSplitRequest, reason string,
@@ -727,9 +732,11 @@ func (r *Replica) AdminMerge(
 		}
 		rhsSnapshotRes := br.(*roachpb.SubsumeResponse)
 
-		err = waitForApplication(
-			ctx, r.store.cfg.NodeDialer, rightDesc.RangeID, mergeReplicas,
-			rhsSnapshotRes.LeaseAppliedIndex)
+		err = contextutil.RunWithTimeout(ctx, "waiting for merge application", mergeApplicationTimeout,
+			func(ctx context.Context) error {
+				return waitForApplication(ctx, r.store.cfg.NodeDialer, rightDesc.RangeID, mergeReplicas,
+					rhsSnapshotRes.LeaseAppliedIndex)
+			})
 		if err != nil {
 			return errors.Wrap(err, "waiting for all right-hand replicas to catch up")
 		}
@@ -794,25 +801,23 @@ func waitForApplication(
 	replicas []roachpb.ReplicaDescriptor,
 	leaseIndex uint64,
 ) error {
-	return contextutil.RunWithTimeout(ctx, "wait for application", 5*time.Second, func(ctx context.Context) error {
-		g := ctxgroup.WithContext(ctx)
-		for _, repl := range replicas {
-			repl := repl // copy for goroutine
-			g.GoCtx(func(ctx context.Context) error {
-				conn, err := dialer.Dial(ctx, repl.NodeID, rpc.DefaultClass)
-				if err != nil {
-					return errors.Wrapf(err, "could not dial n%d", repl.NodeID)
-				}
-				_, err = NewPerReplicaClient(conn).WaitForApplication(ctx, &WaitForApplicationRequest{
-					StoreRequestHeader: StoreRequestHeader{NodeID: repl.NodeID, StoreID: repl.StoreID},
-					RangeID:            rangeID,
-					LeaseIndex:         leaseIndex,
-				})
-				return err
+	g := ctxgroup.WithContext(ctx)
+	for _, repl := range replicas {
+		repl := repl // copy for goroutine
+		g.GoCtx(func(ctx context.Context) error {
+			conn, err := dialer.Dial(ctx, repl.NodeID, rpc.DefaultClass)
+			if err != nil {
+				return errors.Wrapf(err, "could not dial n%d", repl.NodeID)
+			}
+			_, err = NewPerReplicaClient(conn).WaitForApplication(ctx, &WaitForApplicationRequest{
+				StoreRequestHeader: StoreRequestHeader{NodeID: repl.NodeID, StoreID: repl.StoreID},
+				RangeID:            rangeID,
+				LeaseIndex:         leaseIndex,
 			})
-		}
-		return g.Wait()
-	})
+			return err
+		})
+	}
+	return g.Wait()
 }
 
 // waitForReplicasInit blocks until it has proof that the replicas listed in
diff --git a/pkg/kv/kvserver/replica_write.go b/pkg/kv/kvserver/replica_write.go
index 4475c6ba4ab7..bb40c30759c3 100644
--- a/pkg/kv/kvserver/replica_write.go
+++ b/pkg/kv/kvserver/replica_write.go
@@ -25,6 +25,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/observedts"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/settings"
 	"github.com/cockroachdb/cockroach/pkg/storage"
 	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
 	"github.com/cockroachdb/cockroach/pkg/util"
@@ -37,6 +38,18 @@ import (
 	"go.etcd.io/etcd/raft/v3"
 )
 
+// migrateApplicationTimeout is the duration to wait for a Migrate command
+// to be applied to all replicas.
+//
+// TODO(erikgrinaker): this, and the timeout handling, should be moved into a
+// migration helper that manages checkpointing and retries as well.
+var migrateApplicationTimeout = settings.RegisterDurationSetting(
+	"kv.migration.migrate_application.timeout",
+	"timeout for a Migrate request to be applied across all replicas of a range",
+	1*time.Minute,
+	settings.PositiveDuration,
+)
+
 // executeWriteBatch is the entry point for client requests which may mutate the
 // range's replicated state. Requests taking this path are evaluated and ultimately
 // serialized through Raft, but pass through additional machinery whose goal is
@@ -249,12 +262,15 @@ func (r *Replica) executeWriteBatch(
 		//
 		// [1]: See PurgeOutdatedReplicas from the Migration service.
 		// [2]: pkg/migration
-		desc := r.Desc()
-		// NB: waitForApplication already has a timeout.
-		applicationErr := waitForApplication(
-			ctx, r.store.cfg.NodeDialer, desc.RangeID, desc.Replicas().Descriptors(),
-			// We wait for an index >= that of the migration command.
-			r.GetLeaseAppliedIndex())
+		applicationErr := contextutil.RunWithTimeout(ctx, "wait for Migrate application",
+			migrateApplicationTimeout.Get(&r.ClusterSettings().SV),
+			func(ctx context.Context) error {
+				desc := r.Desc()
+				return waitForApplication(
+					ctx, r.store.cfg.NodeDialer, desc.RangeID, desc.Replicas().Descriptors(),
+					// We wait for an index >= that of the migration command.
+					r.GetLeaseAppliedIndex())
+			})
 		propResult.Err = roachpb.NewError(applicationErr)
 	}
 	return propResult.Reply, nil, propResult.Err

From e0cc0640238705ae70733073b216a7529dd493bd Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Fri, 19 Nov 2021 18:39:34 +0000
Subject: [PATCH 2/2] migration: add informative log message for sep intents migrate failure

The separated intents migration has been seen to go into failure loops in the
wild, with a generic "context deadline exceeded" error. This adds a more
informative log entry with additional hints on how to resolve the problem.

Release note: None
---
 pkg/migration/migrations/separated_intents.go | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pkg/migration/migrations/separated_intents.go b/pkg/migration/migrations/separated_intents.go
index 5e99c4796015..9463fa3b460d 100644
--- a/pkg/migration/migrations/separated_intents.go
+++ b/pkg/migration/migrations/separated_intents.go
@@ -549,6 +549,18 @@ func postSeparatedIntentsMigration(
 		return err
 	})
 	if err != nil {
+		// TODO(erikgrinaker): This should be moved into a common helper for
+		// all migrations that e.g. manages checkpointing and retries as well.
+		// We add a message here for 21.2 temporarily, since this has been seen
+		// to fail in the wild.
+		log.Warningf(ctx, `Migrate command failed for range %d: %s.
+Command must be applied on all range replicas (not just a quorum). Please make
+sure all ranges are healthy and fully upreplicated. Heavy rebalancing may
+interfere with the migration; consider temporarily disabling rebalancing with
+the cluster settings kv.allocator.load_based_rebalancing=1 and
+kv.allocator.range_rebalance_threshold=1. The timeout can also be increased
+with kv.migration.migrate_application.timeout.`,
+			desc.RangeID, err)
 		return err
 	}
 }
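
For reference, the core pattern in the first patch (fan out one check per
replica and require the whole fan-out to finish within a configurable timeout)
can be sketched outside the CockroachDB tree using only the standard library
and golang.org/x/sync/errgroup in place of the contextutil and ctxgroup
helpers. The names below (waitForAll, check) and the hard-coded time.Minute
default are illustrative stand-ins, not CockroachDB APIs.

package main

import (
	"context"
	"fmt"
	"time"

	"golang.org/x/sync/errgroup"
)

// waitForAll runs one check per replica and fails if any check fails or if
// the whole fan-out exceeds timeout, analogous to wrapping waitForApplication
// in RunWithTimeout with a setting-controlled duration.
func waitForAll(
	ctx context.Context, replicas []int, timeout time.Duration,
	check func(context.Context, int) error,
) error {
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	g, ctx := errgroup.WithContext(ctx)
	for _, repl := range replicas {
		repl := repl // copy for goroutine, as in the patch
		g.Go(func() error { return check(ctx, repl) })
	}
	return g.Wait()
}

func main() {
	// A caller would read the timeout from a cluster setting; time.Minute here
	// stands in for the new kv.migration.migrate_application.timeout default.
	err := waitForAll(context.Background(), []int{1, 2, 3}, time.Minute,
		func(ctx context.Context, replica int) error {
			// Stand-in for the per-replica WaitForApplication RPC.
			select {
			case <-time.After(10 * time.Millisecond):
				return nil
			case <-ctx.Done():
				return fmt.Errorf("replica %d: %w", replica, ctx.Err())
			}
		})
	fmt.Println("wait result:", err)
}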