From 6bf70dc460357f288d41cccda6c02ff3e8bb8606 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Fri, 19 Nov 2021 18:22:05 +0000
Subject: [PATCH 1/2] kvserver: increase Migrate application timeout to 1 minute

This increases the timeout when waiting for application of a `Migrate`
command on all range replicas to 1 minute, up from 5 seconds. It also adds a
cluster setting `kv.migration.migrate_application.timeout` to control this.

When encountering a range that's e.g. undergoing rebalancing, it can take a
long time for a learner replica to receive a snapshot and respond to this
request, which would cause the timeout to trigger. This is especially likely
in clusters with many ranges and frequent rebalancing activity.

Release note (bug fix): The timeout when checking for Raft application of
upgrade migrations has been increased from 5 seconds to 1 minute, and is now
controllable via the cluster setting
`kv.migration.migrate_application.timeout`. This makes upgrade migrations
much less likely to fail in clusters with ongoing rebalancing activity.
---
 pkg/kv/kvserver/replica_command.go | 47 +++++++++++++++++-------------
 pkg/kv/kvserver/replica_write.go   | 28 ++++++++++++++----
 2 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go
index 142bde8cf25f..014b672f5b9f 100644
--- a/pkg/kv/kvserver/replica_command.go
+++ b/pkg/kv/kvserver/replica_command.go
@@ -44,6 +44,11 @@ import (
 	"go.etcd.io/etcd/raft/v3/tracker"
 )
 
+// mergeApplicationTimeout is the timeout when waiting for a merge command to be
+// applied on all range replicas. There doesn't appear to be any strong reason
+// why this value was chosen in particular, but it seems to work.
+const mergeApplicationTimeout = 5 * time.Second
+
 // AdminSplit divides the range into into two ranges using args.SplitKey.
 func (r *Replica) AdminSplit(
 	ctx context.Context, args roachpb.AdminSplitRequest, reason string,
@@ -727,9 +732,11 @@ func (r *Replica) AdminMerge(
 		}
 		rhsSnapshotRes := br.(*roachpb.SubsumeResponse)
 
-		err = waitForApplication(
-			ctx, r.store.cfg.NodeDialer, rightDesc.RangeID, mergeReplicas,
-			rhsSnapshotRes.LeaseAppliedIndex)
+		err = contextutil.RunWithTimeout(ctx, "waiting for merge application", mergeApplicationTimeout,
+			func(ctx context.Context) error {
+				return waitForApplication(ctx, r.store.cfg.NodeDialer, rightDesc.RangeID, mergeReplicas,
+					rhsSnapshotRes.LeaseAppliedIndex)
+			})
 		if err != nil {
 			return errors.Wrap(err, "waiting for all right-hand replicas to catch up")
 		}
@@ -794,25 +801,23 @@ func waitForApplication(
 	replicas []roachpb.ReplicaDescriptor,
 	leaseIndex uint64,
 ) error {
-	return contextutil.RunWithTimeout(ctx, "wait for application", 5*time.Second, func(ctx context.Context) error {
-		g := ctxgroup.WithContext(ctx)
-		for _, repl := range replicas {
-			repl := repl // copy for goroutine
-			g.GoCtx(func(ctx context.Context) error {
-				conn, err := dialer.Dial(ctx, repl.NodeID, rpc.DefaultClass)
-				if err != nil {
-					return errors.Wrapf(err, "could not dial n%d", repl.NodeID)
-				}
-				_, err = NewPerReplicaClient(conn).WaitForApplication(ctx, &WaitForApplicationRequest{
-					StoreRequestHeader: StoreRequestHeader{NodeID: repl.NodeID, StoreID: repl.StoreID},
-					RangeID:            rangeID,
-					LeaseIndex:         leaseIndex,
-				})
-				return err
+	g := ctxgroup.WithContext(ctx)
+	for _, repl := range replicas {
+		repl := repl // copy for goroutine
+		g.GoCtx(func(ctx context.Context) error {
+			conn, err := dialer.Dial(ctx, repl.NodeID, rpc.DefaultClass)
+			if err != nil {
+				return errors.Wrapf(err, "could not dial n%d", repl.NodeID)
+			}
+			_, err = NewPerReplicaClient(conn).WaitForApplication(ctx, &WaitForApplicationRequest{
+				StoreRequestHeader: StoreRequestHeader{NodeID: repl.NodeID, StoreID: repl.StoreID},
+				RangeID:            rangeID,
+				LeaseIndex:         leaseIndex,
 			})
-		}
-		return g.Wait()
-	})
+			return err
+		})
+	}
+	return g.Wait()
 }
 
 // waitForReplicasInit blocks until it has proof that the replicas listed in
diff --git a/pkg/kv/kvserver/replica_write.go b/pkg/kv/kvserver/replica_write.go
index 4475c6ba4ab7..bb40c30759c3 100644
--- a/pkg/kv/kvserver/replica_write.go
+++ b/pkg/kv/kvserver/replica_write.go
@@ -25,6 +25,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/observedts"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/settings"
 	"github.com/cockroachdb/cockroach/pkg/storage"
 	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
 	"github.com/cockroachdb/cockroach/pkg/util"
@@ -37,6 +38,18 @@ import (
 	"go.etcd.io/etcd/raft/v3"
 )
 
+// migrateApplicationTimeout is the duration to wait for a Migrate command
+// to be applied to all replicas.
+//
+// TODO(erikgrinaker): this, and the timeout handling, should be moved into a
+// migration helper that manages checkpointing and retries as well.
+var migrateApplicationTimeout = settings.RegisterDurationSetting(
+	"kv.migration.migrate_application.timeout",
+	"timeout for a Migrate request to be applied across all replicas of a range",
+	1*time.Minute,
+	settings.PositiveDuration,
+)
+
 // executeWriteBatch is the entry point for client requests which may mutate the
 // range's replicated state. Requests taking this path are evaluated and ultimately
 // serialized through Raft, but pass through additional machinery whose goal is
@@ -249,12 +262,15 @@ func (r *Replica) executeWriteBatch(
 		//
 		// [1]: See PurgeOutdatedReplicas from the Migration service.
 		// [2]: pkg/migration
-		desc := r.Desc()
-		// NB: waitForApplication already has a timeout.
-		applicationErr := waitForApplication(
-			ctx, r.store.cfg.NodeDialer, desc.RangeID, desc.Replicas().Descriptors(),
-			// We wait for an index >= that of the migration command.
-			r.GetLeaseAppliedIndex())
+		applicationErr := contextutil.RunWithTimeout(ctx, "wait for Migrate application",
+			migrateApplicationTimeout.Get(&r.ClusterSettings().SV),
+			func(ctx context.Context) error {
+				desc := r.Desc()
+				return waitForApplication(
+					ctx, r.store.cfg.NodeDialer, desc.RangeID, desc.Replicas().Descriptors(),
+					// We wait for an index >= that of the migration command.
+					r.GetLeaseAppliedIndex())
+			})
 		propResult.Err = roachpb.NewError(applicationErr)
 	}
 	return propResult.Reply, nil, propResult.Err

From e0cc0640238705ae70733073b216a7529dd493bd Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Fri, 19 Nov 2021 18:39:34 +0000
Subject: [PATCH 2/2] migration: add informative log message for sep intents migrate failure

The separated intents migration has been seen to go into failure loops in the
wild, with a generic "context deadline exceeded" error. This adds a more
informative log entry with additional hints on how to resolve the problem.

Release note: None
---
 pkg/migration/migrations/separated_intents.go | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pkg/migration/migrations/separated_intents.go b/pkg/migration/migrations/separated_intents.go
index 5e99c4796015..9463fa3b460d 100644
--- a/pkg/migration/migrations/separated_intents.go
+++ b/pkg/migration/migrations/separated_intents.go
@@ -549,6 +549,18 @@ func postSeparatedIntentsMigration(
 		return err
 	})
 	if err != nil {
+		// TODO(erikgrinaker): This should be moved into a common helper for
+		// all migrations that e.g. manages checkpointing and retries as well.
+		// We add a message here for 21.2 temporarily, since this has been seen
+		// to fail in the wild.
+		log.Warningf(ctx, `Migrate command failed for range %d: %s.
+Command must be applied on all range replicas (not just a quorum). Please make
+sure all ranges are healthy and fully upreplicated. Heavy rebalancing may
+interfere with the migration; consider temporarily disabling rebalancing with
+the cluster settings kv.allocator.load_based_rebalancing=1 and
+kv.allocator.range_rebalance_threshold=1. The timeout can also be increased
+with kv.migration.migrate_application.timeout.`,
+			desc.RangeID, err)
 		return err
 	}
 }
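
For reference, the core pattern in the first patch (fan out one check per
replica and require the whole fan-out to finish within a configurable timeout)
can be sketched outside the CockroachDB tree using only the standard library
and golang.org/x/sync/errgroup in place of the contextutil and ctxgroup
helpers. The names below (waitForAll, check) and the hard-coded time.Minute
default are illustrative stand-ins, not CockroachDB APIs.

package main

import (
	"context"
	"fmt"
	"time"

	"golang.org/x/sync/errgroup"
)

// waitForAll runs one check per replica and fails if any check fails or if
// the whole fan-out exceeds timeout, analogous to wrapping waitForApplication
// in RunWithTimeout with a setting-controlled duration.
func waitForAll(
	ctx context.Context, replicas []int, timeout time.Duration,
	check func(context.Context, int) error,
) error {
	ctx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	g, ctx := errgroup.WithContext(ctx)
	for _, repl := range replicas {
		repl := repl // copy for goroutine, as in the patch
		g.Go(func() error { return check(ctx, repl) })
	}
	return g.Wait()
}

func main() {
	// A caller would read the timeout from a cluster setting; time.Minute here
	// stands in for the new kv.migration.migrate_application.timeout default.
	err := waitForAll(context.Background(), []int{1, 2, 3}, time.Minute,
		func(ctx context.Context, replica int) error {
			// Stand-in for the per-replica WaitForApplication RPC.
			select {
			case <-time.After(10 * time.Millisecond):
				return nil
			case <-ctx.Done():
				return fmt.Errorf("replica %d: %w", replica, ctx.Err())
			}
		})
	fmt.Println("wait result:", err)
}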