From c3519fe15c8aae15a6d1456e2f2d6d25efc77ba5 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Tue, 21 Feb 2023 14:19:39 -0500 Subject: [PATCH] kv: consider ErrReplicaNotFound and ErrReplicaCannotHoldLease to be retryable Fixes #96746. Fixes #100379. This commit considers ErrReplicaNotFound and ErrReplicaCannotHoldLease to be retryable replication change errors when thrown by lease transfer requests. In doing so, these errors will be retried by the retry loop in `Replica.executeAdminCommandWithDescriptor`. This avoids spurious errors when a split gets blocked behind a lateral replica move like we see in the following situation: 1. issue AdminSplit 2. range in joint config, first needs to leave (maybeLeaveAtomicChangeReplicas) 3. to leave, needs to transfer lease from voter_outgoing to voter_incoming 4(a). lease transfer request sent to replica that has not yet applied the replication change that added the voter_incoming to the range 4(b). lease transfer request delayed and delivered after voter_incoming has been transferred the lease, added to the range, then removed from the range. In either case, retrying the AdminSplit operation on these errors will ensure that it eventually succeeds. Release note (bug fix): Fixed a rare race that could allow large RESTOREs to fail with a `unable to find store` error. --- pkg/kv/kvnemesis/validator.go | 13 +-- pkg/kv/kvserver/batcheval/cmd_lease_test.go | 6 +- pkg/kv/kvserver/client_lease_test.go | 18 +--- pkg/kv/kvserver/client_replica_test.go | 2 +- pkg/kv/kvserver/markers.go | 11 ++ pkg/kv/kvserver/replica_learner_test.go | 112 ++++++++++++-------- pkg/kv/kvserver/replica_raft.go | 10 +- pkg/kv/kvserver/replica_range_lease.go | 2 +- pkg/roachpb/metadata_replicas.go | 11 +- 9 files changed, 104 insertions(+), 81 deletions(-) diff --git a/pkg/kv/kvnemesis/validator.go b/pkg/kv/kvnemesis/validator.go index 09feadc7f664..631753f78570 100644 --- a/pkg/kv/kvnemesis/validator.go +++ b/pkg/kv/kvnemesis/validator.go @@ -356,17 +356,7 @@ func transferLeaseResultIsIgnorable(res Result) bool { return false } return kvserver.IsLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(err) || - // Only VOTER (_FULL, _INCOMING, sometimes _OUTGOING) replicas can - // hold a range lease. Attempts to transfer to lease to any other - // replica type are rejected. See CheckCanReceiveLease. - resultIsErrorStr(res, `replica cannot hold lease`) || - // Only replicas that are part of the range can be given - // the lease. This case is hit if a TransferLease op races - // with a ChangeReplicas op. - resultIsErrorStr(res, `replica not found in RangeDescriptor`) || - // A lease transfer that races with a replica removal may find that - // the store it was targeting is no longer part of the range. - resultIsErrorStr(res, `unable to find store \d+ in range`) || + kvserver.IsLeaseTransferRejectedBecauseTargetCannotReceiveLease(err) || // A lease transfer is not permitted while a range merge is in its // critical phase. resultIsErrorStr(res, `cannot transfer lease while merge in progress`) || @@ -773,7 +763,6 @@ func (v *validator) processOp(op Operation) { ignore = kvserver.IsRetriableReplicationChangeError(err) || kvserver.IsIllegalReplicationChangeError(err) || kvserver.IsReplicationChangeInProgressError(err) || - errors.Is(err, roachpb.ErrReplicaCannotHoldLease) || transferLeaseResultIsIgnorable(t.Result) // replication changes can transfer leases } if !ignore { diff --git a/pkg/kv/kvserver/batcheval/cmd_lease_test.go b/pkg/kv/kvserver/batcheval/cmd_lease_test.go index 5826c5e9c986..9587216b561e 100644 --- a/pkg/kv/kvserver/batcheval/cmd_lease_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_lease_test.go @@ -151,13 +151,13 @@ func TestLeaseCommandLearnerReplica(t *testing.T) { // Learners are not allowed to become leaseholders for now, see the comments // in TransferLease and RequestLease. _, err := TransferLease(ctx, nil, cArgs, nil) - require.EqualError(t, err, `replica cannot hold lease`) + require.EqualError(t, err, `lease target replica cannot hold lease`) cArgs.Args = &kvpb.RequestLeaseRequest{} _, err = RequestLease(ctx, nil, cArgs, nil) const expForUnknown = `cannot replace lease with : ` + - `replica not found in RangeDescriptor` + `lease target replica not found in RangeDescriptor` require.EqualError(t, err, expForUnknown) cArgs.Args = &kvpb.RequestLeaseRequest{ @@ -169,7 +169,7 @@ func TestLeaseCommandLearnerReplica(t *testing.T) { const expForLearner = `cannot replace lease ` + `with repl=(n2,s2):2LEARNER seq=0 start=0,0 exp=: ` + - `replica cannot hold lease` + `lease target replica cannot hold lease` require.EqualError(t, err, expForLearner) } diff --git a/pkg/kv/kvserver/client_lease_test.go b/pkg/kv/kvserver/client_lease_test.go index ae352965949d..3a70e77f5616 100644 --- a/pkg/kv/kvserver/client_lease_test.go +++ b/pkg/kv/kvserver/client_lease_test.go @@ -247,21 +247,10 @@ func TestCannotTransferLeaseToVoterDemoting(t *testing.T) { wg.Add(1) go func() { defer wg.Done() - err := tc.Server(0).DB().AdminTransferLease(context.Background(), + err := tc.Server(0).DB().AdminTransferLease(ctx, scratchStartKey, tc.Target(2).StoreID) require.Error(t, err) - require.Regexp(t, - // The error generated during evaluation. - "replica cannot hold lease|"+ - // If the lease transfer request has not yet made it to the latching - // phase by the time we close(ch) below, we can receive the following - // error due to the sanity checking which happens in - // AdminTransferLease before attempting to evaluate the lease - // transfer. - // We have a sleep loop below to try to encourage the lease transfer - // to make it past that sanity check prior to letting the change - // of replicas proceed. - "cannot transfer lease to replica of type VOTER_DEMOTING_LEARNER", err.Error()) + require.True(t, errors.Is(err, roachpb.ErrReplicaCannotHoldLease)) }() // Try really hard to make sure that our request makes it past the // sanity check error to the evaluation error. @@ -360,7 +349,8 @@ func TestTransferLeaseToVoterDemotingFails(t *testing.T) { // (wasLastLeaseholder = false in CheckCanReceiveLease). err = tc.Server(0).DB().AdminTransferLease(context.Background(), scratchStartKey, tc.Target(2).StoreID) - require.EqualError(t, err, `replica cannot hold lease`) + require.Error(t, err) + require.True(t, errors.Is(err, roachpb.ErrReplicaCannotHoldLease)) // Make sure the lease is still on n1. leaseHolder, err = tc.FindRangeLeaseHolder(desc, nil) require.NoError(t, err) diff --git a/pkg/kv/kvserver/client_replica_test.go b/pkg/kv/kvserver/client_replica_test.go index 7ac03a9aea8b..ad6267dd1e0c 100644 --- a/pkg/kv/kvserver/client_replica_test.go +++ b/pkg/kv/kvserver/client_replica_test.go @@ -1637,7 +1637,7 @@ func TestLeaseExpirationBasedRangeTransfer(t *testing.T) { { // An invalid target should result in an error. - const expected = "unable to find store .* in range" + const expected = "lease target replica not found in RangeDescriptor" if err := l.replica0.AdminTransferLease(ctx, 1000, false /* bypassSafetyChecks */); !testutils.IsError(err, expected) { t.Fatalf("expected %s, but found %v", expected, err) } diff --git a/pkg/kv/kvserver/markers.go b/pkg/kv/kvserver/markers.go index 4f95d04d377c..d8901a3edd6e 100644 --- a/pkg/kv/kvserver/markers.go +++ b/pkg/kv/kvserver/markers.go @@ -38,6 +38,7 @@ var errMarkCanRetryReplicationChangeWithUpdatedDesc = errors.New("should retry w func IsRetriableReplicationChangeError(err error) bool { return errors.Is(err, errMarkCanRetryReplicationChangeWithUpdatedDesc) || IsLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(err) || + IsLeaseTransferRejectedBecauseTargetCannotReceiveLease(err) || isSnapshotError(err) } @@ -106,3 +107,13 @@ var errMarkLeaseTransferRejectedBecauseTargetMayNeedSnapshot = errors.New( func IsLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(err error) bool { return errors.Is(err, errMarkLeaseTransferRejectedBecauseTargetMayNeedSnapshot) } + +// IsLeaseTransferRejectedBecauseTargetCannotReceiveLease returns true if err +// (assumed to have been emitted by the current leaseholder when processing a +// lease transfer request) indicates that the target replica is not qualified to +// receive the lease. This could be because the current leaseholder has an +// outdated view of the target replica's state. +func IsLeaseTransferRejectedBecauseTargetCannotReceiveLease(err error) bool { + return errors.Is(err, roachpb.ErrReplicaNotFound) || + errors.Is(err, roachpb.ErrReplicaCannotHoldLease) +} diff --git a/pkg/kv/kvserver/replica_learner_test.go b/pkg/kv/kvserver/replica_learner_test.go index cda2a6fa913b..1d67084cf885 100644 --- a/pkg/kv/kvserver/replica_learner_test.go +++ b/pkg/kv/kvserver/replica_learner_test.go @@ -955,51 +955,79 @@ func TestSplitWithLearnerOrJointConfig(t *testing.T) { func TestSplitRetriesOnFailedExitOfJointConfig(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - ctx := context.Background() - var rangeIDAtomic int64 - var rejectedCount int - const maxRejects = 3 - reqFilter := func(ctx context.Context, ba *kvpb.BatchRequest) *kvpb.Error { - rangeID := roachpb.RangeID(atomic.LoadInt64(&rangeIDAtomic)) - if ba.RangeID == rangeID && ba.IsSingleTransferLeaseRequest() && rejectedCount < maxRejects { - rejectedCount++ - repl := ba.Requests[0].GetTransferLease().Lease.Replica - status := raftutil.ReplicaStateProbe - err := kvserver.NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(repl, status) - return kvpb.NewError(err) - } - return nil + testCases := []struct { + name string + errFn func(*kvpb.TransferLeaseRequest) error + }{ + { + name: "targetMayNeedSnapshot", + errFn: func(req *kvpb.TransferLeaseRequest) error { + repl := req.Lease.Replica + status := raftutil.ReplicaStateProbe + return kvserver.NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(repl, status) + }, + }, + { + name: "replicaNotFound", + errFn: func(_ *kvpb.TransferLeaseRequest) error { + return roachpb.ErrReplicaNotFound + }, + }, + { + name: "replicaCannotHoldLease", + errFn: func(_ *kvpb.TransferLeaseRequest) error { + return roachpb.ErrReplicaCannotHoldLease + }, + }, } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var rangeIDAtomic int64 + var rejectedCount int + const maxRejects = 3 + reqFilter := func(ctx context.Context, ba *kvpb.BatchRequest) *kvpb.Error { + rangeID := roachpb.RangeID(atomic.LoadInt64(&rangeIDAtomic)) + if ba.RangeID == rangeID && ba.IsSingleTransferLeaseRequest() && rejectedCount < maxRejects { + rejectedCount++ + req := ba.Requests[0].GetTransferLease() + err := tc.errFn(req) + return kvpb.NewError(err) + } + return nil + } - knobs, ltk := makeReplicationTestKnobs() - knobs.Store.(*kvserver.StoreTestingKnobs).TestingRequestFilter = reqFilter - tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{ - ServerArgs: base.TestServerArgs{Knobs: knobs}, - ReplicationMode: base.ReplicationManual, - }) - defer tc.Stopper().Stop(ctx) - - scratchStartKey := tc.ScratchRange(t) - scratchDesc := tc.LookupRangeOrFatal(t, scratchStartKey) - atomic.StoreInt64(&rangeIDAtomic, int64(scratchDesc.RangeID)) - - // Rebalance the range from one store to the other. This will enter a joint - // configuration and then stop because of the testing knobs. - atomic.StoreInt64(<k.replicationAlwaysUseJointConfig, 1) - atomic.StoreInt64(<k.replicaAddStopAfterJointConfig, 1) - tc.RebalanceVoterOrFatal(ctx, t, scratchStartKey, tc.Target(0), tc.Target(1)) - - // Perform a split of the range. This will auto-transitions us out of the - // joint conf before doing work. However, because of the filter we installed - // above, this will first run into a series of retryable errors when - // attempting to perform a lease transfer. The split should retry until the - // join configuration completes. - left, right := tc.SplitRangeOrFatal(t, scratchStartKey.Next()) - require.False(t, left.Replicas().InAtomicReplicationChange(), left) - require.False(t, right.Replicas().InAtomicReplicationChange(), right) - - require.Equal(t, maxRejects, rejectedCount) + ctx := context.Background() + knobs, ltk := makeReplicationTestKnobs() + knobs.Store.(*kvserver.StoreTestingKnobs).TestingRequestFilter = reqFilter + tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{ + ServerArgs: base.TestServerArgs{Knobs: knobs}, + ReplicationMode: base.ReplicationManual, + }) + defer tc.Stopper().Stop(ctx) + + scratchStartKey := tc.ScratchRange(t) + scratchDesc := tc.LookupRangeOrFatal(t, scratchStartKey) + atomic.StoreInt64(&rangeIDAtomic, int64(scratchDesc.RangeID)) + + // Rebalance the range from one store to the other. This will enter a joint + // configuration and then stop because of the testing knobs. + atomic.StoreInt64(<k.replicationAlwaysUseJointConfig, 1) + atomic.StoreInt64(<k.replicaAddStopAfterJointConfig, 1) + tc.RebalanceVoterOrFatal(ctx, t, scratchStartKey, tc.Target(0), tc.Target(1)) + + // Perform a split of the range. This will auto-transitions us out of the + // joint conf before doing work. However, because of the filter we installed + // above, this will first run into a series of retryable errors when + // attempting to perform a lease transfer. The split should retry until the + // joint configuration completes. + left, right := tc.SplitRangeOrFatal(t, scratchStartKey.Next()) + require.False(t, left.Replicas().InAtomicReplicationChange(), left) + require.False(t, right.Replicas().InAtomicReplicationChange(), right) + + require.Equal(t, maxRejects, rejectedCount) + }) + } } func TestReplicateQueueSeesLearnerOrJointConfig(t *testing.T) { diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 898892163df1..03eefdfebf7c 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -433,11 +433,13 @@ func (r *Replica) propose( if err := roachpb.CheckCanReceiveLease( lhDesc, proposedDesc.Replicas(), true, /* wasLastLeaseholder */ ); err != nil { - e := errors.Mark(errors.Wrapf(err, "%v received invalid ChangeReplicasTrigger %s to "+ + err = errors.Handled(err) + err = errors.Mark(err, errMarkInvalidReplicationChange) + err = errors.Wrapf(err, "%v received invalid ChangeReplicasTrigger %s to "+ "remove self (leaseholder); lhRemovalAllowed: %v; current desc: %v; proposed desc: %v", - lhDesc, crt, true /* lhRemovalAllowed */, r.Desc(), proposedDesc), errMarkInvalidReplicationChange) - log.Errorf(p.ctx, "%v", e) - return kvpb.NewError(e) + lhDesc, crt, true /* lhRemovalAllowed */, r.Desc(), proposedDesc) + log.Errorf(p.ctx, "%v", err) + return kvpb.NewError(err) } } else if p.command.ReplicatedEvalResult.AddSSTable != nil { log.VEvent(p.ctx, 4, "sideloadable proposal detected") diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index d8ce4dc077b3..45e598bccb1f 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -875,7 +875,7 @@ func (r *Replica) AdminTransferLease( // Verify the target is a replica of the range. var ok bool if nextLeaseHolder, ok = desc.GetReplicaDescriptor(target); !ok { - return nil, nil, errors.Errorf("unable to find store %d in range %+v", target, desc) + return nil, nil, roachpb.ErrReplicaNotFound } if nextLease, ok := r.mu.pendingLeaseRequest.RequestPending(); ok && diff --git a/pkg/roachpb/metadata_replicas.go b/pkg/roachpb/metadata_replicas.go index f458ae8b7bf5..1217ed878167 100644 --- a/pkg/roachpb/metadata_replicas.go +++ b/pkg/roachpb/metadata_replicas.go @@ -507,12 +507,15 @@ func (c ReplicaChangeType) IsRemoval() bool { } } -var errReplicaNotFound = errors.Errorf(`replica not found in RangeDescriptor`) +// ErrReplicaNotFound can be returned from CheckCanReceiveLease. +// +// See: https://github.com/cockroachdb/cockroach/issues/93163. +var ErrReplicaNotFound = errors.New(`lease target replica not found in RangeDescriptor`) // ErrReplicaCannotHoldLease can be returned from CheckCanReceiveLease. // -// See: https://github.com/cockroachdb/cockroach/issues/93163 -var ErrReplicaCannotHoldLease = errors.Errorf("replica cannot hold lease") +// See: https://github.com/cockroachdb/cockroach/issues/93163. +var ErrReplicaCannotHoldLease = errors.New(`lease target replica cannot hold lease`) // CheckCanReceiveLease checks whether `wouldbeLeaseholder` can receive a lease. // Returns an error if the respective replica is not eligible. @@ -543,7 +546,7 @@ func CheckCanReceiveLease( ) error { repDesc, ok := replDescs.GetReplicaDescriptorByID(wouldbeLeaseholder.ReplicaID) if !ok { - return errReplicaNotFound + return ErrReplicaNotFound } if !(repDesc.IsVoterNewConfig() || (repDesc.IsVoterOldConfig() && replDescs.containsVoterIncoming() && wasLastLeaseholder)) {