diff --git a/pkg/kv/kvnemesis/validator.go b/pkg/kv/kvnemesis/validator.go index 09feadc7f664..631753f78570 100644 --- a/pkg/kv/kvnemesis/validator.go +++ b/pkg/kv/kvnemesis/validator.go @@ -356,17 +356,7 @@ func transferLeaseResultIsIgnorable(res Result) bool { return false } return kvserver.IsLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(err) || - // Only VOTER (_FULL, _INCOMING, sometimes _OUTGOING) replicas can - // hold a range lease. Attempts to transfer to lease to any other - // replica type are rejected. See CheckCanReceiveLease. - resultIsErrorStr(res, `replica cannot hold lease`) || - // Only replicas that are part of the range can be given - // the lease. This case is hit if a TransferLease op races - // with a ChangeReplicas op. - resultIsErrorStr(res, `replica not found in RangeDescriptor`) || - // A lease transfer that races with a replica removal may find that - // the store it was targeting is no longer part of the range. - resultIsErrorStr(res, `unable to find store \d+ in range`) || + kvserver.IsLeaseTransferRejectedBecauseTargetCannotReceiveLease(err) || // A lease transfer is not permitted while a range merge is in its // critical phase. resultIsErrorStr(res, `cannot transfer lease while merge in progress`) || @@ -773,7 +763,6 @@ func (v *validator) processOp(op Operation) { ignore = kvserver.IsRetriableReplicationChangeError(err) || kvserver.IsIllegalReplicationChangeError(err) || kvserver.IsReplicationChangeInProgressError(err) || - errors.Is(err, roachpb.ErrReplicaCannotHoldLease) || transferLeaseResultIsIgnorable(t.Result) // replication changes can transfer leases } if !ignore { diff --git a/pkg/kv/kvserver/batcheval/cmd_lease_test.go b/pkg/kv/kvserver/batcheval/cmd_lease_test.go index 5826c5e9c986..9587216b561e 100644 --- a/pkg/kv/kvserver/batcheval/cmd_lease_test.go +++ b/pkg/kv/kvserver/batcheval/cmd_lease_test.go @@ -151,13 +151,13 @@ func TestLeaseCommandLearnerReplica(t *testing.T) { // Learners are not allowed to become leaseholders for now, see the comments // in TransferLease and RequestLease. _, err := TransferLease(ctx, nil, cArgs, nil) - require.EqualError(t, err, `replica cannot hold lease`) + require.EqualError(t, err, `lease target replica cannot hold lease`) cArgs.Args = &kvpb.RequestLeaseRequest{} _, err = RequestLease(ctx, nil, cArgs, nil) const expForUnknown = `cannot replace lease with : ` + - `replica not found in RangeDescriptor` + `lease target replica not found in RangeDescriptor` require.EqualError(t, err, expForUnknown) cArgs.Args = &kvpb.RequestLeaseRequest{ @@ -169,7 +169,7 @@ func TestLeaseCommandLearnerReplica(t *testing.T) { const expForLearner = `cannot replace lease ` + `with repl=(n2,s2):2LEARNER seq=0 start=0,0 exp=: ` + - `replica cannot hold lease` + `lease target replica cannot hold lease` require.EqualError(t, err, expForLearner) } diff --git a/pkg/kv/kvserver/client_lease_test.go b/pkg/kv/kvserver/client_lease_test.go index ae352965949d..3a70e77f5616 100644 --- a/pkg/kv/kvserver/client_lease_test.go +++ b/pkg/kv/kvserver/client_lease_test.go @@ -247,21 +247,10 @@ func TestCannotTransferLeaseToVoterDemoting(t *testing.T) { wg.Add(1) go func() { defer wg.Done() - err := tc.Server(0).DB().AdminTransferLease(context.Background(), + err := tc.Server(0).DB().AdminTransferLease(ctx, scratchStartKey, tc.Target(2).StoreID) require.Error(t, err) - require.Regexp(t, - // The error generated during evaluation. - "replica cannot hold lease|"+ - // If the lease transfer request has not yet made it to the latching - // phase by the time we close(ch) below, we can receive the following - // error due to the sanity checking which happens in - // AdminTransferLease before attempting to evaluate the lease - // transfer. - // We have a sleep loop below to try to encourage the lease transfer - // to make it past that sanity check prior to letting the change - // of replicas proceed. - "cannot transfer lease to replica of type VOTER_DEMOTING_LEARNER", err.Error()) + require.True(t, errors.Is(err, roachpb.ErrReplicaCannotHoldLease)) }() // Try really hard to make sure that our request makes it past the // sanity check error to the evaluation error. @@ -360,7 +349,8 @@ func TestTransferLeaseToVoterDemotingFails(t *testing.T) { // (wasLastLeaseholder = false in CheckCanReceiveLease). err = tc.Server(0).DB().AdminTransferLease(context.Background(), scratchStartKey, tc.Target(2).StoreID) - require.EqualError(t, err, `replica cannot hold lease`) + require.Error(t, err) + require.True(t, errors.Is(err, roachpb.ErrReplicaCannotHoldLease)) // Make sure the lease is still on n1. leaseHolder, err = tc.FindRangeLeaseHolder(desc, nil) require.NoError(t, err) diff --git a/pkg/kv/kvserver/client_replica_test.go b/pkg/kv/kvserver/client_replica_test.go index 7ac03a9aea8b..ad6267dd1e0c 100644 --- a/pkg/kv/kvserver/client_replica_test.go +++ b/pkg/kv/kvserver/client_replica_test.go @@ -1637,7 +1637,7 @@ func TestLeaseExpirationBasedRangeTransfer(t *testing.T) { { // An invalid target should result in an error. - const expected = "unable to find store .* in range" + const expected = "lease target replica not found in RangeDescriptor" if err := l.replica0.AdminTransferLease(ctx, 1000, false /* bypassSafetyChecks */); !testutils.IsError(err, expected) { t.Fatalf("expected %s, but found %v", expected, err) } diff --git a/pkg/kv/kvserver/markers.go b/pkg/kv/kvserver/markers.go index 4f95d04d377c..d8901a3edd6e 100644 --- a/pkg/kv/kvserver/markers.go +++ b/pkg/kv/kvserver/markers.go @@ -38,6 +38,7 @@ var errMarkCanRetryReplicationChangeWithUpdatedDesc = errors.New("should retry w func IsRetriableReplicationChangeError(err error) bool { return errors.Is(err, errMarkCanRetryReplicationChangeWithUpdatedDesc) || IsLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(err) || + IsLeaseTransferRejectedBecauseTargetCannotReceiveLease(err) || isSnapshotError(err) } @@ -106,3 +107,13 @@ var errMarkLeaseTransferRejectedBecauseTargetMayNeedSnapshot = errors.New( func IsLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(err error) bool { return errors.Is(err, errMarkLeaseTransferRejectedBecauseTargetMayNeedSnapshot) } + +// IsLeaseTransferRejectedBecauseTargetCannotReceiveLease returns true if err +// (assumed to have been emitted by the current leaseholder when processing a +// lease transfer request) indicates that the target replica is not qualified to +// receive the lease. This could be because the current leaseholder has an +// outdated view of the target replica's state. +func IsLeaseTransferRejectedBecauseTargetCannotReceiveLease(err error) bool { + return errors.Is(err, roachpb.ErrReplicaNotFound) || + errors.Is(err, roachpb.ErrReplicaCannotHoldLease) +} diff --git a/pkg/kv/kvserver/replica_learner_test.go b/pkg/kv/kvserver/replica_learner_test.go index cda2a6fa913b..1d67084cf885 100644 --- a/pkg/kv/kvserver/replica_learner_test.go +++ b/pkg/kv/kvserver/replica_learner_test.go @@ -955,51 +955,79 @@ func TestSplitWithLearnerOrJointConfig(t *testing.T) { func TestSplitRetriesOnFailedExitOfJointConfig(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - ctx := context.Background() - var rangeIDAtomic int64 - var rejectedCount int - const maxRejects = 3 - reqFilter := func(ctx context.Context, ba *kvpb.BatchRequest) *kvpb.Error { - rangeID := roachpb.RangeID(atomic.LoadInt64(&rangeIDAtomic)) - if ba.RangeID == rangeID && ba.IsSingleTransferLeaseRequest() && rejectedCount < maxRejects { - rejectedCount++ - repl := ba.Requests[0].GetTransferLease().Lease.Replica - status := raftutil.ReplicaStateProbe - err := kvserver.NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(repl, status) - return kvpb.NewError(err) - } - return nil + testCases := []struct { + name string + errFn func(*kvpb.TransferLeaseRequest) error + }{ + { + name: "targetMayNeedSnapshot", + errFn: func(req *kvpb.TransferLeaseRequest) error { + repl := req.Lease.Replica + status := raftutil.ReplicaStateProbe + return kvserver.NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(repl, status) + }, + }, + { + name: "replicaNotFound", + errFn: func(_ *kvpb.TransferLeaseRequest) error { + return roachpb.ErrReplicaNotFound + }, + }, + { + name: "replicaCannotHoldLease", + errFn: func(_ *kvpb.TransferLeaseRequest) error { + return roachpb.ErrReplicaCannotHoldLease + }, + }, } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + var rangeIDAtomic int64 + var rejectedCount int + const maxRejects = 3 + reqFilter := func(ctx context.Context, ba *kvpb.BatchRequest) *kvpb.Error { + rangeID := roachpb.RangeID(atomic.LoadInt64(&rangeIDAtomic)) + if ba.RangeID == rangeID && ba.IsSingleTransferLeaseRequest() && rejectedCount < maxRejects { + rejectedCount++ + req := ba.Requests[0].GetTransferLease() + err := tc.errFn(req) + return kvpb.NewError(err) + } + return nil + } - knobs, ltk := makeReplicationTestKnobs() - knobs.Store.(*kvserver.StoreTestingKnobs).TestingRequestFilter = reqFilter - tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{ - ServerArgs: base.TestServerArgs{Knobs: knobs}, - ReplicationMode: base.ReplicationManual, - }) - defer tc.Stopper().Stop(ctx) - - scratchStartKey := tc.ScratchRange(t) - scratchDesc := tc.LookupRangeOrFatal(t, scratchStartKey) - atomic.StoreInt64(&rangeIDAtomic, int64(scratchDesc.RangeID)) - - // Rebalance the range from one store to the other. This will enter a joint - // configuration and then stop because of the testing knobs. - atomic.StoreInt64(<k.replicationAlwaysUseJointConfig, 1) - atomic.StoreInt64(<k.replicaAddStopAfterJointConfig, 1) - tc.RebalanceVoterOrFatal(ctx, t, scratchStartKey, tc.Target(0), tc.Target(1)) - - // Perform a split of the range. This will auto-transitions us out of the - // joint conf before doing work. However, because of the filter we installed - // above, this will first run into a series of retryable errors when - // attempting to perform a lease transfer. The split should retry until the - // join configuration completes. - left, right := tc.SplitRangeOrFatal(t, scratchStartKey.Next()) - require.False(t, left.Replicas().InAtomicReplicationChange(), left) - require.False(t, right.Replicas().InAtomicReplicationChange(), right) - - require.Equal(t, maxRejects, rejectedCount) + ctx := context.Background() + knobs, ltk := makeReplicationTestKnobs() + knobs.Store.(*kvserver.StoreTestingKnobs).TestingRequestFilter = reqFilter + tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{ + ServerArgs: base.TestServerArgs{Knobs: knobs}, + ReplicationMode: base.ReplicationManual, + }) + defer tc.Stopper().Stop(ctx) + + scratchStartKey := tc.ScratchRange(t) + scratchDesc := tc.LookupRangeOrFatal(t, scratchStartKey) + atomic.StoreInt64(&rangeIDAtomic, int64(scratchDesc.RangeID)) + + // Rebalance the range from one store to the other. This will enter a joint + // configuration and then stop because of the testing knobs. + atomic.StoreInt64(<k.replicationAlwaysUseJointConfig, 1) + atomic.StoreInt64(<k.replicaAddStopAfterJointConfig, 1) + tc.RebalanceVoterOrFatal(ctx, t, scratchStartKey, tc.Target(0), tc.Target(1)) + + // Perform a split of the range. This will auto-transitions us out of the + // joint conf before doing work. However, because of the filter we installed + // above, this will first run into a series of retryable errors when + // attempting to perform a lease transfer. The split should retry until the + // joint configuration completes. + left, right := tc.SplitRangeOrFatal(t, scratchStartKey.Next()) + require.False(t, left.Replicas().InAtomicReplicationChange(), left) + require.False(t, right.Replicas().InAtomicReplicationChange(), right) + + require.Equal(t, maxRejects, rejectedCount) + }) + } } func TestReplicateQueueSeesLearnerOrJointConfig(t *testing.T) { diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 898892163df1..03eefdfebf7c 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -433,11 +433,13 @@ func (r *Replica) propose( if err := roachpb.CheckCanReceiveLease( lhDesc, proposedDesc.Replicas(), true, /* wasLastLeaseholder */ ); err != nil { - e := errors.Mark(errors.Wrapf(err, "%v received invalid ChangeReplicasTrigger %s to "+ + err = errors.Handled(err) + err = errors.Mark(err, errMarkInvalidReplicationChange) + err = errors.Wrapf(err, "%v received invalid ChangeReplicasTrigger %s to "+ "remove self (leaseholder); lhRemovalAllowed: %v; current desc: %v; proposed desc: %v", - lhDesc, crt, true /* lhRemovalAllowed */, r.Desc(), proposedDesc), errMarkInvalidReplicationChange) - log.Errorf(p.ctx, "%v", e) - return kvpb.NewError(e) + lhDesc, crt, true /* lhRemovalAllowed */, r.Desc(), proposedDesc) + log.Errorf(p.ctx, "%v", err) + return kvpb.NewError(err) } } else if p.command.ReplicatedEvalResult.AddSSTable != nil { log.VEvent(p.ctx, 4, "sideloadable proposal detected") diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index d8ce4dc077b3..45e598bccb1f 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -875,7 +875,7 @@ func (r *Replica) AdminTransferLease( // Verify the target is a replica of the range. var ok bool if nextLeaseHolder, ok = desc.GetReplicaDescriptor(target); !ok { - return nil, nil, errors.Errorf("unable to find store %d in range %+v", target, desc) + return nil, nil, roachpb.ErrReplicaNotFound } if nextLease, ok := r.mu.pendingLeaseRequest.RequestPending(); ok && diff --git a/pkg/roachpb/metadata_replicas.go b/pkg/roachpb/metadata_replicas.go index f458ae8b7bf5..1217ed878167 100644 --- a/pkg/roachpb/metadata_replicas.go +++ b/pkg/roachpb/metadata_replicas.go @@ -507,12 +507,15 @@ func (c ReplicaChangeType) IsRemoval() bool { } } -var errReplicaNotFound = errors.Errorf(`replica not found in RangeDescriptor`) +// ErrReplicaNotFound can be returned from CheckCanReceiveLease. +// +// See: https://github.com/cockroachdb/cockroach/issues/93163. +var ErrReplicaNotFound = errors.New(`lease target replica not found in RangeDescriptor`) // ErrReplicaCannotHoldLease can be returned from CheckCanReceiveLease. // -// See: https://github.com/cockroachdb/cockroach/issues/93163 -var ErrReplicaCannotHoldLease = errors.Errorf("replica cannot hold lease") +// See: https://github.com/cockroachdb/cockroach/issues/93163. +var ErrReplicaCannotHoldLease = errors.New(`lease target replica cannot hold lease`) // CheckCanReceiveLease checks whether `wouldbeLeaseholder` can receive a lease. // Returns an error if the respective replica is not eligible. @@ -543,7 +546,7 @@ func CheckCanReceiveLease( ) error { repDesc, ok := replDescs.GetReplicaDescriptorByID(wouldbeLeaseholder.ReplicaID) if !ok { - return errReplicaNotFound + return ErrReplicaNotFound } if !(repDesc.IsVoterNewConfig() || (repDesc.IsVoterOldConfig() && replDescs.containsVoterIncoming() && wasLastLeaseholder)) {