From 40ecc865147eef4499d2b96ba850b3b3a60cfc11 Mon Sep 17 00:00:00 2001 From: Alex Sarkesian Date: Thu, 21 Sep 2023 16:40:58 -0700 Subject: [PATCH] kvcoord: fix flake in `TestTransactionUnexpectedlyCommitted` The `TestTransactionUnexpectedlyCommitted/recovery_after_transfer_lease` test, introduced to test #107658, has been flaky (particularly under deadlock builds) due to a race condition between a retry of a write and intent resolution. While both orderings in this test result in a correct `AmbiguousResultError` for the client, when intent resolution wins the race, the retried write will attempt to push away the current lockholder; since it is illegal for a committed transaction to perform a push, this results in a different secondary error attached to the `AmbiguousResultError`. This change ensures a predefined ordering of these operations so that the secondary error is consistent across runs of the test. Fixes: #110187 Release note: None --- .../kvclient/kvcoord/dist_sender_ambiguous_test.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pkg/kv/kvclient/kvcoord/dist_sender_ambiguous_test.go b/pkg/kv/kvclient/kvcoord/dist_sender_ambiguous_test.go index c65af14d642e..5098dabfa3a1 100644 --- a/pkg/kv/kvclient/kvcoord/dist_sender_ambiguous_test.go +++ b/pkg/kv/kvclient/kvcoord/dist_sender_ambiguous_test.go @@ -1048,7 +1048,7 @@ func TestTransactionUnexpectedlyCommitted(t *testing.T) { // 7. _->n1: PushTxn(txn2->txn1) -- Discovers txn1 in STAGING and starts // recovery. // 8. _->n1: RecoverTxn(txn1) -- Recovery should mark txn1 committed, but - // pauses before returning so that txn1's intents don't get cleaned up. + // intent resolution on txn1 needs to be paused until after txn1 finishes. if req.ba.IsSingleRecoverTxnRequest() && cp == AfterSending { close(recoverComplete) } @@ -1075,8 +1075,15 @@ func TestTransactionUnexpectedlyCommitted(t *testing.T) { // 12. txn1->n1: EndTxn(commit) -- Recovery has already completed, so this // request fails with "transaction unexpectedly committed". - // - if req.ba.IsSingleRecoverTxnRequest() && cp == AfterSending { + // + // If the intent on (b) were resolved and txn2 could grab the lock prior + // to txn1's retry of the Put(b), the retry will cause a PushTxn to txn2. + // Given that the recovery at (8) has already completed, a PushTxn + // request where the pusher is a committed transaction results in an + // "already committed" TransactionStatusError from the txnwait queue. + // While this still results in AmbiguousResultError from the DistSender, + // the reason will be distinct; as such we pause the intent resolution. + if riReq, ok := req.ba.GetArg(kvpb.ResolveIntent); ok && riReq.Header().Key.Equal(keyB) && cp == BeforeSending { req.pauseUntil(t, txn1Done, cp) t.Logf("%s - complete, resp={%s}", req.prefix, resp) }