From da8b2abae1b609419d733797ddf863e46258b39a Mon Sep 17 00:00:00 2001 From: Eric Sheng Date: Wed, 9 Oct 2024 10:25:03 -0700 Subject: [PATCH] [BACKPORT 2.20][#24317] docdb: Fix race between commit/abort and old transaction cleanup Summary: Original commit: f51e54d8b7e8be649948a62a0d0cc2cc9fd4a496 / D38789 There exists a race condition between commit/abort path and old transaction cleanup for promoted transactions, where commit/abort path observes that old transaction cleanup is still ongoing and sets `cleanup_waiter_`, but old transaction cleanup finishes before `cleanup_waiter_` is set, resulting in the waiter never getting called. This is the cause of occasional failures of GeoTransactionsPromotionTest.TestPromotionReturningToAbortedState and GeoTransactionsPromotionRF1Test.TestTwoTabletPromotionFailure with the following stack: ``` F20241007 19:23:50 ../../src/yb/rpc/rpc.cc:339] Check failed: calls_.empty() Calls: [0x000035bd35b49d60 -> AbortTransaction: tablet_id: "36df9b80658448848075dd10894f489c" transaction_id: "`\2051\'S&K\333\267[<\276\373P\302\207" propagated_hybrid_time: 7079338864676978688, retrier: { task_id: -1 state: kFinished deadline: 314126.220s }] *** Check failure stack trace: *** @ 0x7fa6097965c0 google::LogMessage::SendToLog() @ 0x7fa609796c00 google::LogMessage::Flush() @ 0x7fa609799979 google::LogMessageFatal::~LogMessageFatal() @ 0x7fa60990b545 yb::rpc::Rpcs::Shutdown() @ 0x7fa60b40216d yb::client::TransactionManager::Impl::~Impl() @ 0x7fa60b3fdfad yb::client::TransactionManager::~TransactionManager() @ 0x7fa60cfe6f6e yb::tserver::DbServerBase::~DbServerBase() @ 0x7fa60d0b702e yb::tserver::TabletServer::~TabletServer() @ 0x7fa60e3e3393 yb::tserver::MiniTabletServer::Shutdown() @ 0x7fa60e561a05 yb::MiniCluster::Shutdown() @ 0x7fa60e595aca yb::YBMiniClusterTestBase<>::DoTearDown() @ 0x7fa60de00c1d testing::internal::HandleExceptionsInMethodIfSupported<>() @ 0x7fa60dde78c7 testing::TestInfo::Run() @ 0x7fa60dde8575 testing::TestSuite::Run() @ 0x7fa60ddf7e4e testing::internal::UnitTestImpl::RunAllTests() @ 0x7fa60de0190d testing::internal::HandleExceptionsInMethodIfSupported<>() @ 0x7fa60ddf795f testing::UnitTest::Run() @ 0x7fa60de81177 main @ 0x7fa607a29d90 (unknown) @ 0x7fa607a29e40 __libc_start_main @ 0x55f94fc7e325 _start ``` Jira: DB-13207 Test Plan: Jenkins. Also ran GeoTransactionsPromotionTest.TestPromotionReturningToAbortedState 500x and ensured above stack did not appear. Reviewers: sergei, rthallam Reviewed By: rthallam Subscribers: ybase, rthallam Differential Revision: https://phorge.dev.yugabyte.com/D38890 --- src/yb/client/transaction.cc | 63 +++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/src/yb/client/transaction.cc b/src/yb/client/transaction.cc index 458678804979..838320d5ae0f 100644 --- a/src/yb/client/transaction.cc +++ b/src/yb/client/transaction.cc @@ -1412,38 +1412,49 @@ class YBTransaction::Impl final : public internal::TxnBatcherIf { cleanup_type, tablet_ids); } + bool CheckWaitAbortOnOldStatusTablet(std::string_view action, Waiter waiter) REQUIRES(mutex_) { + auto old_status_tablet_state = old_status_tablet_state_.load(std::memory_order_acquire); + if (old_status_tablet_state != OldTransactionState::kAborting) { + return false; + } + + VLOG_WITH_PREFIX(1) << action << " done, but waiting for abort on old status tablet"; + cleanup_waiter_ = std::move(waiter); + return true; + } + void CommitDone(const Status& status, const tserver::UpdateTransactionResponsePB& response, const YBTransactionPtr& transaction) { TRACE_TO(trace_, __func__); - auto old_status_tablet_state = old_status_tablet_state_.load(std::memory_order_acquire); - if (old_status_tablet_state == OldTransactionState::kAborting) { + Status actual_status; + CommitCallback commit_callback; + { std::lock_guard lock(mutex_); - VLOG_WITH_PREFIX(1) << "Commit done, but waiting for abort on old status tablet"; - cleanup_waiter_ = Waiter(std::bind(&Impl::CommitDone, this, status, response, transaction)); - return; - } + if (CheckWaitAbortOnOldStatusTablet( + "Commit", Waiter(std::bind(&Impl::CommitDone, this, status, response, transaction)))) { + return; + } - VLOG_WITH_PREFIX(1) << "Committed: " << status; + VLOG_WITH_PREFIX(1) << "Committed: " << status; - UpdateClock(response, manager_); - manager_->rpcs().Unregister(&commit_handle_); + UpdateClock(response, manager_); + manager_->rpcs().Unregister(&commit_handle_); - Status actual_status = status.IsAlreadyPresent() ? Status::OK() : status; - CommitCallback commit_callback; - if (state_.load(std::memory_order_acquire) != TransactionState::kCommitted && - actual_status.ok()) { - std::lock_guard lock(mutex_); - commit_replicated_ = true; - if (running_requests_ != 0) { - return; + actual_status = status.IsAlreadyPresent() ? Status::OK() : status; + if (state_.load(std::memory_order_acquire) != TransactionState::kCommitted && + actual_status.ok()) { + commit_replicated_ = true; + if (running_requests_ != 0) { + return; + } + commit_callback = std::move(commit_callback_); + } else { + commit_callback = std::move(commit_callback_); } - commit_callback = std::move(commit_callback_); - } else { - std::lock_guard lock(mutex_); - commit_callback = std::move(commit_callback_); } + VLOG_WITH_PREFIX(4) << "Commit done: " << actual_status; commit_callback(actual_status); @@ -1460,12 +1471,12 @@ class YBTransaction::Impl final : public internal::TxnBatcherIf { const YBTransactionPtr& transaction) { TRACE_TO(trace_, __func__); - auto old_status_tablet_state = old_status_tablet_state_.load(std::memory_order_acquire); - if (old_status_tablet_state == OldTransactionState::kAborting) { + { std::lock_guard lock(mutex_); - VLOG_WITH_PREFIX(1) << "Abort done, but waiting for abort on old status tablet"; - cleanup_waiter_ = Waiter(std::bind(&Impl::AbortDone, this, status, response, transaction)); - return; + if (CheckWaitAbortOnOldStatusTablet( + "Abort", Waiter(std::bind(&Impl::AbortDone, this, status, response, transaction)))) { + return; + } } VLOG_WITH_PREFIX(1) << "Aborted: " << status;