From 4d702276660b7ed775ebf6249c1b244da33c2792 Mon Sep 17 00:00:00 2001 From: Yuan Jing Vincent Yan Date: Tue, 28 Jan 2025 15:15:12 -0500 Subject: [PATCH 1/4] Fix mqbblp::RecoveryManager: Clear sync peer promptly Signed-off-by: Yuan Jing Vincent Yan --- src/groups/mqb/mqbblp/mqbblp_recoverymanager.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/groups/mqb/mqbblp/mqbblp_recoverymanager.cpp b/src/groups/mqb/mqbblp/mqbblp_recoverymanager.cpp index e90ef1b400..471add293e 100644 --- a/src/groups/mqb/mqbblp/mqbblp_recoverymanager.cpp +++ b/src/groups/mqb/mqbblp/mqbblp_recoverymanager.cpp @@ -1417,10 +1417,20 @@ void RecoveryManager::onPartitionPrimarySyncStatus(int partitionId, int status) PrimarySyncContext& primarySyncCtx = d_primarySyncContexts[partitionId]; BSLS_ASSERT_SAFE(primarySyncCtx.primarySyncInProgress()); + BALL_LOG_INFO << d_clusterData_p->identity().description() + << "For Partition [" << partitionId + << "], primary sync returned with status: " << status + << ". Resetting primary sync peer from " + << (primarySyncCtx.syncPeer() + ? 
primarySyncCtx.syncPeer()->nodeDescription() + : "** null **") + << " to ** null **."; + d_clusterData_p->scheduler().cancelEventAndWait( &primarySyncCtx.primarySyncStatusEventHandle()); primarySyncCtx.partitionPrimarySyncCb()(partitionId, status); + primarySyncCtx.setPrimarySyncPeer(0); if (primarySyncCtx.fileTransferInfo().areFilesMapped()) { // Don't clear the 'primarySyncCtx' at this time because files are From d0ffa3b5fe8929ee35d959e2047f4591353a5ada Mon Sep 17 00:00:00 2001 From: Yuan Jing Vincent Yan Date: Tue, 28 Jan 2025 15:19:49 -0500 Subject: [PATCH 2/4] mqb logging: PartitionId -> Partition Signed-off-by: Yuan Jing Vincent Yan --- src/groups/mqb/mqbblp/mqbblp_clusterorchestrator.cpp | 2 +- src/groups/mqb/mqbs/mqbs_filebackedstorage.cpp | 2 +- src/groups/mqb/mqbs/mqbs_virtualstoragecatalog.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/groups/mqb/mqbblp/mqbblp_clusterorchestrator.cpp b/src/groups/mqb/mqbblp/mqbblp_clusterorchestrator.cpp index 163c93bb90..d155c9a81e 100644 --- a/src/groups/mqb/mqbblp/mqbblp_clusterorchestrator.cpp +++ b/src/groups/mqb/mqbblp/mqbblp_clusterorchestrator.cpp @@ -1770,7 +1770,7 @@ void ClusterOrchestrator::processPrimaryStatusAdvisory( // TBD: may need to review the order of invoking these routines. 
BALL_LOG_INFO << d_clusterData_p->identity().description() - << " PartitionId [" << primaryAdv.partitionId() + << " Partition [" << primaryAdv.partitionId() << "]: received primary status advisory: " << primaryAdv << ", from: " << source->nodeDescription(); diff --git a/src/groups/mqb/mqbs/mqbs_filebackedstorage.cpp b/src/groups/mqb/mqbs/mqbs_filebackedstorage.cpp index 5ea7fd38ce..7196ef3b89 100644 --- a/src/groups/mqb/mqbs/mqbs_filebackedstorage.cpp +++ b/src/groups/mqb/mqbs/mqbs_filebackedstorage.cpp @@ -470,7 +470,7 @@ FileBackedStorage::releaseRef(const bmqt::MessageGUID& guid) if (0 != rc) { BMQTSK_ALARMLOG_ALARM("FILE_IO") - << "PartitionId [" << partitionId() << "] failed to write " + << "Partition [" << partitionId() << "] failed to write " << "DELETION record for GUID: " << guid << ", for queue '" << d_queueUri << "', queueKey '" << d_queueKey << "' while attempting to purge the message, rc: " << rc diff --git a/src/groups/mqb/mqbs/mqbs_virtualstoragecatalog.cpp b/src/groups/mqb/mqbs/mqbs_virtualstoragecatalog.cpp index 07c77d43c5..d9085dd4bb 100644 --- a/src/groups/mqb/mqbs/mqbs_virtualstoragecatalog.cpp +++ b/src/groups/mqb/mqbs/mqbs_virtualstoragecatalog.cpp @@ -342,7 +342,7 @@ VirtualStorageCatalog::removeAll(const mqbu::StorageKey& appKey) else { if (result == mqbi::StorageResult::e_GUID_NOT_FOUND) { BALL_LOG_WARN - << "#STORAGE_PURGE_ERROR " << "PartitionId [" + << "#STORAGE_PURGE_ERROR " << "Partition [" << d_storage_p->partitionId() << "]" << ": Attempting to purge GUID '" << itData->first << "' from virtual storage with appId '" @@ -359,7 +359,7 @@ VirtualStorageCatalog::removeAll(const mqbu::StorageKey& appKey) } else { BMQTSK_ALARMLOG_ALARM("STORAGE_PURGE_ERROR") - << "PartitionId [" << d_storage_p->partitionId() << "]" + << "Partition [" << d_storage_p->partitionId() << "]" << ": Attempting to purge GUID '" << itData->first << "' from virtual storage with appId '" << itVs->value()->appId() << "' & appKey '" << appKey From 
d3f6e8aa7e803f2c73bbaa0bd6db87a735030155 Mon Sep 17 00:00:00 2001 From: Yuan Jing Vincent Yan Date: Wed, 5 Feb 2025 13:43:00 -0500 Subject: [PATCH 3/4] mqbblp::StorageMgr: Fix logging Signed-off-by: Yuan Jing Vincent Yan --- src/groups/mqb/mqbblp/mqbblp_storagemanager.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/groups/mqb/mqbblp/mqbblp_storagemanager.cpp b/src/groups/mqb/mqbblp/mqbblp_storagemanager.cpp index 2f8572cfef..1c880a49e5 100644 --- a/src/groups/mqb/mqbblp/mqbblp_storagemanager.cpp +++ b/src/groups/mqb/mqbblp/mqbblp_storagemanager.cpp @@ -796,13 +796,15 @@ void StorageManager::processPartitionSyncEventDispatched( return; // RETURN } - if (source != d_recoveryManager_mp->primarySyncPeer(partitionId)) { + mqbnet::ClusterNode* syncPeer = d_recoveryManager_mp->primarySyncPeer( + partitionId); + if (source != syncPeer) { BALL_LOG_ERROR << d_clusterData_p->identity().description() << " Partition [" << partitionId << "]: received a partition sync event from: " << source->nodeDescription() << ", while partition-sync peer is: " - << source->nodeDescription(); + << syncPeer->nodeDescription(); return; // RETURN } From f4cbbb6d0c1ad497c26aa185e576614f0b01de3b Mon Sep 17 00:00:00 2001 From: Yuan Jing Vincent Yan Date: Wed, 5 Feb 2025 13:53:37 -0500 Subject: [PATCH 4/4] mqbblp::RecoveryMgr: Explain why reset sync peer early Signed-off-by: Yuan Jing Vincent Yan --- src/groups/mqb/mqbblp/mqbblp_recoverymanager.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/groups/mqb/mqbblp/mqbblp_recoverymanager.cpp b/src/groups/mqb/mqbblp/mqbblp_recoverymanager.cpp index 471add293e..0347a77328 100644 --- a/src/groups/mqb/mqbblp/mqbblp_recoverymanager.cpp +++ b/src/groups/mqb/mqbblp/mqbblp_recoverymanager.cpp @@ -1430,13 +1430,18 @@ void RecoveryManager::onPartitionPrimarySyncStatus(int partitionId, int status) &primarySyncCtx.primarySyncStatusEventHandle()); primarySyncCtx.partitionPrimarySyncCb()(partitionId, 
status); - primarySyncCtx.setPrimarySyncPeer(0); if (primarySyncCtx.fileTransferInfo().areFilesMapped()) { // Don't clear the 'primarySyncCtx' at this time because files are // still mapped. It will be cleaned up when the chunk deleter // eventually invokes 'partitionSyncCleanupDispatched' routine. + // However, we have already received all sync data chunks from the sync + // peer, so we can reset our sync peer to *null*. This prevents a false + // alarm of primary sync failure if that peer happens to go down before + // 'partitionSyncCleanupDispatched' is invoked. + primarySyncCtx.setPrimarySyncPeer(0); + return; // RETURN }