From 1b728541134252f4bebeef82a77b32631379d70b Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Fri, 16 Aug 2024 08:02:36 -0400 Subject: [PATCH] sambacc: add a retry loop to ctdb.monitor_cluster_meta_changes Add a loop that tries the `ctdb reloadnodes` command after an increasing delay. This is an attempt to fix a condition where ctdbd is apparently not ready to handle the `ctdb reloadnodes` command. In this case the command would be run but fail, and the monitor_cluster_meta_changes function would raise an exception. This would be caught by the command-level retry loop. However, this command-level retry loop will simply re-run monitor_cluster_meta_changes and this function now no longer has the same initial clustermeta state and has effectively "forgotten" that it needs to run reloadnodes. This new retry loop adds a level of error handling inside the monitor_cluster_meta_changes function so that we will retry with a bounded number of attempts. Signed-off-by: John Mulligan --- sambacc/ctdb.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/sambacc/ctdb.py b/sambacc/ctdb.py index da1dbcd..cc4dd55 100644 --- a/sambacc/ctdb.py +++ b/sambacc/ctdb.py @@ -20,6 +20,7 @@ import logging import os import subprocess +import time import typing from sambacc import config @@ -572,7 +573,23 @@ def monitor_cluster_meta_changes( if nodes_file_path: _logger.info("updating nodes file: %s", nodes_file_path) _save_nodes(nodes_file_path, expected_nodes) - _maybe_reload_nodes(leader_locator, reload_all=reload_all) + _maybe_reload_nodes_retry(leader_locator, reload_all=reload_all) + + +def _maybe_reload_nodes_retry( + leader_locator: typing.Optional[leader.LeaderLocator] = None, + reload_all: bool = False, + *, + tries: int = 5, +) -> None: + for idx in range(tries): + time.sleep(1 << idx) + try: + _maybe_reload_nodes(leader_locator, reload_all=reload_all) + return + except subprocess.CalledProcessError: +
_logger.exception("failed to execute reload nodes command") + raise RuntimeError("exceeded retries running reload nodes command") def _maybe_reload_nodes(