From ba7a372bcae7518eccc9d8da03e6a0a13562f026 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Mon, 15 Jul 2024 10:13:23 -0400 Subject: [PATCH 1/8] commands: fix incorrect help text for --write-nodes Fix the help text for `--write-nodes`. The previous help text was incorrect copy-paste. Signed-off-by: John Mulligan --- sambacc/commands/ctdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sambacc/commands/ctdb.py b/sambacc/commands/ctdb.py index a3847c1..962339a 100644 --- a/sambacc/commands/ctdb.py +++ b/sambacc/commands/ctdb.py @@ -340,7 +340,7 @@ def _ctdb_must_have_node_args(parser: argparse.ArgumentParser) -> None: parser.add_argument( "--write-nodes", action="store_true", - help="Specify node by IP", + help="Write ctdb nodes file based on cluster meta contents", ) From 18506269a38010dd1909026978506cdda496dd6f Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Sat, 13 Jul 2024 11:10:13 -0400 Subject: [PATCH 2/8] commands: encapsulate retry logic Move the limited-retries-on-error logic to a class that can be reused later. Signed-off-by: John Mulligan --- sambacc/commands/ctdb.py | 58 +++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/sambacc/commands/ctdb.py b/sambacc/commands/ctdb.py index 962339a..4bf2aa1 100644 --- a/sambacc/commands/ctdb.py +++ b/sambacc/commands/ctdb.py @@ -17,6 +17,7 @@ # import argparse +import contextlib import logging import os import socket @@ -311,28 +312,15 @@ def ctdb_manage_nodes(ctx: Context) -> None: expected_pnn = np.node_number or 0 waiter = np.cluster_meta_waiter() - errors = 0 + limiter = ErrorLimiter("ctdb_manage_nodes", 10, pause_func=waiter.wait) while True: - try: + with limiter.catch(): ctdb.monitor_cluster_meta_updates( cmeta=np.cluster_meta(), pnn=expected_pnn, real_path=np.persistent_path, pause_func=waiter.wait, ) - errors = 0 - except KeyboardInterrupt: - raise - except Exception as err: - _logger.error( - f"error during manage_nodes: {err}, count={errors}", - exc_info=True, - ) - errors += 1 - if errors > 10: - _logger.error(f"too many retries ({errors}). giving up") - raise - waiter.wait() def _ctdb_must_have_node_args(parser: argparse.ArgumentParser) -> None: @@ -411,3 +399,43 @@ def ctdb_rados_mutex(ctx: Context) -> None: cmd = cmd["-n", namespace] _logger.debug("executing command: %r", cmd) samba_cmds.execute(cmd) # replaces process + + +class ErrorLimiter: + def __init__( + self, + name: str, + limit: int, + *, + pause_func: typing.Optional[typing.Callable] = None, + ) -> None: + self.name = name + self.limit = limit + self.errors = 0 + self.pause_func = pause_func + + def post_catch(self): + if self.pause_func is not None: + self.pause_func() + + @contextlib.contextmanager + def catch(self) -> typing.Iterator[None]: + try: + _logger.debug( + "error limiter proceeding: %s: errors=%r", + self.name, + self.errors, + ) + yield + except KeyboardInterrupt: + raise + except Exception as err: + _logger.error( + f"error during {self.name}: {err}, count={self.errors}", + exc_info=True, + ) + self.errors += 1 + if self.errors > self.limit: + _logger.error(f"too many retries ({self.errors}). giving up") + raise + self.post_catch() From e553414d2a57b5e946f82fb20d71f0d2cc381471 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Sat, 13 Jul 2024 11:21:20 -0400 Subject: [PATCH 3/8] sambacc: rename function to manage_cluster_meta_updates Rename a function to manage_cluster_meta_updates to better match what it does. A future change will want to use the word "monitor" for something else and it would be confusing to continue calling this a "monitor" function vs "manage -- and this function is the more active "participant" of the two. Signed-off-by: John Mulligan --- sambacc/commands/ctdb.py | 2 +- sambacc/ctdb.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sambacc/commands/ctdb.py b/sambacc/commands/ctdb.py index 4bf2aa1..428462e 100644 --- a/sambacc/commands/ctdb.py +++ b/sambacc/commands/ctdb.py @@ -315,7 +315,7 @@ def ctdb_manage_nodes(ctx: Context) -> None: limiter = ErrorLimiter("ctdb_manage_nodes", 10, pause_func=waiter.wait) while True: with limiter.catch(): - ctdb.monitor_cluster_meta_updates( + ctdb.manage_cluster_meta_updates( cmeta=np.cluster_meta(), pnn=expected_pnn, real_path=np.persistent_path, diff --git a/sambacc/ctdb.py b/sambacc/ctdb.py index bfb2a0a..66bfb88 100644 --- a/sambacc/ctdb.py +++ b/sambacc/ctdb.py @@ -338,7 +338,7 @@ def manage_nodes( pause_func: typing.Callable, ) -> None: """Monitor nodes json for updates, reflecting those changes into ctdb.""" - monitor_cluster_meta_updates( + manage_cluster_meta_updates( ClusterMetaJSONFile(nodes_json), pnn, real_path, @@ -346,7 +346,7 @@ def manage_nodes( ) -def monitor_cluster_meta_updates( +def manage_cluster_meta_updates( cmeta: ClusterMeta, pnn: int, real_path: str, From c1e995ccdea5017296cea21670269988a3c7c1e0 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Sat, 13 Jul 2024 11:22:05 -0400 Subject: [PATCH 4/8] commands: improve ctdb_manage_nodes doc comment Make the doc comment on ctdb_manage_nodes more accurate. Signed-off-by: John Mulligan --- sambacc/commands/ctdb.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sambacc/commands/ctdb.py b/sambacc/commands/ctdb.py index 428462e..bb2a81c 100644 --- a/sambacc/commands/ctdb.py +++ b/sambacc/commands/ctdb.py @@ -303,9 +303,9 @@ def ctdb_set_node(ctx: Context) -> None: @commands.command(name="ctdb-manage-nodes", arg_func=_ctdb_general_node_args) def ctdb_manage_nodes(ctx: Context) -> None: - """Run a long lived procees to monitor the node state file for new nodes. - When a new node is found, if the current node is in the correct state, this - node will add it to CTDB. + """Run a long lived process to manage the cluster metadata. It can add new + nodes. When a new node is found, if the current node is in the correct + state, this node will add it to CTDB. """ _ctdb_ok() np = NodeParams(ctx) From 38ca35e27737f14076bfcef313891309ea83f03d Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Sat, 13 Jul 2024 13:55:05 -0400 Subject: [PATCH 5/8] sambacc: split out some potentially common ctdb logic into funcs Split out some logic from the cluster_meta_to_nodes function into separate functions for current & later reuse. Signed-off-by: John Mulligan --- sambacc/ctdb.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/sambacc/ctdb.py b/sambacc/ctdb.py index 66bfb88..23bd191 100644 --- a/sambacc/ctdb.py +++ b/sambacc/ctdb.py @@ -464,10 +464,7 @@ def _node_update(cmeta: ClusterMeta, real_path: str) -> bool: new_ctdb_nodes.append(expected_line) else: new_ctdb_nodes[pnn] = expected_line - with open(real_path, "w") as nffh: - write_nodes_file(nffh, new_ctdb_nodes) - nffh.flush() - os.fsync(nffh) + _save_nodes(real_path, new_ctdb_nodes) _logger.info("running: ctdb reloadnodes") subprocess.check_call(list(samba_cmds.ctdb["reloadnodes"])) for entry in need_reload: @@ -490,17 +487,26 @@ def cluster_meta_to_nodes(cmeta: ClusterMeta, real_path: str) -> None: json_data = cmo.load() nodes = json_data.get("nodes", []) _logger.info("Found node metadata: %r", nodes) - pnn_max = max(n["pnn"] for n in nodes) + 1 # pnn is zero indexed - ctdb_nodes: list[str] = [""] * pnn_max - for entry in nodes: - pnn = entry["pnn"] - expected_line = _entry_to_node(ctdb_nodes, entry) - ctdb_nodes[pnn] = expected_line + ctdb_nodes = _cluster_meta_to_ctdb_nodes(nodes) _logger.info("Will write nodes: %s", ctdb_nodes) - with open(real_path, "w") as nffh: - write_nodes_file(nffh, ctdb_nodes) - nffh.flush() - os.fsync(nffh) + _save_nodes(real_path, ctdb_nodes) + + +def _cluster_meta_to_ctdb_nodes(nodes: list[dict]) -> list[str]: + pnn_max = max(n["pnn"] for n in nodes) + 1 # pnn is zero indexed + ctdb_nodes: list[str] = [""] * pnn_max + for entry in nodes: + pnn = entry["pnn"] + expected_line = _entry_to_node(ctdb_nodes, entry) + ctdb_nodes[pnn] = expected_line + return ctdb_nodes + + +def _save_nodes(path: str, ctdb_nodes: list[str]) -> None: + with open(path, "w") as nffh: + write_nodes_file(nffh, ctdb_nodes) + nffh.flush() + os.fsync(nffh) def ensure_ctdbd_etc_files( From 4b08b8e8204f5e12a9ec8d5b049f03818dbef710 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Tue, 16 Jul 2024 09:14:14 -0400 Subject: [PATCH 6/8] sambacc: simplify _cluster_meta_to_ctdb_nodes function Signed-off-by: John Mulligan --- sambacc/ctdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sambacc/ctdb.py b/sambacc/ctdb.py index 23bd191..c51bffc 100644 --- a/sambacc/ctdb.py +++ b/sambacc/ctdb.py @@ -497,8 +497,8 @@ def _cluster_meta_to_ctdb_nodes(nodes: list[dict]) -> list[str]: ctdb_nodes: list[str] = [""] * pnn_max for entry in nodes: pnn = entry["pnn"] - expected_line = _entry_to_node(ctdb_nodes, entry) - ctdb_nodes[pnn] = expected_line + # overwrite the pnn indexed entry with expected value + ctdb_nodes[pnn] = _entry_to_node(ctdb_nodes, entry) return ctdb_nodes From 6f19de6a60b440085937d703c0a90899c3b5e273 Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Mon, 15 Jul 2024 13:41:47 -0400 Subject: [PATCH 7/8] sambacc: add monitor_cluster_meta_changes function Add the monitor_cluster_meta_changes to the ctdb module. This function monitors the cluster meta and can reflect those changes into the ctdb nodes file and/or call reload nodes. Unlike the previous manage_cluster_meta_updates the sambacc process is not an active participant in the management of the cluster meta rather we assume some outside entity is updating the metadata with the desired state of the cluster correctly. monitor_cluster_meta_changes exists to translate that state into something ctdb can use. Signed-off-by: John Mulligan --- sambacc/ctdb.py | 82 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/sambacc/ctdb.py b/sambacc/ctdb.py index c51bffc..095033a 100644 --- a/sambacc/ctdb.py +++ b/sambacc/ctdb.py @@ -23,6 +23,7 @@ import typing from sambacc import config +from sambacc import leader from sambacc import samba_cmds from sambacc.jfile import ClusterMetaJSONFile from sambacc.netcmd_loader import template_config @@ -509,6 +510,87 @@ def _save_nodes(path: str, ctdb_nodes: list[str]) -> None: os.fsync(nffh) +def monitor_cluster_meta_changes( + cmeta: ClusterMeta, + pause_func: typing.Callable, + *, + nodes_file_path: typing.Optional[str] = None, + reload_all: bool = False, + leader_locator: typing.Optional[leader.LeaderLocator] = None, +) -> None: + """Monitor cluster meta for changes, reflecting those changes into ctdb. + + Unlike manage_cluster_meta_updates this function never changes the + contents of the nodes list in the cluster meta and takes those values + as a given, assuming some external agent has the correct global view of + the cluster and is updating it correctly. This function exists to + translate that content into something ctdb can understand. + """ + prev_meta: dict[str, typing.Any] = {} + if nodes_file_path: + prev_nodes = read_ctdb_nodes(nodes_file_path) + else: + with cmeta.open(locked=True) as cmo: + meta1 = cmo.load() + prev_nodes = _cluster_meta_to_ctdb_nodes(meta1.get("nodes", [])) + _logger.debug("initial cluster meta content: %r", prev_meta) + _logger.debug("initial nodes content: %r", prev_nodes) + while True: + pause_func() + with cmeta.open(locked=True) as cmo: + curr_meta = cmo.load() + if curr_meta == prev_meta: + _logger.debug("cluster meta content unchanged: %r", curr_meta) + continue + _logger.info("cluster meta content changed") + _logger.debug( + "cluster meta: previous=%r current=%r", prev_meta, curr_meta + ) + prev_meta = curr_meta + + # maybe some other metadata changed? + expected_nodes = _cluster_meta_to_ctdb_nodes( + curr_meta.get("nodes", []) + ) + if prev_nodes == expected_nodes: + _logger.debug("ctdb nodes list unchanged: %r", expected_nodes) + continue + _logger.info("ctdb nodes list changed") + _logger.debug( + "nodes list: previous=%r current=%r", prev_nodes, expected_nodes + ) + prev_nodes = expected_nodes + + if nodes_file_path: + _logger.info("updating nodes file: %s", nodes_file_path) + _save_nodes(nodes_file_path, expected_nodes) + _maybe_reload_nodes(leader_locator, reload_all=reload_all) + + +def _maybe_reload_nodes( + leader_locator: typing.Optional[leader.LeaderLocator] = None, + reload_all: bool = False, +) -> None: + """Issue a reloadnodes command if leader_locator is available and + node is leader or reload_all is true. + """ + if reload_all: + _logger.info("running: ctdb reloadnodes") + subprocess.check_call(list(samba_cmds.ctdb["reloadnodes"])) + return + if leader_locator is None: + _logger.warning("no leader locator: not calling reloadnodes") + return + # use the leader locator to only issue the reloadnodes command once + # for a change instead of all the nodes "spamming" the cluster + with leader_locator as ll: + if ll.is_leader(): + _logger.info("running: ctdb reloadnodes") + subprocess.check_call(list(samba_cmds.ctdb["reloadnodes"])) + else: + _logger.info("node is not leader. skipping reloadnodes") + + def ensure_ctdbd_etc_files( etc_path: str = ETC_DIR, src_path: str = SHARE_DIR ) -> None: From c0154c0a365befe6f809794158c4eaed19f9574e Mon Sep 17 00:00:00 2001 From: John Mulligan Date: Mon, 15 Jul 2024 13:41:47 -0400 Subject: [PATCH 8/8] commands: add ctdb-monitor-nodes command Add a `ctdb-monitor-nodes` command - this command will monitor the cluster meta and update the containers/ctdb when/if it changes. The `--write-nodes` parameter is used to instruct the command to write the ctdb nodes file on changes. The `--reload` option is used to control what "nodes" issue `ctdb reloadnodes` commands: leader (only the cluster leader), all (all nodes that see the cluster meta change), and never (never issue reloadnodes commands). Signed-off-by: John Mulligan --- sambacc/commands/ctdb.py | 44 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/sambacc/commands/ctdb.py b/sambacc/commands/ctdb.py index bb2a81c..abbfe78 100644 --- a/sambacc/commands/ctdb.py +++ b/sambacc/commands/ctdb.py @@ -29,7 +29,7 @@ from sambacc import samba_cmds from sambacc.simple_waiter import Sleeper, Waiter -from .cli import best_waiter, commands, Context, Fail +from .cli import best_leader_locator, best_waiter, commands, Context, Fail _logger = logging.getLogger(__name__) @@ -323,6 +323,48 @@ def ctdb_manage_nodes(ctx: Context) -> None: ) +def _ctdb_monitor_nodes_args(parser: argparse.ArgumentParser) -> None: + _ctdb_must_have_node_args(parser) + parser.add_argument( + "--reload", + choices=("leader", "never", "all"), + default="leader", + help="Specify which nodes can command CTDB to reload nodes", + ) + + +@commands.command(name="ctdb-monitor-nodes", arg_func=_ctdb_monitor_nodes_args) +def ctdb_monitor_nodes(ctx: Context) -> None: + """Run a long lived process to monitor the cluster metadata. + Unlike ctdb_manage_nodes this function assumes that the node state + file is externally managed and primarily exists to reflect any changes + to the cluster meta into CTDB. + """ + _ctdb_ok() + np = NodeParams(ctx) + waiter = np.cluster_meta_waiter() + leader_locator = None + if ctx.cli.reload == "leader": + leader_locator = best_leader_locator(ctx.instance_config) + reload_all = ctx.cli.reload == "all" + nodes_file_path = np.persistent_path if ctx.cli.write_nodes else None + + _logger.info("monitoring cluster meta changes") + _logger.debug( + "reload_all=%s leader_locator=%r", reload_all, leader_locator + ) + limiter = ErrorLimiter("ctdb_monitor_nodes", 10, pause_func=waiter.wait) + while True: + with limiter.catch(): + ctdb.monitor_cluster_meta_changes( + cmeta=np.cluster_meta(), + pause_func=waiter.wait, + nodes_file_path=nodes_file_path, + leader_locator=leader_locator, + reload_all=reload_all, + ) + + def _ctdb_must_have_node_args(parser: argparse.ArgumentParser) -> None: _ctdb_general_node_args(parser) parser.add_argument(