From ba7a372bcae7518eccc9d8da03e6a0a13562f026 Mon Sep 17 00:00:00 2001
From: John Mulligan <jmulligan@redhat.com>
Date: Mon, 15 Jul 2024 10:13:23 -0400
Subject: [PATCH 1/8] commands: fix incorrect help text for --write-nodes

Fix the help text for `--write-nodes`. The previous help text was
incorrect copy-paste.

Signed-off-by: John Mulligan <jmulligan@redhat.com>
---
 sambacc/commands/ctdb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sambacc/commands/ctdb.py b/sambacc/commands/ctdb.py
index a3847c1..962339a 100644
--- a/sambacc/commands/ctdb.py
+++ b/sambacc/commands/ctdb.py
@@ -340,7 +340,7 @@ def _ctdb_must_have_node_args(parser: argparse.ArgumentParser) -> None:
     parser.add_argument(
         "--write-nodes",
         action="store_true",
-        help="Specify node by IP",
+        help="Write ctdb nodes file based on cluster meta contents",
     )
 
 

From 18506269a38010dd1909026978506cdda496dd6f Mon Sep 17 00:00:00 2001
From: John Mulligan <jmulligan@redhat.com>
Date: Sat, 13 Jul 2024 11:10:13 -0400
Subject: [PATCH 2/8] commands: encapsulate retry logic

Move the limited-retries-on-error logic to a class that can be reused
later.

Signed-off-by: John Mulligan <jmulligan@redhat.com>
---
 sambacc/commands/ctdb.py | 58 +++++++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 15 deletions(-)

diff --git a/sambacc/commands/ctdb.py b/sambacc/commands/ctdb.py
index 962339a..4bf2aa1 100644
--- a/sambacc/commands/ctdb.py
+++ b/sambacc/commands/ctdb.py
@@ -17,6 +17,7 @@
 #
 
 import argparse
+import contextlib
 import logging
 import os
 import socket
@@ -311,28 +312,15 @@ def ctdb_manage_nodes(ctx: Context) -> None:
     expected_pnn = np.node_number or 0
     waiter = np.cluster_meta_waiter()
 
-    errors = 0
+    limiter = ErrorLimiter("ctdb_manage_nodes", 10, pause_func=waiter.wait)
     while True:
-        try:
+        with limiter.catch():
             ctdb.monitor_cluster_meta_updates(
                 cmeta=np.cluster_meta(),
                 pnn=expected_pnn,
                 real_path=np.persistent_path,
                 pause_func=waiter.wait,
             )
-            errors = 0
-        except KeyboardInterrupt:
-            raise
-        except Exception as err:
-            _logger.error(
-                f"error during manage_nodes: {err}, count={errors}",
-                exc_info=True,
-            )
-            errors += 1
-            if errors > 10:
-                _logger.error(f"too many retries ({errors}). giving up")
-                raise
-            waiter.wait()
 
 
 def _ctdb_must_have_node_args(parser: argparse.ArgumentParser) -> None:
@@ -411,3 +399,43 @@ def ctdb_rados_mutex(ctx: Context) -> None:
         cmd = cmd["-n", namespace]
     _logger.debug("executing command: %r", cmd)
     samba_cmds.execute(cmd)  # replaces process
+
+
+class ErrorLimiter:
+    def __init__(
+        self,
+        name: str,
+        limit: int,
+        *,
+        pause_func: typing.Optional[typing.Callable] = None,
+    ) -> None:
+        self.name = name
+        self.limit = limit
+        self.errors = 0
+        self.pause_func = pause_func
+
+    def post_catch(self):
+        if self.pause_func is not None:
+            self.pause_func()
+
+    @contextlib.contextmanager
+    def catch(self) -> typing.Iterator[None]:
+        try:
+            _logger.debug(
+                "error limiter proceeding: %s: errors=%r",
+                self.name,
+                self.errors,
+            )
+            yield
+        except KeyboardInterrupt:
+            raise
+        except Exception as err:
+            _logger.error(
+                f"error during {self.name}: {err}, count={self.errors}",
+                exc_info=True,
+            )
+            self.errors += 1
+            if self.errors > self.limit:
+                _logger.error(f"too many retries ({self.errors}). giving up")
+                raise
+            self.post_catch()

From e553414d2a57b5e946f82fb20d71f0d2cc381471 Mon Sep 17 00:00:00 2001
From: John Mulligan <jmulligan@redhat.com>
Date: Sat, 13 Jul 2024 11:21:20 -0400
Subject: [PATCH 3/8] sambacc: rename function to manage_cluster_meta_updates

Rename a function to manage_cluster_meta_updates to better match what it
does. A future change will want to use the word "monitor" for something
else and it would be confusing to continue calling this a "monitor"
function vs "manage -- and this function is the more active
"participant" of the two.

Signed-off-by: John Mulligan <jmulligan@redhat.com>
---
 sambacc/commands/ctdb.py | 2 +-
 sambacc/ctdb.py          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sambacc/commands/ctdb.py b/sambacc/commands/ctdb.py
index 4bf2aa1..428462e 100644
--- a/sambacc/commands/ctdb.py
+++ b/sambacc/commands/ctdb.py
@@ -315,7 +315,7 @@ def ctdb_manage_nodes(ctx: Context) -> None:
     limiter = ErrorLimiter("ctdb_manage_nodes", 10, pause_func=waiter.wait)
     while True:
         with limiter.catch():
-            ctdb.monitor_cluster_meta_updates(
+            ctdb.manage_cluster_meta_updates(
                 cmeta=np.cluster_meta(),
                 pnn=expected_pnn,
                 real_path=np.persistent_path,
diff --git a/sambacc/ctdb.py b/sambacc/ctdb.py
index bfb2a0a..66bfb88 100644
--- a/sambacc/ctdb.py
+++ b/sambacc/ctdb.py
@@ -338,7 +338,7 @@ def manage_nodes(
     pause_func: typing.Callable,
 ) -> None:
     """Monitor nodes json for updates, reflecting those changes into ctdb."""
-    monitor_cluster_meta_updates(
+    manage_cluster_meta_updates(
         ClusterMetaJSONFile(nodes_json),
         pnn,
         real_path,
@@ -346,7 +346,7 @@ def manage_nodes(
     )
 
 
-def monitor_cluster_meta_updates(
+def manage_cluster_meta_updates(
     cmeta: ClusterMeta,
     pnn: int,
     real_path: str,

From c1e995ccdea5017296cea21670269988a3c7c1e0 Mon Sep 17 00:00:00 2001
From: John Mulligan <jmulligan@redhat.com>
Date: Sat, 13 Jul 2024 11:22:05 -0400
Subject: [PATCH 4/8] commands: improve ctdb_manage_nodes doc comment

Make the doc comment on ctdb_manage_nodes more accurate.

Signed-off-by: John Mulligan <jmulligan@redhat.com>
---
 sambacc/commands/ctdb.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sambacc/commands/ctdb.py b/sambacc/commands/ctdb.py
index 428462e..bb2a81c 100644
--- a/sambacc/commands/ctdb.py
+++ b/sambacc/commands/ctdb.py
@@ -303,9 +303,9 @@ def ctdb_set_node(ctx: Context) -> None:
 
 @commands.command(name="ctdb-manage-nodes", arg_func=_ctdb_general_node_args)
 def ctdb_manage_nodes(ctx: Context) -> None:
-    """Run a long lived procees to monitor the node state file for new nodes.
-    When a new node is found, if the current node is in the correct state, this
-    node will add it to CTDB.
+    """Run a long lived process to manage the cluster metadata. It can add new
+    nodes. When a new node is found, if the current node is in the correct
+    state, this node will add it to CTDB.
     """
     _ctdb_ok()
     np = NodeParams(ctx)

From 38ca35e27737f14076bfcef313891309ea83f03d Mon Sep 17 00:00:00 2001
From: John Mulligan <jmulligan@redhat.com>
Date: Sat, 13 Jul 2024 13:55:05 -0400
Subject: [PATCH 5/8] sambacc: split out some potentially common ctdb logic
 into funcs

Split out some logic from the cluster_meta_to_nodes function into
separate functions for current & later reuse.

Signed-off-by: John Mulligan <jmulligan@redhat.com>
---
 sambacc/ctdb.py | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/sambacc/ctdb.py b/sambacc/ctdb.py
index 66bfb88..23bd191 100644
--- a/sambacc/ctdb.py
+++ b/sambacc/ctdb.py
@@ -464,10 +464,7 @@ def _node_update(cmeta: ClusterMeta, real_path: str) -> bool:
                 new_ctdb_nodes.append(expected_line)
             else:
                 new_ctdb_nodes[pnn] = expected_line
-        with open(real_path, "w") as nffh:
-            write_nodes_file(nffh, new_ctdb_nodes)
-            nffh.flush()
-            os.fsync(nffh)
+        _save_nodes(real_path, new_ctdb_nodes)
         _logger.info("running: ctdb reloadnodes")
         subprocess.check_call(list(samba_cmds.ctdb["reloadnodes"]))
         for entry in need_reload:
@@ -490,17 +487,26 @@ def cluster_meta_to_nodes(cmeta: ClusterMeta, real_path: str) -> None:
         json_data = cmo.load()
         nodes = json_data.get("nodes", [])
         _logger.info("Found node metadata: %r", nodes)
-        pnn_max = max(n["pnn"] for n in nodes) + 1  # pnn is zero indexed
-        ctdb_nodes: list[str] = [""] * pnn_max
-        for entry in nodes:
-            pnn = entry["pnn"]
-            expected_line = _entry_to_node(ctdb_nodes, entry)
-            ctdb_nodes[pnn] = expected_line
+        ctdb_nodes = _cluster_meta_to_ctdb_nodes(nodes)
         _logger.info("Will write nodes: %s", ctdb_nodes)
-        with open(real_path, "w") as nffh:
-            write_nodes_file(nffh, ctdb_nodes)
-            nffh.flush()
-            os.fsync(nffh)
+        _save_nodes(real_path, ctdb_nodes)
+
+
+def _cluster_meta_to_ctdb_nodes(nodes: list[dict]) -> list[str]:
+    pnn_max = max(n["pnn"] for n in nodes) + 1  # pnn is zero indexed
+    ctdb_nodes: list[str] = [""] * pnn_max
+    for entry in nodes:
+        pnn = entry["pnn"]
+        expected_line = _entry_to_node(ctdb_nodes, entry)
+        ctdb_nodes[pnn] = expected_line
+    return ctdb_nodes
+
+
+def _save_nodes(path: str, ctdb_nodes: list[str]) -> None:
+    with open(path, "w") as nffh:
+        write_nodes_file(nffh, ctdb_nodes)
+        nffh.flush()
+        os.fsync(nffh)
 
 
 def ensure_ctdbd_etc_files(

From 4b08b8e8204f5e12a9ec8d5b049f03818dbef710 Mon Sep 17 00:00:00 2001
From: John Mulligan <jmulligan@redhat.com>
Date: Tue, 16 Jul 2024 09:14:14 -0400
Subject: [PATCH 6/8] sambacc: simplify _cluster_meta_to_ctdb_nodes function

Signed-off-by: John Mulligan <jmulligan@redhat.com>
---
 sambacc/ctdb.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sambacc/ctdb.py b/sambacc/ctdb.py
index 23bd191..c51bffc 100644
--- a/sambacc/ctdb.py
+++ b/sambacc/ctdb.py
@@ -497,8 +497,8 @@ def _cluster_meta_to_ctdb_nodes(nodes: list[dict]) -> list[str]:
     ctdb_nodes: list[str] = [""] * pnn_max
     for entry in nodes:
         pnn = entry["pnn"]
-        expected_line = _entry_to_node(ctdb_nodes, entry)
-        ctdb_nodes[pnn] = expected_line
+        # overwrite the pnn indexed entry with expected value
+        ctdb_nodes[pnn] = _entry_to_node(ctdb_nodes, entry)
     return ctdb_nodes
 
 

From 6f19de6a60b440085937d703c0a90899c3b5e273 Mon Sep 17 00:00:00 2001
From: John Mulligan <jmulligan@redhat.com>
Date: Mon, 15 Jul 2024 13:41:47 -0400
Subject: [PATCH 7/8] sambacc: add monitor_cluster_meta_changes function

Add the monitor_cluster_meta_changes to the ctdb module. This function
monitors the cluster meta and can reflect those changes into the ctdb
nodes file and/or call reload nodes. Unlike the previous
manage_cluster_meta_updates the sambacc process is not an active
participant in the management of the cluster meta rather we assume some
outside entity is updating the metadata with the desired state of the
cluster correctly. monitor_cluster_meta_changes exists to translate
that state into something ctdb can use.

Signed-off-by: John Mulligan <jmulligan@redhat.com>
---
 sambacc/ctdb.py | 82 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/sambacc/ctdb.py b/sambacc/ctdb.py
index c51bffc..095033a 100644
--- a/sambacc/ctdb.py
+++ b/sambacc/ctdb.py
@@ -23,6 +23,7 @@
 import typing
 
 from sambacc import config
+from sambacc import leader
 from sambacc import samba_cmds
 from sambacc.jfile import ClusterMetaJSONFile
 from sambacc.netcmd_loader import template_config
@@ -509,6 +510,87 @@ def _save_nodes(path: str, ctdb_nodes: list[str]) -> None:
         os.fsync(nffh)
 
 
+def monitor_cluster_meta_changes(
+    cmeta: ClusterMeta,
+    pause_func: typing.Callable,
+    *,
+    nodes_file_path: typing.Optional[str] = None,
+    reload_all: bool = False,
+    leader_locator: typing.Optional[leader.LeaderLocator] = None,
+) -> None:
+    """Monitor cluster meta for changes, reflecting those changes into ctdb.
+
+    Unlike manage_cluster_meta_updates this function never changes the
+    contents of the nodes list in the cluster meta and takes those values
+    as a given, assuming some external agent has the correct global view of
+    the cluster and is updating it correctly. This function exists to
+    translate that content into something ctdb can understand.
+    """
+    prev_meta: dict[str, typing.Any] = {}
+    if nodes_file_path:
+        prev_nodes = read_ctdb_nodes(nodes_file_path)
+    else:
+        with cmeta.open(locked=True) as cmo:
+            meta1 = cmo.load()
+        prev_nodes = _cluster_meta_to_ctdb_nodes(meta1.get("nodes", []))
+    _logger.debug("initial cluster meta content: %r", prev_meta)
+    _logger.debug("initial nodes content: %r", prev_nodes)
+    while True:
+        pause_func()
+        with cmeta.open(locked=True) as cmo:
+            curr_meta = cmo.load()
+        if curr_meta == prev_meta:
+            _logger.debug("cluster meta content unchanged: %r", curr_meta)
+            continue
+        _logger.info("cluster meta content changed")
+        _logger.debug(
+            "cluster meta: previous=%r current=%r", prev_meta, curr_meta
+        )
+        prev_meta = curr_meta
+
+        # maybe some other metadata changed?
+        expected_nodes = _cluster_meta_to_ctdb_nodes(
+            curr_meta.get("nodes", [])
+        )
+        if prev_nodes == expected_nodes:
+            _logger.debug("ctdb nodes list unchanged: %r", expected_nodes)
+            continue
+        _logger.info("ctdb nodes list changed")
+        _logger.debug(
+            "nodes list: previous=%r current=%r", prev_nodes, expected_nodes
+        )
+        prev_nodes = expected_nodes
+
+        if nodes_file_path:
+            _logger.info("updating nodes file: %s", nodes_file_path)
+            _save_nodes(nodes_file_path, expected_nodes)
+        _maybe_reload_nodes(leader_locator, reload_all=reload_all)
+
+
+def _maybe_reload_nodes(
+    leader_locator: typing.Optional[leader.LeaderLocator] = None,
+    reload_all: bool = False,
+) -> None:
+    """Issue a reloadnodes command if leader_locator is available and
+    node is leader or reload_all is true.
+    """
+    if reload_all:
+        _logger.info("running: ctdb reloadnodes")
+        subprocess.check_call(list(samba_cmds.ctdb["reloadnodes"]))
+        return
+    if leader_locator is None:
+        _logger.warning("no leader locator: not calling reloadnodes")
+        return
+    # use the leader locator to only issue the reloadnodes command once
+    # for a change instead of all the nodes "spamming" the cluster
+    with leader_locator as ll:
+        if ll.is_leader():
+            _logger.info("running: ctdb reloadnodes")
+            subprocess.check_call(list(samba_cmds.ctdb["reloadnodes"]))
+        else:
+            _logger.info("node is not leader. skipping reloadnodes")
+
+
 def ensure_ctdbd_etc_files(
     etc_path: str = ETC_DIR, src_path: str = SHARE_DIR
 ) -> None:

From c0154c0a365befe6f809794158c4eaed19f9574e Mon Sep 17 00:00:00 2001
From: John Mulligan <jmulligan@redhat.com>
Date: Mon, 15 Jul 2024 13:41:47 -0400
Subject: [PATCH 8/8] commands: add ctdb-monitor-nodes command

Add a `ctdb-monitor-nodes` command - this command will monitor the
cluster meta and update the containers/ctdb when/if it changes.
The `--write-nodes` parameter is used to instruct the command to
write the ctdb nodes file on changes. The `--reload` option is used to
control what "nodes" issue `ctdb reloadnodes` commands: leader (only the
cluster leader), all (all nodes that see the cluster meta change), and never
(never issue reloadnodes commands).

Signed-off-by: John Mulligan <jmulligan@redhat.com>
---
 sambacc/commands/ctdb.py | 44 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/sambacc/commands/ctdb.py b/sambacc/commands/ctdb.py
index bb2a81c..abbfe78 100644
--- a/sambacc/commands/ctdb.py
+++ b/sambacc/commands/ctdb.py
@@ -29,7 +29,7 @@
 from sambacc import samba_cmds
 from sambacc.simple_waiter import Sleeper, Waiter
 
-from .cli import best_waiter, commands, Context, Fail
+from .cli import best_leader_locator, best_waiter, commands, Context, Fail
 
 _logger = logging.getLogger(__name__)
 
@@ -323,6 +323,48 @@ def ctdb_manage_nodes(ctx: Context) -> None:
             )
 
 
+def _ctdb_monitor_nodes_args(parser: argparse.ArgumentParser) -> None:
+    _ctdb_must_have_node_args(parser)
+    parser.add_argument(
+        "--reload",
+        choices=("leader", "never", "all"),
+        default="leader",
+        help="Specify which nodes can command CTDB to reload nodes",
+    )
+
+
+@commands.command(name="ctdb-monitor-nodes", arg_func=_ctdb_monitor_nodes_args)
+def ctdb_monitor_nodes(ctx: Context) -> None:
+    """Run a long lived process to monitor the cluster metadata.
+    Unlike ctdb_manage_nodes this function assumes that the node state
+    file is externally managed and primarily exists to reflect any changes
+    to the cluster meta into CTDB.
+    """
+    _ctdb_ok()
+    np = NodeParams(ctx)
+    waiter = np.cluster_meta_waiter()
+    leader_locator = None
+    if ctx.cli.reload == "leader":
+        leader_locator = best_leader_locator(ctx.instance_config)
+    reload_all = ctx.cli.reload == "all"
+    nodes_file_path = np.persistent_path if ctx.cli.write_nodes else None
+
+    _logger.info("monitoring cluster meta changes")
+    _logger.debug(
+        "reload_all=%s leader_locator=%r", reload_all, leader_locator
+    )
+    limiter = ErrorLimiter("ctdb_monitor_nodes", 10, pause_func=waiter.wait)
+    while True:
+        with limiter.catch():
+            ctdb.monitor_cluster_meta_changes(
+                cmeta=np.cluster_meta(),
+                pause_func=waiter.wait,
+                nodes_file_path=nodes_file_path,
+                leader_locator=leader_locator,
+                reload_all=reload_all,
+            )
+
+
 def _ctdb_must_have_node_args(parser: argparse.ArgumentParser) -> None:
     _ctdb_general_node_args(parser)
     parser.add_argument(