
[crmsh-4.6] Setup bootstrap stages dependency #1401

Merged
151 changes: 108 additions & 43 deletions crmsh/bootstrap.py
@@ -47,6 +47,7 @@
from .sh import ShellUtils
from .ui_node import NodeMgmt
from .user_of_host import UserOfHost, UserNotFoundError
import crmsh.healthcheck

logger = log.setup_logger(__name__)
logger_utils = log.LoggerUtils(logger)
@@ -74,7 +75,11 @@
"/etc/drbd.conf", "/etc/drbd.d", "/etc/ha.d/ldirectord.cf", "/etc/lvm/lvm.conf", "/etc/multipath.conf",
"/etc/samba/smb.conf", SYSCONFIG_NFS, SYSCONFIG_PCMK, SYSCONFIG_SBD, PCMK_REMOTE_AUTH, WATCHDOG_CFG,
PROFILES_FILE, CRM_CFG, SBD_SYSTEMD_DELAY_START_DIR)
INIT_STAGES = ("ssh", "csync2", "csync2_remote", "qnetd_remote", "corosync", "remote_auth", "sbd", "cluster", "ocfs2", "admin", "qdevice")

INIT_STAGES_EXTERNAL = ("ssh", "csync2", "corosync", "sbd", "cluster", "ocfs2", "admin", "qdevice")
INIT_STAGES_INTERNAL = ("csync2_remote", "qnetd_remote", "remote_auth")
INIT_STAGES_ALL = INIT_STAGES_EXTERNAL + INIT_STAGES_INTERNAL
JOIN_STAGES_EXTERNAL = ("ssh", "csync2", "ssh_merge", "cluster")
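For orientation, a minimal sanity sketch of what the new tuples imply (illustrative only; names match the constants just added): the external tuples list the stages a user may request directly, the internal ones are reserved for crmsh's own node-to-node invocations, and user-facing errors advertise the external list alone.

# Illustrative relationship between the new stage tuples (not part of the module).
assert set(INIT_STAGES_EXTERNAL).isdisjoint(INIT_STAGES_INTERNAL)
assert set(INIT_STAGES_ALL) == set(INIT_STAGES_EXTERNAL) | set(INIT_STAGES_INTERNAL)
# Error messages only list the external stages, e.g.
# "Invalid stage: foo(available stages: ssh, csync2, corosync, sbd, cluster, ocfs2, admin, qdevice)"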


class Context(object):
@@ -232,15 +237,40 @@
"""
Validate cluster_node on join side
"""
if self.cluster_node and self.type == 'join':
if self.type == "join" and self.cluster_node:
user, node = _parse_user_at_host(self.cluster_node, None)
try:
# self.cluster_node might be hostname or IP address
ip_addr = socket.gethostbyname(node)
if utils.InterfacesInfo.ip_in_local(ip_addr):
utils.fatal("Please specify peer node's hostname or IP address")
except socket.gaierror as err:
utils.fatal("\"{}\": {}".format(node, err))
utils.fatal(f"\"{node}\": {err}")

[Codecov / codecov/patch] Added line crmsh/bootstrap.py#L248 was not covered by tests.

def _validate_stage(self):
"""
Validate stage argument
"""
if not self.stage:
if self.cluster_is_running:
utils.fatal("Cluster is already running!")

[Codecov / codecov/patch] Added line crmsh/bootstrap.py#L256 was not covered by tests.
return

if self.type == "init":
if self.stage not in INIT_STAGES_ALL:
utils.fatal(f"Invalid stage: {self.stage}(available stages: {', '.join(INIT_STAGES_EXTERNAL)})")
if self.stage in ("admin", "qdevice", "ocfs2") and not self.cluster_is_running:
utils.fatal(f"Cluster is inactive, can't run '{self.stage}' stage")
if self.stage in ("corosync", "cluster") and self.cluster_is_running:
utils.fatal(f"Cluster is active, can't run '{self.stage}' stage")

elif self.type == "join":
if self.stage not in JOIN_STAGES_EXTERNAL:
utils.fatal(f"Invalid stage: {self.stage}(available stages: {', '.join(JOIN_STAGES_EXTERNAL)})")
if self.stage and self.cluster_node is None:
utils.fatal(f"Can't use stage({self.stage}) without specifying cluster node")
if self.stage in ("cluster", ) and self.cluster_is_running:
utils.fatal(f"Cluster is active, can't run '{self.stage}' stage")

[Codecov / codecov/patch] Added line crmsh/bootstrap.py#L273 was not covered by tests.

def validate_option(self):
"""
@@ -263,6 +293,7 @@
self.skip_csync2 = utils.get_boolean(os.getenv("SKIP_CSYNC2_SYNC"))
if self.skip_csync2 and self.stage:
utils.fatal("-x option or SKIP_CSYNC2_SYNC can't be used with any stage")
self._validate_stage()
self._validate_cluster_node()
self._validate_nodes_option()
self._validate_sbd_option()
@@ -553,7 +584,7 @@
return False


def check_prereqs(stage):
def check_prereqs():
warned = False

if not my_hostname_resolves():
@@ -1710,6 +1741,9 @@
change_user_shell('hacluster')
swap_public_ssh_key_for_secondary_user(sh.cluster_shell(), seed_host, 'hacluster')

if _context.stage:
setup_passwordless_with_other_nodes(seed_host, seed_user)


def join_ssh_with_ssh_agent(
local_shell: sh.LocalShell,
@@ -2367,46 +2401,87 @@
corosync.set_value("quorum.expected_votes", str(new_quorum))


def ssh_stage_finished():
"""
Detect if the ssh stage is finished
"""
feature_check = crmsh.healthcheck.PasswordlessHaclusterAuthenticationFeature()
return feature_check.check_quick() and feature_check.check_local([utils.this_node()])


def csync2_stage_finished():
"""
Detect if the csync2 stage is finished
"""
return ServiceManager().service_is_active(CSYNC2_SERVICE)


def corosync_stage_finished():
"""
Detect if the corosync stage is finished
"""
return os.path.exists(corosync.conf())


INIT_STAGE_CHECKER = {
# stage: (function, is_internal)
"ssh": (ssh_stage_finished, False),
"csync2": (csync2_stage_finished, False),
"corosync": (corosync_stage_finished, False),
"remote_auth": (init_remote_auth, True),
"sbd": (lambda: True, False),
"upgradeutil": (init_upgradeutil, True),
"cluster": (is_online, False)
}


JOIN_STAGE_CHECKER = {
# stage: (function, is_internal)
"ssh": (ssh_stage_finished, False),
"csync2": (csync2_stage_finished, False),
"ssh_merge": (lambda: True, False),
"cluster": (is_online, False)
}


def check_stage_dependency(stage):
stage_checker = INIT_STAGE_CHECKER if _context.type == "init" else JOIN_STAGE_CHECKER
if stage not in stage_checker:
return
stage_order = list(stage_checker.keys())
for stage_name in stage_order:
if stage == stage_name:
break
func, is_internal = stage_checker[stage_name]
if is_internal:
func()
elif not func():
utils.fatal(f"Please run '{stage_name}' stage first")


def bootstrap_init(context):
"""
Init cluster process
"""
global _context
_context = context
stage = _context.stage

init()

stage = _context.stage
if stage is None:
stage = ""

# vgfs stage requires running cluster, everything else requires inactive cluster,
# except ssh and csync2 (which don't care) and csync2_remote (which mustn't care,
# just in case this breaks ha-cluster-join on another node).
if stage in ("vgfs", "admin", "qdevice", "ocfs2"):
if not _context.cluster_is_running:
utils.fatal("Cluster is inactive - can't run %s stage" % (stage))
elif stage == "":
if _context.cluster_is_running:
utils.fatal("Cluster is currently active - can't run")
elif stage not in ("ssh", "csync2", "csync2_remote", "qnetd_remote", "sbd", "ocfs2"):
if _context.cluster_is_running:
utils.fatal("Cluster is currently active - can't run %s stage" % (stage))

_context.load_profiles()
_context.init_sbd_manager()

# Need hostname resolution to work, want NTP (but don't block csync2_remote)
if stage not in ('csync2_remote', 'qnetd_remote'):
check_tty()
if not check_prereqs(stage):
return
else:
if stage in ('csync2_remote', 'qnetd_remote'):
args = _context.args
logger_utils.log_only_to_file("args: {}".format(args))
logger_utils.log_only_to_file(f"args: {args}")
if len(args) != 2:
utils.fatal(f"Expected NODE argument to {stage} stage")
utils.fatal(f"Expected NODE argument for '{stage}' stage")

[Codecov / codecov/patch] Added line crmsh/bootstrap.py#L2479 was not covered by tests.
_context.cluster_node = args[1]
else:
check_tty()
if not check_prereqs():
return

[Codecov / codecov/patch] Added line crmsh/bootstrap.py#L2484 was not covered by tests.

if stage and _context.cluster_is_running and \
not ServiceManager(shell=sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).service_is_active(CSYNC2_SERVICE):
@@ -2416,6 +2491,7 @@
_context.node_list_in_cluster = [utils.this_node()]

if stage != "":
check_stage_dependency(stage)
globals()["init_" + stage]()
else:
init_ssh()
@@ -2492,15 +2568,13 @@

check_tty()

corosync_active = ServiceManager(sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).service_is_active("corosync.service")
if corosync_active and _context.stage != "ssh":
utils.fatal("Abort: Cluster is currently active. Run this command on a node joining the cluster.")

if not check_prereqs("join"):
if not check_prereqs():
return

if _context.stage != "":
remote_user, cluster_node = _parse_user_at_host(_context.cluster_node, _context.current_user)
init_upgradeutil()
check_stage_dependency(_context.stage)
globals()["join_" + _context.stage](cluster_node, remote_user)
else:
if not _context.yes_to_all and _context.cluster_node is None:
@@ -2527,7 +2601,6 @@
service_manager = ServiceManager()
_context.node_list_in_cluster = utils.fetch_cluster_node_list_from_node(cluster_node)
setup_passwordless_with_other_nodes(cluster_node, remote_user)
join_remote_auth(cluster_node, remote_user)
_context.skip_csync2 = not service_manager.service_is_active(CSYNC2_SERVICE, cluster_node)
if _context.skip_csync2:
service_manager.stop_service(CSYNC2_SERVICE, disable=True)
@@ -2557,14 +2630,6 @@
ocfs2_inst.join_ocfs2(peer_host)


def join_remote_auth(node, user):
if os.path.exists(PCMK_REMOTE_AUTH):
utils.rmfile(PCMK_REMOTE_AUTH)
pcmk_remote_dir = os.path.dirname(PCMK_REMOTE_AUTH)
utils.mkdirs_owned(pcmk_remote_dir, mode=0o750, gid="haclient")
utils.touch(PCMK_REMOTE_AUTH)


def remove_qdevice():
"""
Remove qdevice service and configuration from cluster
11 changes: 1 addition & 10 deletions crmsh/ui_cluster.py
@@ -331,12 +331,6 @@ def do_init(self, context, *args):
'''
Initialize a cluster.
'''
def looks_like_hostnames(lst):
sectionlist = bootstrap.INIT_STAGES
return all(not (l.startswith('-') or l in sectionlist) for l in lst)
if len(args) > 0:
if '--dry-run' in args or looks_like_hostnames(args):
args = ['--yes', '--nodes'] + [arg for arg in args if arg != '--dry-run']
parser = ArgumentParser(description="""
Initialize a cluster from scratch. This command configures
a complete cluster, and can also add additional cluster
@@ -471,8 +465,6 @@ def looks_like_hostnames(lst):
if stage == "vgfs":
stage = "ocfs2"
logger.warning("vgfs stage was deprecated and is an alias of ocfs2 stage now")
if stage not in bootstrap.INIT_STAGES and stage != "":
parser.error("Invalid stage (%s)" % (stage))

if options.qnetd_addr_input:
if not ServiceManager().service_is_available("corosync-qdevice.service"):
@@ -547,12 +539,11 @@ def do_join(self, context, *args):
stage = ""
if len(args) == 1:
stage = args[0]
if stage not in ("ssh", "csync2", "ssh_merge", "cluster", ""):
parser.error("Invalid stage (%s)" % (stage))

join_context = bootstrap.Context.set_context(options)
join_context.ui_context = context
join_context.stage = stage
join_context.cluster_is_running = ServiceManager(sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).service_is_active("pacemaker.service")
join_context.type = "join"
join_context.validate_option()

8 changes: 0 additions & 8 deletions crmsh/utils.py
@@ -468,14 +468,6 @@ def chmod(path, mod):
fatal("Failed to chmod {}: {}".format(path, err))


def touch(file_name):
rc, out, err = ShellUtils().get_stdout_stderr("touch " + file_name, no_reg=True)
if rc != 0:
rc, out, err = ShellUtils().get_stdout_stderr("sudo touch " + file_name, no_reg=True)
if rc != 0:
fatal("Failed create file {}: {}".format(file_name, err))


def copy_local_file(src, dest):
try:
shutil.copyfile(src, dest)
25 changes: 25 additions & 0 deletions test/features/bootstrap_bugs.feature
@@ -4,6 +4,31 @@ Feature: Regression test for bootstrap bugs
Tag @clean means need to stop cluster service if the service is available
Need nodes: hanode1 hanode2 hanode3

@clean
Scenario: Stages dependency (bsc#1175865)
Given Cluster service is "stopped" on "hanode1"
And Cluster service is "stopped" on "hanode2"
When Try "crm cluster init cluster -y" on "hanode1"
Then Except "ERROR: cluster.init: Please run 'ssh' stage first"
When Run "crm cluster init ssh -y" on "hanode1"
When Try "crm cluster init cluster -y" on "hanode1"
Then Except "ERROR: cluster.init: Please run 'csync2' stage first"
When Run "crm cluster init csync2 -y" on "hanode1"
When Try "crm cluster init cluster -y" on "hanode1"
Then Except "ERROR: cluster.init: Please run 'corosync' stage first"
When Run "crm cluster init corosync -y" on "hanode1"
When Run "crm cluster init cluster -y" on "hanode1"
Then Cluster service is "started" on "hanode1"

When Try "crm cluster join cluster -c hanode1 -y" on "hanode2"
Then Except "ERROR: cluster.join: Please run 'ssh' stage first"
When Try "crm cluster join ssh -c hanode1 -y" on "hanode2"
When Try "crm cluster join cluster -c hanode1 -y" on "hanode2"
Then Except "ERROR: cluster.join: Please run 'csync2' stage first"
When Try "crm cluster join csync2 -c hanode1 -y" on "hanode2"
When Try "crm cluster join cluster -c hanode1 -y" on "hanode2"
Then Cluster service is "started" on "hanode2"

@clean
Scenario: Set placement-strategy value as "default"(bsc#1129462)
Given Cluster service is "stopped" on "hanode1"
15 changes: 15 additions & 0 deletions test/features/bootstrap_options.feature
@@ -41,6 +41,15 @@ Feature: crmsh bootstrap process - options
When Try "crm cluster init sbd -N hanode1 -N hanode2 -y" on "hanode1"
Then Expected "Can't use -N/--nodes option and stage(sbd) together" in stderr

@clean
Scenario: Stage validation
When Try "crm cluster init fdsf -y" on "hanode1"
Then Expected "Invalid stage: fdsf(available stages: ssh, csync2, corosync, sbd, cluster, ocfs2, admin, qdevice)" in stderr
When Try "crm cluster join fdsf -y" on "hanode1"
Then Expected "Invalid stage: fdsf(available stages: ssh, csync2, ssh_merge, cluster)" in stderr
When Try "crm cluster join ssh -y" on "hanode1"
Then Expected "Can't use stage(ssh) without specifying cluster node" in stderr

@clean
Scenario: Init whole cluster service on node "hanode1" using "--node" option
Given Cluster service is "stopped" on "hanode1"
@@ -51,6 +60,9 @@
And Online nodes are "hanode1 hanode2"
And Show cluster status on "hanode1"

When Try "crm cluster init cluster -y" on "hanode1"
Then Expected "Cluster is active, can't run 'cluster' stage" in stderr

@clean
Scenario: Bind specific network interface using "-i" option
Given Cluster service is "stopped" on "hanode1"
@@ -96,6 +108,9 @@
And Cluster virtual IP is "@vip.0"
And Show cluster status on "hanode1"

When Try "crm cluster init cluster -y" on "hanode1"
Then Expected "Cluster is active, can't run 'cluster' stage" in stderr

@clean
Scenario: Init cluster service with udpu using "-u" option
Given Cluster service is "stopped" on "hanode1"
2 changes: 1 addition & 1 deletion test/features/qdevice_validate.feature
@@ -110,7 +110,7 @@ Feature: corosync qdevice/qnetd options validate
Scenario: Run qdevice stage on inactive cluster node
Given Cluster service is "stopped" on "hanode1"
When Try "crm cluster init qdevice --qnetd-hostname=qnetd-node"
Then Except "ERROR: cluster.init: Cluster is inactive - can't run qdevice stage"
Then Except "ERROR: cluster.init: Cluster is inactive, can't run 'qdevice' stage"

@clean
Scenario: Run qdevice stage but miss "--qnetd-hostname" option
2 changes: 1 addition & 1 deletion test/unittests/test_bootstrap.py
@@ -564,7 +564,7 @@ def test_join_ssh(
mock_get_node_cononical_hostname,
mock_detect_cluster_service_on_node
):
bootstrap._context = mock.Mock(current_user="bob", default_nic="eth1", use_ssh_agent=False)
bootstrap._context = mock.Mock(current_user="bob", default_nic="eth1", use_ssh_agent=False, stage=None)
mock_swap.return_value = None
mock_ssh_copy_id.return_value = 0
mock_get_node_cononical_hostname.return_value='node1'