From bab0537ba3d175f301e72d6fd5acfeb212b73c8c Mon Sep 17 00:00:00 2001 From: xin liang Date: Mon, 20 Oct 2025 19:58:33 +0800 Subject: [PATCH 1/3] Dev: utils: Adjust node_reachable_check function and the way it is used - Drop ping-based check and only use SSH to determine node reachability - When SSH check fails, raise NoSSHError when config.core.no_ssh is set to yes - Otherwise, raise ValueError as before --- crmsh/qdevice.py | 3 --- crmsh/ui_cluster.py | 26 ++++++++++++++------------ crmsh/utils.py | 18 ++++++++---------- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/crmsh/qdevice.py b/crmsh/qdevice.py index 9d04969d7..8dc6474db 100644 --- a/crmsh/qdevice.py +++ b/crmsh/qdevice.py @@ -213,9 +213,6 @@ def check_qnetd_addr(qnetd_addr): if utils.InterfacesInfo.ip_in_local(qnetd_ip): raise ValueError("host for qnetd must be a remote one") - if not utils.check_port_open(qnetd_ip, 22): - raise ValueError("ssh service on \"{}\" not available".format(qnetd_addr)) - @staticmethod def check_qdevice_port(qdevice_port): if not utils.valid_port(qdevice_port): diff --git a/crmsh/ui_cluster.py b/crmsh/ui_cluster.py index f9230646d..1e93db75b 100644 --- a/crmsh/ui_cluster.py +++ b/crmsh/ui_cluster.py @@ -167,7 +167,13 @@ def do_start(self, context, *args): ''' Starts the cluster stack on all nodes or specific node(s) ''' - node_list = parse_option_for_nodes(context, *args) + try: + node_list = parse_option_for_nodes(context, *args) + except utils.NoSSHError as msg: + logger.error('%s', msg) + logger.info("Please try 'crm cluster start' on each node") + return False + service_check_list = ["pacemaker.service"] start_qdevice = False if corosync.is_qdevice_configured(): @@ -175,15 +181,10 @@ def do_start(self, context, *args): service_check_list.append("corosync-qdevice.service") service_manager = ServiceManager() - try: - for node in node_list[:]: - if all([service_manager.service_is_active(srv, remote_addr=node) for srv in service_check_list]): - logger.info("The cluster stack already started on {}".format(node)) - node_list.remove(node) - except utils.NoSSHError as msg: - logger.error('%s', msg) - logger.info("Please try 'crm cluster start' on each node") - return False + for node in node_list[:]: + if all([service_manager.service_is_active(srv, remote_addr=node) for srv in service_check_list]): + logger.info("The cluster stack already started on {}".format(node)) + node_list.remove(node) if not node_list: return @@ -248,13 +249,14 @@ def do_stop(self, context, *args): ''' Stops the cluster stack on all nodes or specific node(s) ''' - node_list = parse_option_for_nodes(context, *args) try: - node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)] + node_list = parse_option_for_nodes(context, *args) except utils.NoSSHError as msg: logger.error('%s', msg) logger.info("Please try 'crm cluster stop' on each node") return False + + node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)] if not node_list: return logger.debug(f"stop node list: {node_list}") diff --git a/crmsh/utils.py b/crmsh/utils.py index f9e209bd2..eabf7462b 100644 --- a/crmsh/utils.py +++ b/crmsh/utils.py @@ -2427,25 +2427,23 @@ def package_is_installed(pkg, remote_addr=None): return rc == 0 -def node_reachable_check(node, ping_count=1, port=22, timeout=3): +def node_reachable_check(node): """ - Check if node is reachable by using ping and socket to ssh port + Check if node is reachable by checking SSH port is open """ - rc, _, _ = ShellUtils().get_stdout_stderr(f"ping -n -c {ping_count} -W {timeout} {node}") - if rc == 0: - return True - # ping failed, try to connect to ssh port by socket - if check_port_open(node, port, timeout): + if node == this_node() or check_port_open(node, 22): return True - # both ping and socket failed - raise ValueError(f"host \"{node}\" is unreachable") + if config.core.no_ssh: + raise NoSSHError(constants.NO_SSH_ERROR_MSG) + else: + raise ValueError(f"host \"{node}\" is unreachable via SSH") def get_reachable_node_list(node_list:list[str]) -> list[str]: reachable_node_list = [] for node in node_list: try: - if node == this_node() or node_reachable_check(node): + if node_reachable_check(node): reachable_node_list.append(node) except ValueError as e: logger.warning(str(e)) From ed5d2d71634fb41c59afc90ea90843bdf9d3aa6b Mon Sep 17 00:00:00 2001 From: xin liang Date: Sun, 19 Oct 2025 11:39:10 +0800 Subject: [PATCH 2/3] Fix: utils: Raise UnreachableNodeError for those ssh unreachable nodes (bsc#1250645) --- crmsh/utils.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/crmsh/utils.py b/crmsh/utils.py index eabf7462b..7fa7c8371 100644 --- a/crmsh/utils.py +++ b/crmsh/utils.py @@ -2471,6 +2471,12 @@ def __init__(self, msg: str, dead_nodes=None): self.dead_nodes = dead_nodes or [] +class UnreachableNodeError(ValueError): + def __init__(self, msg: str, unreachable_nodes=None): + super().__init__(msg) + self.unreachable_nodes = unreachable_nodes or [] + + def check_all_nodes_reachable(action_to_do: str, peer_node: str = None): """ Check if all cluster nodes are reachable @@ -2492,8 +2498,17 @@ def check_all_nodes_reachable(action_to_do: str, peer_node: str = None): """ raise DeadNodeError(msg, dead_nodes) + unreachable_nodes = [] for node in online_nodes: - node_reachable_check(node) + try: + node_reachable_check(node) + except ValueError: + unreachable_nodes.append(node) + if unreachable_nodes: + msg = f"""There are unreachable nodes: {', '.join(unreachable_nodes)}. +Please check the network connectivity before {action_to_do}. + """ + raise UnreachableNodeError(msg, unreachable_nodes) def re_split_string(reg, string): From 7aaa85e2e1e408872ca322749a1402403bb7bbda Mon Sep 17 00:00:00 2001 From: xin liang Date: Mon, 20 Oct 2025 11:04:01 +0800 Subject: [PATCH 3/3] Dev: behave: Adjust functional test for previous commit --- test/features/cluster_blocking_ssh.feature | 2 -- test/features/qdevice_validate.feature | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/test/features/cluster_blocking_ssh.feature b/test/features/cluster_blocking_ssh.feature index c2385e34b..b4717f146 100644 --- a/test/features/cluster_blocking_ssh.feature +++ b/test/features/cluster_blocking_ssh.feature @@ -60,8 +60,6 @@ Feature: cluster testing with ssh blocked And Run "firewall-cmd --zone=public --add-rich-rule='rule port port=22 protocol=tcp drop' --permanent && firewall-cmd --reload" on "hanode2" And Try "ssh -o ConnectTimeout=5 hanode2" on "hanode1" Then Except "ssh: connect to host hanode2 port 22: Connection timed out" in stderr - When Run "timeout 5s crm report || echo "timeout"" on "hanode1" - Then Expected "timeout" in stdout When Write multi lines to file "/etc/crm/crm.conf" on "hanode1" """ [core] diff --git a/test/features/qdevice_validate.feature b/test/features/qdevice_validate.feature index 1d38ea728..729b40164 100644 --- a/test/features/qdevice_validate.feature +++ b/test/features/qdevice_validate.feature @@ -23,7 +23,7 @@ Feature: corosync qdevice/qnetd options validate Scenario: Service ssh on qnetd node not available When Run "systemctl stop sshd.service" on "node-without-ssh" When Try "crm cluster init --qnetd-hostname=node-without-ssh" - Then Except "ERROR: cluster.init: ssh service on "node-without-ssh" not available" + Then Except "ERROR: cluster.init: host "node-without-ssh" is unreachable via SSH" @clean Scenario: Option "--qdevice-port" set wrong port