From bab0537ba3d175f301e72d6fd5acfeb212b73c8c Mon Sep 17 00:00:00 2001
From: xin liang <xliang@suse.com>
Date: Mon, 20 Oct 2025 19:58:33 +0800
Subject: [PATCH 1/3] Dev: utils: Adjust node_reachable_check function and the
 way it is used

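- Drop the ping-based check: a node is now considered reachable only if it
  is the local node or its SSH port (22) is open
- If the check fails and config.core.no_ssh is set to yes, raise NoSSHError
- Otherwise, raise ValueError as before
- Remove the separate SSH port check from check_qnetd_addr in qdevice.py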
---
 crmsh/qdevice.py    |  3 ---
 crmsh/ui_cluster.py | 26 ++++++++++++++------------
 crmsh/utils.py      | 18 ++++++++----------
 3 files changed, 22 insertions(+), 25 deletions(-)

diff --git a/crmsh/qdevice.py b/crmsh/qdevice.py
index 9d04969d7..8dc6474db 100644
--- a/crmsh/qdevice.py
+++ b/crmsh/qdevice.py
@@ -213,9 +213,6 @@ def check_qnetd_addr(qnetd_addr):
         if utils.InterfacesInfo.ip_in_local(qnetd_ip):
             raise ValueError("host for qnetd must be a remote one")
 
-        if not utils.check_port_open(qnetd_ip, 22):
-            raise ValueError("ssh service on \"{}\" not available".format(qnetd_addr))
-
     @staticmethod
     def check_qdevice_port(qdevice_port):
         if not utils.valid_port(qdevice_port):
diff --git a/crmsh/ui_cluster.py b/crmsh/ui_cluster.py
index f9230646d..1e93db75b 100644
--- a/crmsh/ui_cluster.py
+++ b/crmsh/ui_cluster.py
@@ -167,7 +167,13 @@ def do_start(self, context, *args):
         '''
         Starts the cluster stack on all nodes or specific node(s)
         '''
-        node_list = parse_option_for_nodes(context, *args)
+        try:
+            node_list = parse_option_for_nodes(context, *args)
+        except utils.NoSSHError as msg:
+            logger.error('%s', msg)
+            logger.info("Please try 'crm cluster start' on each node")
+            return False
+
         service_check_list = ["pacemaker.service"]
         start_qdevice = False
         if corosync.is_qdevice_configured():
@@ -175,15 +181,10 @@ def do_start(self, context, *args):
             service_check_list.append("corosync-qdevice.service")
 
         service_manager = ServiceManager()
-        try:
-            for node in node_list[:]:
-                if all([service_manager.service_is_active(srv, remote_addr=node) for srv in service_check_list]):
-                    logger.info("The cluster stack already started on {}".format(node))
-                    node_list.remove(node)
-        except utils.NoSSHError as msg:
-            logger.error('%s', msg)
-            logger.info("Please try 'crm cluster start' on each node")
-            return False
+        for node in node_list[:]:
+            if all([service_manager.service_is_active(srv, remote_addr=node) for srv in service_check_list]):
+                logger.info("The cluster stack already started on {}".format(node))
+                node_list.remove(node)
         if not node_list:
             return
 
@@ -248,13 +249,14 @@ def do_stop(self, context, *args):
         '''
         Stops the cluster stack on all nodes or specific node(s)
         '''
-        node_list = parse_option_for_nodes(context, *args)
         try:
-            node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
+            node_list = parse_option_for_nodes(context, *args)
         except utils.NoSSHError as msg:
             logger.error('%s', msg)
             logger.info("Please try 'crm cluster stop' on each node")
             return False
+
+        node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
         if not node_list:
             return
         logger.debug(f"stop node list: {node_list}")
diff --git a/crmsh/utils.py b/crmsh/utils.py
index f9e209bd2..eabf7462b 100644
--- a/crmsh/utils.py
+++ b/crmsh/utils.py
@@ -2427,25 +2427,23 @@ def package_is_installed(pkg, remote_addr=None):
     return rc == 0
 
 
-def node_reachable_check(node, ping_count=1, port=22, timeout=3):
+def node_reachable_check(node):
     """
-    Check if node is reachable by using ping and socket to ssh port
+    Check if node is reachable by checking SSH port is open
     """
-    rc, _, _ = ShellUtils().get_stdout_stderr(f"ping -n -c {ping_count} -W {timeout} {node}")
-    if rc == 0:
-        return True
-    # ping failed, try to connect to ssh port by socket
-    if check_port_open(node, port, timeout):
+    if node == this_node() or check_port_open(node, 22):
         return True
-    # both ping and socket failed
-    raise ValueError(f"host \"{node}\" is unreachable")
+    if config.core.no_ssh:
+        raise NoSSHError(constants.NO_SSH_ERROR_MSG)
+    else:
+        raise ValueError(f"host \"{node}\" is unreachable via SSH")
 
 
 def get_reachable_node_list(node_list:list[str]) -> list[str]:
     reachable_node_list = []
     for node in node_list:
         try:
-            if node == this_node() or node_reachable_check(node):
+            if node_reachable_check(node):
                 reachable_node_list.append(node)
         except ValueError as e:
             logger.warning(str(e))

From ed5d2d71634fb41c59afc90ea90843bdf9d3aa6b Mon Sep 17 00:00:00 2001
From: xin liang <xliang@suse.com>
Date: Sun, 19 Oct 2025 11:39:10 +0800
Subject: [PATCH 2/3] Fix: utils: Raise UnreachableNodeError for nodes
 unreachable via SSH (bsc#1250645)

---
 crmsh/utils.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/crmsh/utils.py b/crmsh/utils.py
index eabf7462b..7fa7c8371 100644
--- a/crmsh/utils.py
+++ b/crmsh/utils.py
@@ -2471,6 +2471,12 @@ def __init__(self, msg: str, dead_nodes=None):
         self.dead_nodes = dead_nodes or []
 
 
+class UnreachableNodeError(ValueError):
+    def __init__(self, msg: str, unreachable_nodes=None):
+        super().__init__(msg)
+        self.unreachable_nodes = unreachable_nodes or []
+
+
 def check_all_nodes_reachable(action_to_do: str, peer_node: str = None):
     """
     Check if all cluster nodes are reachable
@@ -2492,8 +2498,17 @@ def check_all_nodes_reachable(action_to_do: str, peer_node: str = None):
         """
         raise DeadNodeError(msg, dead_nodes)
 
+    unreachable_nodes = []
     for node in online_nodes:
-        node_reachable_check(node)
+        try:
+            node_reachable_check(node)
+        except ValueError:
+            unreachable_nodes.append(node)
+    if unreachable_nodes:
+        msg = f"""There are unreachable nodes: {', '.join(unreachable_nodes)}.
+Please check the network connectivity before {action_to_do}.
+        """
+        raise UnreachableNodeError(msg, unreachable_nodes)
 
 
 def re_split_string(reg, string):

From 7aaa85e2e1e408872ca322749a1402403bb7bbda Mon Sep 17 00:00:00 2001
From: xin liang <xliang@suse.com>
Date: Mon, 20 Oct 2025 11:04:01 +0800
Subject: [PATCH 3/3] Dev: behave: Adjust functional test for previous commit

---
 test/features/cluster_blocking_ssh.feature | 2 --
 test/features/qdevice_validate.feature     | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/test/features/cluster_blocking_ssh.feature b/test/features/cluster_blocking_ssh.feature
index c2385e34b..b4717f146 100644
--- a/test/features/cluster_blocking_ssh.feature
+++ b/test/features/cluster_blocking_ssh.feature
@@ -60,8 +60,6 @@ Feature: cluster testing with ssh blocked
     And     Run "firewall-cmd --zone=public --add-rich-rule='rule port port=22 protocol=tcp drop' --permanent && firewall-cmd --reload" on "hanode2"
     And     Try "ssh -o ConnectTimeout=5 hanode2" on "hanode1"
     Then    Except "ssh: connect to host hanode2 port 22: Connection timed out" in stderr
-    When    Run "timeout 5s crm report || echo "timeout"" on "hanode1"
-    Then    Expected "timeout" in stdout
     When    Write multi lines to file "/etc/crm/crm.conf" on "hanode1"
       """
       [core]
diff --git a/test/features/qdevice_validate.feature b/test/features/qdevice_validate.feature
index 1d38ea728..729b40164 100644
--- a/test/features/qdevice_validate.feature
+++ b/test/features/qdevice_validate.feature
@@ -23,7 +23,7 @@ Feature: corosync qdevice/qnetd options validate
   Scenario: Service ssh on qnetd node not available
     When    Run "systemctl stop sshd.service" on "node-without-ssh"
     When    Try "crm cluster init --qnetd-hostname=node-without-ssh"
-    Then    Except "ERROR: cluster.init: ssh service on "node-without-ssh" not available"
+    Then    Except "ERROR: cluster.init: host "node-without-ssh" is unreachable via SSH"
 
   @clean
   Scenario: Option "--qdevice-port" set wrong port