Skip to content

Commit 51f7ccf

Browse files
authored
Wait until the nodes are ready (#924)
1 parent 2071365 commit 51f7ccf

File tree

5 files changed

+79
-30
lines changed

5 files changed

+79
-30
lines changed

benchmark_runner/common/oc/oc.py

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -457,32 +457,61 @@ def get_worker_nodes(self):
457457
"""
458458
return self.run(fr""" {self.__cli} get nodes -l node-role.kubernetes.io/worker= -o jsonpath="{{range .items[*]}}{{.metadata.name}}{{'\n'}}{{end}}" """)
459459

460-
@staticmethod
461460
@typechecked
462-
def check_node_status(nodes_list: list):
461+
def wait_for_node_ready(self, node: str = None, wait_time: int = None, timeout: int = int(environment_variables.environment_variables_dict['timeout'])):
463462
"""
464-
This method check node status
465-
@param nodes_list:
466-
@return: True when all nodes in ready status
463+
This method waits until all nodes, or one specific node, are in 'Ready' status
464+
@param node: wait for specific node to be ready, when None check all nodes
465+
@param wait_time: seconds to sleep between status checks; falls back to OC.SHORT_TIMEOUT when None or 0
466+
@param timeout: Maximum wait time in seconds, zero or negative value means no timeout (default set in environment variables)
467+
@return: True if all nodes are in 'Ready' status within the timeout period
468+
@raises: NodeNotReady if one or more nodes are not ready within the timeout
467469
"""
468-
# Check if any node is not in 'Ready' status
469-
for node in nodes_list:
470-
node_name, node_status = node.split()
470+
wait_time = wait_time or OC.SHORT_TIMEOUT
471+
nodes_status = None
472+
current_wait_time = 0
473+
while timeout <= 0 or current_wait_time < timeout:
474+
nodes_status = self.check_node_status(node=node)
475+
if nodes_status is True:
476+
return True
477+
logger.info(f"Waiting for '{nodes_status}' to reach 'Ready' status")
478+
time.sleep(wait_time)
479+
current_wait_time += wait_time
480+
logger.info(f"oc get nodes:\n{self.run('oc get nodes')}")
481+
raise NodeNotReady(nodes_status=nodes_status)
482+
483+
@typechecked
484+
def check_node_status(self, node: str = None):
485+
"""
486+
This method checks the status of all nodes or a specific node.
487+
@param node: The name of a specific node to check for "Ready" status; if None, check all nodes.
488+
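@return: True if every checked node is in 'Ready' status (also returned when the given node is not found in the node list), otherwise a dictionary {node name: status} of the nodes that are not in 'Ready' status.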
489+
"""
490+
not_ready_nodes = {}
491+
492+
for node_state in self.get_node_status():
493+
node_name, node_status = node_state.split()
494+
495+
# If a specific node is given, only check that node
496+
if node and node != node_name:
497+
continue
498+
471499
if node_status != 'Ready':
472-
raise NodeNotReady(node_name, node_status)
500+
not_ready_nodes[node_name] = node_status
501+
# If checking a specific node and it's not ready, no need to check further
502+
if node:
503+
break
473504

474-
# If no nodes are found in a non-ready state
475-
return True
505+
return True if not not_ready_nodes else not_ready_nodes
476506

477-
def verify_nodes_ready(self):
507+
def get_node_status(self) -> list:
478508
"""
479-
This method verifies that all nodes are in 'Ready' status.
480-
If any node is not ready, it raises an error with the node name and its status.
481-
@return: True is all in 'Ready' status
509+
This method returns the name and status of every node
510+
@return: list of '<node name> <status>' strings, one entry per node
482511
"""
483512
# Get the node name and status for all nodes
484513
nodes_list = self.run(f"{self.__cli} get nodes --no-headers | awk '{{print $1, $2}}'").splitlines()
485-
return self.check_node_status(nodes_list)
514+
return nodes_list
486515

487516
def delete_available_released_pv(self):
488517
"""

benchmark_runner/common/oc/oc_exceptions.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def __init__(self):
167167

168168

169169
class NodeNotReady(OCError):
170-
"""This exception returns node not ready timeout error"""
171-
def __init__(self, node_name, node_status):
172-
self.message = f"Node {node_name} is not ready. Current status: {node_status}"
170+
"""This exception indicates a node not ready due to a timeout error"""
171+
def __init__(self, nodes_status: dict):
172+
self.message = f"Node not ready: {nodes_status}"
173173
super(NodeNotReady, self).__init__(self.message)

benchmark_runner/main/main.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ def upgrade_ocp_bare_metal(step: str):
175175
elif step == 'verify_bare_metal_upgrade_complete':
176176
if bare_metal_operations.is_cluster_upgraded(oc, cnv_version=cnv_version, odf_version=odf_version, lso_version=lso_version):
177177
bare_metal_operations.verify_cluster_is_up(oc)
178-
oc.verify_nodes_ready()
178+
oc.wait_for_node_ready()
179179
else:
180180
error_message = f'OCP {upgrade_ocp_version} upgrade failed'
181181
logger.error(error_message)
@@ -200,7 +200,7 @@ def install_resources():
200200
logger.info(f'Start Bare-Metal OpenShift resources installation')
201201
oc = bare_metal_operations.oc_login()
202202
bare_metal_operations.verify_cluster_is_up(oc)
203-
oc.verify_nodes_ready()
203+
oc.wait_for_node_ready()
204204
bare_metal_operations.install_ocp_resources(resources=resources)
205205
bare_metal_operations.disconnect_from_provisioner()
206206
logger.info(f'End Bare-Metal OpenShift resources installation')

tests/integration/benchmark_runner/common/oc/test_oc.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -234,11 +234,11 @@ def test_collect_prometheus():
234234
assert tarfile.is_tarfile(tarball)
235235

236236

237-
def test_verify_nodes_ready():
237+
def test_wait_for_nodes_ready():
238238
"""
239-
This method test nodes are ready
239+
This test verifies that waiting for all nodes to be ready succeeds
240240
@return:
241241
"""
242242
oc = OC(kubeadmin_password=test_environment_variable['kubeadmin_password'])
243243
oc.login()
244-
assert oc.verify_nodes_ready()
244+
assert oc.wait_for_node_ready()

tests/unittest/benchmark_runner/common/oc/test_oc.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11

22
import mock
33
import pytest
4+
from unittest.mock import patch
5+
46
from benchmark_runner.common.oc.oc import OC
57
from benchmark_runner.common.oc.oc_exceptions import YAMLNotExist, LoginFailed, NodeNotReady
68

@@ -49,15 +51,33 @@ def test_short_uuid():
4951
assert oc._OC__get_short_uuid(workload='stressng_pod') == 'bb2be20e'
5052

5153

52-
def test_check_node_status_ready():
54+
def test_check_all_nodes_status_ready():
5355
oc = OC()
54-
result = oc.check_node_status(nodes_list=['node-0 Ready', 'node-1 Ready', 'node-2 Ready'])
55-
assert result
56+
with patch.object(OC, 'get_node_status', return_value=['node-0 Ready', 'node-1 Ready', 'node-2 Ready']):
57+
result = oc.wait_for_node_ready()
58+
assert result
5659

5760

58-
def test_check_node_status_not_ready():
61+
def test_check_all_nodes_status_not_ready():
5962
oc = OC()
6063
with pytest.raises(NodeNotReady) as exc_info:
61-
oc.check_node_status(nodes_list=['node-0 Ready', 'node-1 NotReady', 'node-2 Ready'])
64+
with patch.object(OC, 'get_node_status', return_value=['node-0 NotReady', 'node-1 NotReady', 'node-2 Ready']):
65+
oc.wait_for_node_ready(wait_time=3, timeout=10)
6266
# Check that the exception message is as expected
63-
assert str(exc_info.value) == "Node node-1 is not ready. Current status: NotReady"
67+
assert str(exc_info.value) == "Node not ready: {'node-0': 'NotReady', 'node-1': 'NotReady'}"
68+
69+
70+
def test_check_not_ready_node_status_not_ready():
71+
oc = OC()
72+
with pytest.raises(NodeNotReady) as exc_info:
73+
with patch.object(OC, 'get_node_status', return_value=['node-0 NotReady', 'node-1 NotReady', 'node-2 Ready']):
74+
oc.wait_for_node_ready(node='node-1', wait_time=3, timeout=10)
75+
# Check that the exception message is as expected
76+
assert str(exc_info.value) == "Node not ready: {'node-1': 'NotReady'}"
77+
78+
79+
def test_check_ready_node_status_not_ready():
80+
oc = OC()
81+
with patch.object(OC, 'get_node_status', return_value=['node-0 NotReady', 'node-1 NotReady', 'node-2 Ready']):
82+
result = oc.wait_for_node_ready(node='node-2', wait_time=3, timeout=10)
83+
assert result

0 commit comments

Comments
 (0)