Update OpenShift Scale to use timeout variable and add additional scale check #367

Open · wants to merge 3 commits into master
56 changes: 53 additions & 3 deletions snafu/scale_openshift_wrapper/trigger_scale.py
@@ -34,6 +34,7 @@ def __init__(self, args):
         self.poll_interval = args.poll_interval
         self.kubeconfig = args.kubeconfig
         self.is_rosa = False
+        self.timeout = int(args.timeout * 60)
         if args.rosa_cluster is not None:
             logger.info("Identified ROSA for scaling process")
             if args.rosa_token is None:
@@ -248,6 +249,8 @@ def _run_scale(self):
         logger.info("New worker per machine set %s" % (machine_spread))

         logger.info("Starting Patching of machine sets")
+        start_time = time.time()
+        end_time = start_time + self.timeout
         # Patch the machinesets
         if not self.is_rosa:
             for i in range(len(machineset_workers)):
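The two additions above work together: `int(args.timeout * 60)` converts the timeout argument, evidently given in minutes, into seconds, and `end_time = start_time + self.timeout` fixes an absolute deadline once, before any polling starts. A minimal sketch of the same deadline pattern, assuming a minute-based timeout; `wait_until`, `predicate`, `timeout_minutes`, and `poll_interval` are illustrative names, not the wrapper's API:

import time

def wait_until(predicate, timeout_minutes, poll_interval=5):
    """Poll predicate() until it returns True or the absolute deadline passes."""
    deadline = time.time() + int(timeout_minutes * 60)  # minutes -> seconds, as in the diff
    while not predicate():
        if time.time() >= deadline:
            raise TimeoutError("Timeout %d minutes exceeded" % timeout_minutes)
        time.sleep(poll_interval)
    return True

Computing the deadline once keeps the budget global across all of the polling loops below, instead of restarting the clock for each machineset.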
@@ -269,6 +272,12 @@
                 while new_machine_sets.status.readyReplicas != machine_spread[i]:
                     if new_machine_sets.status.readyReplicas is None and machine_spread[i] == 0:
                         break
+
+                    current_time = time.time()
+                    if current_time >= end_time:
+                        logger.error("Timeout %d minutes exceeded" % (self.timeout // 60))
+                        exit(1)
+
                     new_machine_sets = machinesets.get(
                         namespace="openshift-machine-api", name=machineset_worker_list[i].metadata.name
                     )
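The `readyReplicas is None` guard matters because Kubernetes serializes that status field with `omitempty`: a machineset scaled to zero reports no `readyReplicas` at all, so the client sees `None` rather than `0`. A small sketch of the readiness test this loop implies, as a hypothetical helper rather than code from the wrapper:

def replicas_ready(machine_set, want):
    ready = machine_set.status.readyReplicas
    if ready is None:      # field omitted: no replicas are reporting ready
        return want == 0   # only a scale-to-zero target counts as done
    return ready == want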
@@ -287,21 +296,61 @@
         # Ensure all workers are not listed as unschedulable
         # If we don't do this it will auto-complete a scale-down even though the workers
         # have not been eliminated yet
-        new_worker_list = nodes.get(label_selector="node-role.kubernetes.io/worker").attributes.items
+        new_worker_list = nodes.get(
+            label_selector="node-role.kubernetes.io/worker,"
+            "!node-role.kubernetes.io/master,"
+            "!node-role.kubernetes.io/infra,"
+            "!node-role.kubernetes.io/workload"
+        ).attributes.items
         for i in range(len(new_worker_list)):
             while i < len(new_worker_list) and new_worker_list[i].spec.unschedulable:
-                new_worker_list = nodes.get(label_selector="node-role.kubernetes.io/worker").attributes.items
+                current_time = time.time()
+                if current_time >= end_time:
+                    logger.error("Timeout %d minutes exceeded" % (self.timeout // 60))
+                    exit(1)
+                new_worker_list = nodes.get(
+                    label_selector="node-role.kubernetes.io/worker,"
+                    "!node-role.kubernetes.io/master,"
+                    "!node-role.kubernetes.io/infra,"
+                    "!node-role.kubernetes.io/workload"
+                ).attributes.items
                 logger.debug(
                     "Number of ready workers: %d. Waiting %d seconds for next check..."
                     % (len(new_worker_list), self.poll_interval)
                 )
                 time.sleep(self.poll_interval)
         logger.info("All workers schedulable")

+        logger.info("Verifying correct worker count")
+        current_workers = len(
+            nodes.get(
+                label_selector="node-role.kubernetes.io/worker,"
+                "!node-role.kubernetes.io/master,"
+                "!node-role.kubernetes.io/infra,"
+                "!node-role.kubernetes.io/workload"
+            ).attributes.items
+        )
+        while current_workers != int(self.scale):
+            current_time = time.time()
+            if current_time >= end_time:
+                logger.error("Timeout %d minutes exceeded" % (self.timeout // 60))
+                exit(1)
+
+            logger.debug(
+                "Number of ready workers: %d. Waiting %d seconds for next check..."
+                % (current_workers, self.poll_interval)
+            )
+            time.sleep(self.poll_interval)
+            # Re-query the node list so the count can change between checks
+            current_workers = len(
+                nodes.get(
+                    label_selector="node-role.kubernetes.io/worker,"
+                    "!node-role.kubernetes.io/master,"
+                    "!node-role.kubernetes.io/infra,"
+                    "!node-role.kubernetes.io/workload"
+                ).attributes.items
+            )
+
+        logger.info("Correct worker count verified")
+
         worker_count = (
             len(
                 nodes.get(
-                    label_selector="node-role.kubernetes.io/worker,!node-role.kubernetes.io/master"
+                    label_selector="node-role.kubernetes.io/worker,"
+                    "!node-role.kubernetes.io/master,"
+                    "!node-role.kubernetes.io/infra,"
+                    "!node-role.kubernetes.io/workload"
                 ).attributes.items
             )
             or 0
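All four node queries now share the same compound label selector: a bare key (`node-role.kubernetes.io/worker`) matches nodes that carry the label, a `!`-prefixed key matches nodes that lack it, and commas AND the terms together, so master, infra, and workload nodes no longer inflate the worker count. The same set can be checked by hand with the upstream Kubernetes Python client; a sketch assuming a reachable cluster and a default kubeconfig, independent of the wrapper's own client:

from kubernetes import client, config

config.load_kube_config()
selector = (
    "node-role.kubernetes.io/worker,"
    "!node-role.kubernetes.io/master,"
    "!node-role.kubernetes.io/infra,"
    "!node-role.kubernetes.io/workload"
)
nodes = client.CoreV1Api().list_node(label_selector=selector).items
print("worker nodes matching the selector:", len(nodes))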
@@ -329,6 +378,7 @@ def emit_actions(self):
             workload_count,
             platform,
             action,
+            successful,
         ) = self._run_scale()
         end_time = time.time()
         elaspsed_time = end_time - start_time
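The extra `successful` name in the unpacking means `_run_scale()` must now return one more element than before; the matching `return` statement falls outside this hunk, but the contract is strict, since a length mismatch raises at the call site. A toy illustration with hypothetical names and values:

def run_stub():
    # Shape only: the real tuple carries uuid, counts, platform, action, ...
    return ("scale_up", True)

action, successful = run_stub()  # lengths match: fine
# action, = run_stub()           # one name for two values: ValueError at unpack time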