From 10579177313f46bfc9b1ae7117a37b23dd004eef Mon Sep 17 00:00:00 2001 From: Naga Ravi Chaitanya Elluri Date: Sun, 11 Aug 2024 19:23:02 -0400 Subject: [PATCH] Add duration parameter for node scenarios This option is enabled only for the node_stop_start scenario, where the user wants to keep the node stopped for a certain duration to understand the impact before starting the node back up. This commit also bumps the timeout for the scenario from 120 seconds to 360 seconds to make sure there's enough time for the node to reach the Ready state on the Kubernetes side after the node is started on the infra side. Signed-off-by: Naga Ravi Chaitanya Elluri --- kraken/node_actions/abstract_node_scenarios.py | 5 ++++- kraken/node_actions/run.py | 4 +++- scenarios/openshift/aws_node_scenarios.yml | 7 +++---- scenarios/openshift/azure_node_scenarios.yml | 8 ++++++++ scenarios/openshift/baremetal_node_scenarios.yml | 5 +++-- scenarios/openshift/gcp_node_scenarios.yml | 8 ++++++++ scenarios/openshift/ibmcloud_node_scenarios.yml | 5 +++-- 7 files changed, 32 insertions(+), 10 deletions(-) diff --git a/kraken/node_actions/abstract_node_scenarios.py b/kraken/node_actions/abstract_node_scenarios.py index 19d15c14..d133a67d 100644 --- a/kraken/node_actions/abstract_node_scenarios.py +++ b/kraken/node_actions/abstract_node_scenarios.py @@ -1,5 +1,6 @@ import sys import logging +import time import kraken.invoke.command as runcommand import kraken.node_actions.common_node_functions as nodeaction from krkn_lib.k8s import KrknKubernetes @@ -18,9 +19,11 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): pass # Node scenario to stop and then start the node - def node_stop_start_scenario(self, instance_kill_count, node, timeout): + def node_stop_start_scenario(self, instance_kill_count, node, timeout, duration): logging.info("Starting node_stop_start_scenario injection") self.node_stop_scenario(instance_kill_count, node, timeout) + logging.info("Waiting for %s seconds before starting the 
node" % (duration)) + time.sleep(duration) self.node_start_scenario(instance_kill_count, node, timeout) logging.info("node_stop_start_scenario has been successfully injected!") diff --git a/kraken/node_actions/run.py b/kraken/node_actions/run.py index 92a5488d..ed5b0289 100644 --- a/kraken/node_actions/run.py +++ b/kraken/node_actions/run.py @@ -100,6 +100,8 @@ def inject_node_scenario(action, node_scenario, node_scenario_object, kubecli: K ) node_name = get_yaml_item_value(node_scenario, "node_name", "") label_selector = get_yaml_item_value(node_scenario, "label_selector", "") + if action == "node_stop_start_scenario": + duration = get_yaml_item_value(node_scenario, "duration", 120) timeout = get_yaml_item_value(node_scenario, "timeout", 120) service = get_yaml_item_value(node_scenario, "service", "") ssh_private_key = get_yaml_item_value( @@ -121,7 +123,7 @@ def inject_node_scenario(action, node_scenario, node_scenario_object, kubecli: K elif action == "node_stop_scenario": node_scenario_object.node_stop_scenario(run_kill_count, single_node, timeout) elif action == "node_stop_start_scenario": - node_scenario_object.node_stop_start_scenario(run_kill_count, single_node, timeout) + node_scenario_object.node_stop_start_scenario(run_kill_count, single_node, timeout, duration) elif action == "node_termination_scenario": node_scenario_object.node_termination_scenario(run_kill_count, single_node, timeout) elif action == "node_reboot_scenario": diff --git a/scenarios/openshift/aws_node_scenarios.yml b/scenarios/openshift/aws_node_scenarios.yml index e8b55fc9..57d00c49 100644 --- a/scenarios/openshift/aws_node_scenarios.yml +++ b/scenarios/openshift/aws_node_scenarios.yml @@ -1,14 +1,13 @@ node_scenarios: - actions: # node chaos scenarios to be injected - node_stop_start_scenario - - stop_start_kubelet_scenario - - node_crash_scenario node_name: # node on which scenario has to be injected; can set multiple names separated by comma label_selector: 
node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection instance_count: 1 # Number of nodes to perform action/select that match the label selector runs: 1 # number of times to inject each scenario under actions (will perform on same node each time) - timeout: 120 # duration to wait for completion of node scenario injection - cloud_type: aws # cloud type on which Kubernetes/OpenShift runs + timeout: 360 # duration to wait for completion of node scenario injection + duration: 120 # duration to stop the node before running the start action + cloud_type: aws # cloud type on which Kubernetes/OpenShift runs - actions: - node_reboot_scenario node_name: diff --git a/scenarios/openshift/azure_node_scenarios.yml b/scenarios/openshift/azure_node_scenarios.yml index 072e70df..7a24927d 100644 --- a/scenarios/openshift/azure_node_scenarios.yml +++ b/scenarios/openshift/azure_node_scenarios.yml @@ -6,3 +6,11 @@ node_scenarios: instance_count: 1 timeout: 120 cloud_type: azure + - actions: + - node_stop_start_scenario + node_name: + label_selector: node-role.kubernetes.io/infra + instance_count: 1 + timeout: 360 + duration: 120 + cloud_type: azure diff --git a/scenarios/openshift/baremetal_node_scenarios.yml b/scenarios/openshift/baremetal_node_scenarios.yml index 1cad9eac..10578aae 100644 --- a/scenarios/openshift/baremetal_node_scenarios.yml +++ b/scenarios/openshift/baremetal_node_scenarios.yml @@ -5,8 +5,9 @@ node_scenarios: label_selector: node-role.kubernetes.io/worker # When node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection. instance_count: 1 # Number of nodes to perform action/select that match the label selector. runs: 1 # Number of times to inject each scenario under actions (will perform on same node each time). - timeout: 120 # Duration to wait for completion of node scenario injection. 
- cloud_type: bm # Cloud type on which Kubernetes/OpenShift runs. + timeout: 360 # Duration to wait for completion of node scenario injection. + duration: 120 # Duration to stop the node before running the start action + cloud_type: bm # Cloud type on which Kubernetes/OpenShift runs. bmc_user: defaultuser # For baremetal (bm) cloud type. The default IPMI username. Optional if specified for all machines. bmc_password: defaultpass # For baremetal (bm) cloud type. The default IPMI password. Optional if specified for all machines. bmc_info: # This section is here to specify baremetal per-machine info, so it is optional if there is no per-machine info. diff --git a/scenarios/openshift/gcp_node_scenarios.yml b/scenarios/openshift/gcp_node_scenarios.yml index 7ed3fa25..8850847d 100644 --- a/scenarios/openshift/gcp_node_scenarios.yml +++ b/scenarios/openshift/gcp_node_scenarios.yml @@ -6,3 +6,11 @@ node_scenarios: instance_count: 1 timeout: 120 cloud_type: gcp + - actions: + - node_stop_start_scenario + node_name: + label_selector: node-role.kubernetes.io/worker + instance_count: 1 + timeout: 360 + duration: 120 + cloud_type: gcp diff --git a/scenarios/openshift/ibmcloud_node_scenarios.yml b/scenarios/openshift/ibmcloud_node_scenarios.yml index 956ac869..76de9917 100644 --- a/scenarios/openshift/ibmcloud_node_scenarios.yml +++ b/scenarios/openshift/ibmcloud_node_scenarios.yml @@ -5,5 +5,6 @@ label_selector: "node-role.kubernetes.io/worker" # When node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection runs: 1 # Number of times to inject each scenario under actions (will perform on same node each time) instance_count: 1 # Number of nodes to perform action/select that match the label selector - timeout: 30 # Duration to wait for completion of node scenario injection - skip_openshift_checks: False # Set to True if you don't want to wait for the status of the nodes to change on OpenShift before passing the scenario \ No 
newline at end of file + timeout: 360 # Duration to wait for completion of node scenario injection + duration: 120 # Duration to stop the node before running the start action + skip_openshift_checks: False # Set to True if you don't want to wait for the status of the nodes to change on OpenShift before passing the scenario