Skip to content

Commit

Permalink
Add duration parameter for node scenarios
Browse files Browse the repository at this point in the history
This option is enabled only for the node_stop_start scenario, where the
user may want to keep the node stopped for a certain duration to understand
the impact before starting the node again. This commit also bumps
the timeout for the scenario from 120 seconds to 360 seconds to make
sure there is enough time for the node to reach the Ready state on the
Kubernetes side after the node is started on the infra side.

Signed-off-by: Naga Ravi Chaitanya Elluri <[email protected]>
  • Loading branch information
chaitanyaenr committed Aug 12, 2024
1 parent 5484828 commit 1057917
Show file tree
Hide file tree
Showing 7 changed files with 32 additions and 10 deletions.
5 changes: 4 additions & 1 deletion kraken/node_actions/abstract_node_scenarios.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
import logging
import time
import kraken.invoke.command as runcommand
import kraken.node_actions.common_node_functions as nodeaction
from krkn_lib.k8s import KrknKubernetes
Expand All @@ -18,9 +19,11 @@ def node_stop_scenario(self, instance_kill_count, node, timeout):
pass

# Node scenario to stop and then start the node
def node_stop_start_scenario(self, instance_kill_count, node, timeout):
def node_stop_start_scenario(self, instance_kill_count, node, timeout, duration=120):
    """Stop the node, keep it down for ``duration`` seconds, then start it again.

    Args:
        instance_kill_count: Forwarded unchanged to ``node_stop_scenario``
            and ``node_start_scenario``.
        node: Node to act on; forwarded unchanged to both scenarios.
        timeout: Forwarded unchanged to both scenarios.
        duration: Seconds to wait between the stop and the start. Defaults
            to 120 so callers written against the earlier three-argument
            signature keep working.

    Raises:
        ValueError: If ``duration`` is negative.
    """
    # Reject a bad duration before touching the node: time.sleep() would
    # otherwise raise only after the node has been stopped, leaving it down.
    if duration < 0:
        raise ValueError("duration must be non-negative, got %s" % duration)

    logging.info("Starting node_stop_start_scenario injection")
    self.node_stop_scenario(instance_kill_count, node, timeout)
    # Pass the value as a logging argument so formatting is deferred to the
    # logging framework; the emitted message text is unchanged.
    logging.info("Waiting for %s seconds before starting the node", duration)
    time.sleep(duration)
    self.node_start_scenario(instance_kill_count, node, timeout)
    logging.info("node_stop_start_scenario has been successfully injected!")

Expand Down
4 changes: 3 additions & 1 deletion kraken/node_actions/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def inject_node_scenario(action, node_scenario, node_scenario_object, kubecli: K
)
node_name = get_yaml_item_value(node_scenario, "node_name", "")
label_selector = get_yaml_item_value(node_scenario, "label_selector", "")
if action == "node_stop_start_scenario":
duration = get_yaml_item_value(node_scenario, "duration", 120)
timeout = get_yaml_item_value(node_scenario, "timeout", 120)
service = get_yaml_item_value(node_scenario, "service", "")
ssh_private_key = get_yaml_item_value(
Expand All @@ -121,7 +123,7 @@ def inject_node_scenario(action, node_scenario, node_scenario_object, kubecli: K
elif action == "node_stop_scenario":
node_scenario_object.node_stop_scenario(run_kill_count, single_node, timeout)
elif action == "node_stop_start_scenario":
node_scenario_object.node_stop_start_scenario(run_kill_count, single_node, timeout)
node_scenario_object.node_stop_start_scenario(run_kill_count, single_node, timeout, duration)
elif action == "node_termination_scenario":
node_scenario_object.node_termination_scenario(run_kill_count, single_node, timeout)
elif action == "node_reboot_scenario":
Expand Down
7 changes: 3 additions & 4 deletions scenarios/openshift/aws_node_scenarios.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
node_scenarios:
- actions: # node chaos scenarios to be injected
- node_stop_start_scenario
- stop_start_kubelet_scenario
- node_crash_scenario
node_name: # node on which scenario has to be injected; can set multiple names separated by comma
label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection
instance_count: 1 # Number of nodes to perform action/select that match the label selector
runs: 1 # number of times to inject each scenario under actions (will perform on same node each time)
timeout: 120 # duration to wait for completion of node scenario injection
cloud_type: aws # cloud type on which Kubernetes/OpenShift runs
timeout: 360 # duration to wait for completion of node scenario injection
duration: 120 # duration to stop the node before running the start action
cloud_type: aws # cloud type on which Kubernetes/OpenShift runs
- actions:
- node_reboot_scenario
node_name:
Expand Down
8 changes: 8 additions & 0 deletions scenarios/openshift/azure_node_scenarios.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,11 @@ node_scenarios:
instance_count: 1
timeout: 120
cloud_type: azure
- actions:
- node_stop_start_scenario
node_name:
label_selector: node-role.kubernetes.io/infra
instance_count: 1
timeout: 360
duration: 120
cloud_type: azure
5 changes: 3 additions & 2 deletions scenarios/openshift/baremetal_node_scenarios.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@ node_scenarios:
label_selector: node-role.kubernetes.io/worker # When node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection.
instance_count: 1 # Number of nodes to perform action/select that match the label selector.
runs: 1 # Number of times to inject each scenario under actions (will perform on same node each time).
timeout: 120 # Duration to wait for completion of node scenario injection.
cloud_type: bm # Cloud type on which Kubernetes/OpenShift runs.
timeout: 360 # Duration to wait for completion of node scenario injection.
duration: 120 # Duration to stop the node before running the start action
cloud_type: bm # Cloud type on which Kubernetes/OpenShift runs.
bmc_user: defaultuser # For baremetal (bm) cloud type. The default IPMI username. Optional if specified for all machines.
bmc_password: defaultpass # For baremetal (bm) cloud type. The default IPMI password. Optional if specified for all machines.
bmc_info: # This section is here to specify baremetal per-machine info, so it is optional if there is no per-machine info.
Expand Down
8 changes: 8 additions & 0 deletions scenarios/openshift/gcp_node_scenarios.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,11 @@ node_scenarios:
instance_count: 1
timeout: 120
cloud_type: gcp
- actions:
- node_stop_start_scenario
node_name:
label_selector: node-role.kubernetes.io/worker
instance_count: 1
timeout: 360
duration: 120
cloud_type: gcp
5 changes: 3 additions & 2 deletions scenarios/openshift/ibmcloud_node_scenarios.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
label_selector: "node-role.kubernetes.io/worker" # When node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection
runs: 1 # Number of times to inject each scenario under actions (will perform on same node each time)
instance_count: 1 # Number of nodes to perform action/select that match the label selector
timeout: 30 # Duration to wait for completion of node scenario injection
skip_openshift_checks: False # Set to True if you don't want to wait for the status of the nodes to change on OpenShift before passing the scenario
timeout: 360 # Duration to wait for completion of node scenario injection
duration: 120 # Duration to stop the node before running the start action
skip_openshift_checks: False # Set to True if you don't want to wait for the status of the nodes to change on OpenShift before passing the scenario

0 comments on commit 1057917

Please sign in to comment.