From 10579177313f46bfc9b1ae7117a37b23dd004eef Mon Sep 17 00:00:00 2001
From: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>
Date: Sun, 11 Aug 2024 19:23:02 -0400
Subject: [PATCH] Add duration parameter for node scenarios

This option applies only to the node_stop_start scenario, where the
user may want to keep the node stopped for a certain duration to
understand the impact before starting the node back up. This commit
also bumps the timeout in the scenario config files to 360 seconds
(from 120 seconds) to make sure there is enough time for the node to
reach the Ready state on the Kubernetes side after the node is started
on the infra side.

Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>
---
 kraken/node_actions/abstract_node_scenarios.py   | 5 ++++-
 kraken/node_actions/run.py                       | 4 +++-
 scenarios/openshift/aws_node_scenarios.yml       | 7 +++----
 scenarios/openshift/azure_node_scenarios.yml     | 8 ++++++++
 scenarios/openshift/baremetal_node_scenarios.yml | 5 +++--
 scenarios/openshift/gcp_node_scenarios.yml       | 8 ++++++++
 scenarios/openshift/ibmcloud_node_scenarios.yml  | 5 +++--
 7 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/kraken/node_actions/abstract_node_scenarios.py b/kraken/node_actions/abstract_node_scenarios.py
index 19d15c14..d133a67d 100644
--- a/kraken/node_actions/abstract_node_scenarios.py
+++ b/kraken/node_actions/abstract_node_scenarios.py
@@ -1,5 +1,6 @@
 import sys
 import logging
+import time
 import kraken.invoke.command as runcommand
 import kraken.node_actions.common_node_functions as nodeaction
 from krkn_lib.k8s import KrknKubernetes
@@ -18,9 +19,11 @@ def node_stop_scenario(self, instance_kill_count, node, timeout):
         pass
 
     # Node scenario to stop and then start the node
-    def node_stop_start_scenario(self, instance_kill_count, node, timeout):
+    def node_stop_start_scenario(self, instance_kill_count, node, timeout, duration):
         logging.info("Starting node_stop_start_scenario injection")
         self.node_stop_scenario(instance_kill_count, node, timeout)
+        logging.info("Waiting for %s seconds before starting the node" % (duration))
+        time.sleep(duration)
         self.node_start_scenario(instance_kill_count, node, timeout)
         logging.info("node_stop_start_scenario has been successfully injected!")
 
diff --git a/kraken/node_actions/run.py b/kraken/node_actions/run.py
index 92a5488d..ed5b0289 100644
--- a/kraken/node_actions/run.py
+++ b/kraken/node_actions/run.py
@@ -100,6 +100,8 @@ def inject_node_scenario(action, node_scenario, node_scenario_object, kubecli: K
     )
     node_name = get_yaml_item_value(node_scenario, "node_name", "")
     label_selector = get_yaml_item_value(node_scenario, "label_selector", "")
+    if action == "node_stop_start_scenario":
+        duration = get_yaml_item_value(node_scenario, "duration", 120)
     timeout = get_yaml_item_value(node_scenario, "timeout", 120)
     service = get_yaml_item_value(node_scenario, "service", "")
     ssh_private_key = get_yaml_item_value(
@@ -121,7 +123,7 @@ def inject_node_scenario(action, node_scenario, node_scenario_object, kubecli: K
                 elif action == "node_stop_scenario":
                     node_scenario_object.node_stop_scenario(run_kill_count, single_node, timeout)
                 elif action == "node_stop_start_scenario":
-                    node_scenario_object.node_stop_start_scenario(run_kill_count, single_node, timeout)
+                    node_scenario_object.node_stop_start_scenario(run_kill_count, single_node, timeout, duration)
                 elif action == "node_termination_scenario":
                     node_scenario_object.node_termination_scenario(run_kill_count, single_node, timeout)
                 elif action == "node_reboot_scenario":
diff --git a/scenarios/openshift/aws_node_scenarios.yml b/scenarios/openshift/aws_node_scenarios.yml
index e8b55fc9..57d00c49 100644
--- a/scenarios/openshift/aws_node_scenarios.yml
+++ b/scenarios/openshift/aws_node_scenarios.yml
@@ -1,14 +1,13 @@
 node_scenarios:
   - actions:                                                        # node chaos scenarios to be injected
     - node_stop_start_scenario
-    - stop_start_kubelet_scenario
-    - node_crash_scenario
     node_name:                                                      # node on which scenario has to be injected; can set multiple names separated by comma
     label_selector: node-role.kubernetes.io/worker                  # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection
     instance_count: 1                                               # Number of nodes to perform action/select that match the label selector
     runs: 1                                                         # number of times to inject each scenario under actions (will perform on same node each time)
-    timeout: 120                                                    # duration to wait for completion of node scenario injection
-    cloud_type: aws                                                 # cloud type on which Kubernetes/OpenShift runs
+    timeout: 360                                                    # duration to wait for completion of node scenario injection
+    duration: 120                                                   # duration to stop the node before running the start action
+    cloud_type: aws                                                 # cloud type on which Kubernetes/OpenShift runs  
   - actions:
     - node_reboot_scenario
     node_name:
diff --git a/scenarios/openshift/azure_node_scenarios.yml b/scenarios/openshift/azure_node_scenarios.yml
index 072e70df..7a24927d 100644
--- a/scenarios/openshift/azure_node_scenarios.yml
+++ b/scenarios/openshift/azure_node_scenarios.yml
@@ -6,3 +6,11 @@ node_scenarios:
     instance_count: 1
     timeout: 120
     cloud_type: azure
+  - actions:
+    - node_stop_start_scenario
+    node_name:
+    label_selector: node-role.kubernetes.io/infra
+    instance_count: 1
+    timeout: 360
+    duration: 120
+    cloud_type: azure
diff --git a/scenarios/openshift/baremetal_node_scenarios.yml b/scenarios/openshift/baremetal_node_scenarios.yml
index 1cad9eac..10578aae 100644
--- a/scenarios/openshift/baremetal_node_scenarios.yml
+++ b/scenarios/openshift/baremetal_node_scenarios.yml
@@ -5,8 +5,9 @@ node_scenarios:
     label_selector: node-role.kubernetes.io/worker                  # When node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection.
     instance_count: 1                                               # Number of nodes to perform action/select that match the label selector.
     runs: 1                                                         # Number of times to inject each scenario under actions (will perform on same node each time).
-    timeout: 120                                                    # Duration to wait for completion of node scenario injection.
-    cloud_type: bm                                                 # Cloud type on which Kubernetes/OpenShift runs.
+    timeout: 360                                                    # Duration to wait for completion of node scenario injection.
+    duration: 120                                                   # Duration to stop the node before running the start action
+    cloud_type: bm                                                  # Cloud type on which Kubernetes/OpenShift runs.
     bmc_user: defaultuser                                           # For baremetal (bm) cloud type. The default IPMI username. Optional if specified for all machines.
     bmc_password: defaultpass                                       # For baremetal (bm) cloud type. The default IPMI password. Optional if specified for all machines.
     bmc_info:                                                       # This section is here to specify baremetal per-machine info, so it is optional if there is no per-machine info.
diff --git a/scenarios/openshift/gcp_node_scenarios.yml b/scenarios/openshift/gcp_node_scenarios.yml
index 7ed3fa25..8850847d 100644
--- a/scenarios/openshift/gcp_node_scenarios.yml
+++ b/scenarios/openshift/gcp_node_scenarios.yml
@@ -6,3 +6,11 @@ node_scenarios:
     instance_count: 1
     timeout: 120
     cloud_type: gcp
+  - actions:
+    - node_stop_start_scenario
+    node_name:
+    label_selector: node-role.kubernetes.io/worker
+    instance_count: 1
+    timeout: 360
+    duration: 120
+    cloud_type: gcp
diff --git a/scenarios/openshift/ibmcloud_node_scenarios.yml b/scenarios/openshift/ibmcloud_node_scenarios.yml
index 956ac869..76de9917 100644
--- a/scenarios/openshift/ibmcloud_node_scenarios.yml
+++ b/scenarios/openshift/ibmcloud_node_scenarios.yml
@@ -5,5 +5,6 @@
     label_selector: "node-role.kubernetes.io/worker"    # When node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection 
     runs: 1                             # Number of times to inject each scenario under actions (will perform on same node each time)                                                           
     instance_count: 1                   # Number of nodes to perform action/select that match the label selector                                             
-    timeout: 30                         # Duration to wait for completion of node scenario injection
-    skip_openshift_checks: False        # Set to True if you don't want to wait for the status of the nodes to change on OpenShift before passing the scenario 
\ No newline at end of file
+    timeout: 360                         # Duration to wait for completion of node scenario injection
+    duration: 120                       # Duration to stop the node before running the start action 
+    skip_openshift_checks: False        # Set to True if you don't want to wait for the status of the nodes to change on OpenShift before passing the scenario