Add node_disk_detach_attach_scenario for aws under node scenarios

Resolves #678 Signed-off-by: jtydlack <[email protected]> Add functions for aws detach disk scenario Signed-off-by: jtydlack <[email protected]> Add detach disk scenario in node scenario Signed-off-by: jtydlack <[email protected]> Add disk_detach_attach_scenario in docs Signed-off-by: jtydlack <[email protected]>
krkn-chaos · Dec 4, 2024 · d7bf67b · d7bf67b
1 parent 97035a7
commit d7bf67b
Show file tree

Hide file tree

Showing 5 changed files with 145 additions and 3 deletions.
diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md
@@ -4,14 +4,15 @@ The following node chaos scenarios are supported:
 
 1. **node_start_scenario**: Scenario to start the node instance.
 2. **node_stop_scenario**: Scenario to stop the node instance.
-3. **node_stop_start_scenario**: Scenario to stop and then start the node instance. Not supported on VMware.
+3. **node_stop_start_scenario**: Scenario to stop the node instance for the specified duration and then start the node instance. Not supported on VMware.
 4. **node_termination_scenario**: Scenario to terminate the node instance.
 5. **node_reboot_scenario**: Scenario to reboot the node instance.
 6. **stop_kubelet_scenario**: Scenario to stop the kubelet of the node instance.
 7. **stop_start_kubelet_scenario**: Scenario to stop and start the kubelet of the node instance.
 8. **restart_kubelet_scenario**: Scenario to restart the kubelet of the node instance.
 9. **node_crash_scenario**: Scenario to crash the node instance.
 10. **stop_start_helper_node_scenario**: Scenario to stop and start the helper node and check service status.
+11. **node_disk_detach_attach_scenario**: Scenario to detach the node's non-root disks for the specified duration and then attach them again.
 
 
 **NOTE**: If the node does not recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
@@ -20,6 +21,8 @@ The following node chaos scenarios are supported:
 , node_reboot_scenario and stop_start_kubelet_scenario are supported on AWS, Azure, OpenStack, BareMetal, GCP
 , VMware and Alibaba.
 
+**NOTE**: node_disk_detach_attach_scenario is supported only on AWS and cannot detach the root disk.
+
 
 #### AWS
 

diff --git a/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py b/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py
@@ -36,6 +36,20 @@ def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout):
         self.helper_node_start_scenario(instance_kill_count, node, timeout)
         logging.info("helper_node_stop_start_scenario has been successfully injected!")
 
+    # Node scenario to detach and attach the disk
+    def node_disk_detach_attach_scenario(self, instance_kill_count, node, timeout, duration):
+        """Detach the node's disks, wait ``duration`` seconds, then attach them again.
+
+        Args:
+            instance_kill_count: passed through to the disk helper methods.
+            node: node whose disk attachment information is looked up.
+            timeout: passed through to the detach and attach helpers.
+            duration: number of seconds to sleep between detach and attach.
+        """
+        logging.info("Starting disk_detach_attach_scenario injection")
+        attachments = self.get_disk_attachment_info(instance_kill_count, node)
+        if not attachments:
+            # No attachment details came back, so there is nothing to detach;
+            # report the failure and stop without touching the node.
+            logging.error("Node %s has only root disk attached" % (node))
+            logging.error("node_disk_detach_attach_scenario failed!")
+            return
+        self.disk_detach_scenario(instance_kill_count, node, timeout)
+        logging.info("Waiting for %s seconds before attaching the disk" % (duration))
+        time.sleep(duration)
+        # Re-attach using the details captured before the detach.
+        self.disk_attach_scenario(instance_kill_count, attachments, timeout)
+        logging.info("node_disk_detach_attach_scenario has been successfully injected!")
+
     # Node scenario to terminate the node
     def node_termination_scenario(self, instance_kill_count, node, timeout):
         pass

diff --git a/krkn/scenario_plugins/node_actions/aws_node_scenarios.py b/krkn/scenario_plugins/node_actions/aws_node_scenarios.py
@@ -1,5 +1,7 @@
 import sys
 import time
+from http.client import responses
+
 import boto3
 import logging
 import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
@@ -12,7 +14,8 @@
 class AWS:
     def __init__(self):
         self.boto_client = boto3.client("ec2")
-        self.boto_instance = boto3.resource("ec2").Instance("id")
+        self.boto_resource = boto3.resource("ec2")
+        self.boto_instance = self.boto_resource.Instance("id")
 
     # Get the instance ID of the node
     def get_instance_id(self, node):
@@ -179,6 +182,72 @@ def delete_network_acl(self, acl_id):
 
             raise RuntimeError()
 
+    # Detach volume
+    def detach_volumes(self, volumes_ids: list):
+        """Force-detach every volume listed in ``volumes_ids``.
+
+        A failure on one volume is logged and the loop continues with the
+        next one; no exception is propagated to the caller.
+        """
+        for vol_id in volumes_ids:
+            try:
+                self.boto_client.detach_volume(VolumeId=vol_id, Force=True)
+            except Exception as err:
+                failure = "Detaching volume %s failed with exception: %s" % (vol_id, err)
+                logging.error(failure)
+
+    # Attach volume
+    def attach_volume(self, attachment: dict):
+        """Attach a volume to an instance unless the volume is already in use.
+
+        Args:
+            attachment: mapping read for the "VolumeId", "InstanceId" and
+                "Device" keys.
+
+        Raises:
+            RuntimeError: after logging, when the state lookup or the attach
+                call raises.
+        """
+        try:
+            volume_id = attachment["VolumeId"]
+            if self.get_volume_state(volume_id) != "in-use":
+                instance_id = attachment["InstanceId"]
+                logging.info(
+                    "Attaching the %s volumes to instance %s."
+                    % (volume_id, instance_id)
+                )
+                self.boto_client.attach_volume(
+                    InstanceId=instance_id,
+                    Device=attachment["Device"],
+                    VolumeId=volume_id,
+                )
+            else:
+                # Nothing to do: the volume reports itself as attached.
+                logging.info("Volume %s is already in use." % volume_id)
+        except Exception as err:
+            logging.error(
+                "Failed attaching disk %s to the %s instance. "
+                "Encountered following exception: %s"
+                % (attachment["VolumeId"], attachment["InstanceId"], err)
+            )
+            raise RuntimeError()
+
+    # Get IDs of node volumes
+    def get_volumes_ids(self, instance_id: list):
+        """Return the volume IDs of every non-root block device of an instance.
+
+        Args:
+            instance_id: list of instance IDs handed to ``describe_instances``;
+                only the first instance of the first reservation is inspected.
+
+        Returns:
+            A list with the ``Ebs.VolumeId`` of each block device mapping whose
+            device name differs from the instance's root device name.
+        """
+        described = self.boto_client.describe_instances(InstanceIds=instance_id)
+        mappings = described["Reservations"][0]["Instances"][0]["BlockDeviceMappings"]
+        # Despite its name, get_root_volume_id yields the root *device name*.
+        root_device_name = self.get_root_volume_id(instance_id)
+        return [
+            mapping["Ebs"]["VolumeId"]
+            for mapping in mappings
+            if mapping["DeviceName"] != root_device_name
+        ]
+
+    # Get volumes attachment details
+    def get_volume_attachment_details(self, volume_ids: list):
+        """Return the "Volumes" entry of ``describe_volumes`` for ``volume_ids``."""
+        return self.boto_client.describe_volumes(VolumeIds=volume_ids)["Volumes"]
+
+    # Get root volume
+    def get_root_volume_id(self, instance_id):
+        """Return ``root_device_name`` of the first instance in ``instance_id``.
+
+        ``instance_id`` is indexed with ``[0]``, so callers pass a sequence of
+        IDs. The value returned is the root device name, not a volume ID.
+        """
+        first_id = instance_id[0]
+        return self.boto_resource.Instance(first_id).root_device_name
+
+    # Get volume state
+    def get_volume_state(self, volume_id: str):
+        """Return the ``state`` attribute of the volume resource ``volume_id``."""
+        return self.boto_resource.Volume(volume_id).state
 
 # krkn_lib
 class aws_node_scenarios(abstract_node_scenarios):
@@ -290,3 +359,49 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
                 logging.error("node_reboot_scenario injection failed!")
 
                 raise RuntimeError()
+
+    # Get volume attachment info
+    def get_disk_attachment_info(self, instance_kill_count, node):
+        """Look up attachment details for the volumes of ``node``'s instance.
+
+        Args:
+            instance_kill_count: a non-positive count skips the lookup.
+            node: node name resolved to an instance ID via ``self.aws``.
+
+        Returns:
+            The result of ``self.aws.get_volume_attachment_details`` for the IDs
+            from ``self.aws.get_volumes_ids``, or ``None`` when that ID list is
+            empty or ``instance_kill_count`` is not positive.
+
+        Raises:
+            RuntimeError: after logging, when any lookup raises; the original
+                exception is chained as the cause.
+        """
+        # A single lookup answers the question whatever the count is, so no
+        # loop is needed; a non-positive count performs no lookup at all.
+        if instance_kill_count <= 0:
+            return None
+        try:
+            logging.info("Obtaining disk attachment information")
+            # split() wraps the ID string in a list before it is handed to
+            # get_volumes_ids.
+            instance_id = (self.aws.get_instance_id(node)).split()
+            volumes_ids = self.aws.get_volumes_ids(instance_id)
+            if not volumes_ids:
+                return None
+            return self.aws.get_volume_attachment_details(volumes_ids)
+        except Exception as e:
+            logging.error(
+                "Failed to obtain disk attachment information of %s node. "
+                "Encountered following exception: %s." % (node, e)
+            )
+            raise RuntimeError() from e
+
+    # Node scenario to detach the volume
+    def disk_detach_scenario(self, instance_kill_count, node, timeout):
+        """Detach the volumes reported by ``self.aws.get_volumes_ids`` for ``node``.
+
+        The lookup-and-detach sequence runs ``instance_kill_count`` times.
+        ``timeout`` is not used in this method.
+
+        Raises:
+            RuntimeError: after logging, when any step raises; the original
+                exception is chained as the cause.
+        """
+        for _ in range(instance_kill_count):
+            try:
+                logging.info("Starting disk_detach_scenario injection")
+                # split() wraps the ID string in a list before it is handed to
+                # get_volumes_ids.
+                instance_id = (self.aws.get_instance_id(node)).split()
+                volumes_ids = self.aws.get_volumes_ids(instance_id)
+                logging.info(
+                    "Detaching the %s volumes from instance %s "
+                    % (volumes_ids, node)
+                )
+                self.aws.detach_volumes(volumes_ids)
+            except Exception as e:
+                # Adjacent literals are concatenated as-is, so the first one
+                # carries the space that separates "following" and "exception".
+                logging.error(
+                    "Failed to detach disk from %s node. Encountered following "
+                    "exception: %s." % (node, e)
+                )
+                raise RuntimeError() from e
+
+    # Node scenario to attach the volume
+    def disk_attach_scenario(self, instance_kill_count, attachment_details, timeout):
+        """Attach each volume in ``attachment_details`` via ``self.aws.attach_volume``.
+
+        The whole list is walked ``instance_kill_count`` times, and only the
+        first entry of each volume's "Attachments" list is used. ``timeout`` is
+        not used in this method.
+        """
+        for _ in range(instance_kill_count):
+            for volume_details in attachment_details:
+                first_attachment = volume_details["Attachments"][0]
+                self.aws.attach_volume(first_attachment)
diff --git a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py
@@ -163,7 +163,7 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario):
         logging.info("action" + str(action))
         # Get the scenario specifics for running action nodes
         run_kill_count = get_yaml_item_value(node_scenario, "runs", 1)
-        if action == "node_stop_start_scenario":
+        if action in ("node_stop_start_scenario", "node_disk_detach_attach_scenario"):
             duration = get_yaml_item_value(node_scenario, "duration", 120)
 
         timeout = get_yaml_item_value(node_scenario, "timeout", 120)
@@ -200,6 +200,9 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario):
                 node_scenario_object.node_reboot_scenario(
                     run_kill_count, single_node, timeout
                 )
+            elif action == "node_disk_detach_attach_scenario":
+                node_scenario_object.node_disk_detach_attach_scenario(
+                    run_kill_count, single_node, timeout, duration)
             elif action == "stop_start_kubelet_scenario":
                 node_scenario_object.stop_start_kubelet_scenario(
                     run_kill_count, single_node, timeout

diff --git a/scenarios/openshift/aws_node_scenarios.yml b/scenarios/openshift/aws_node_scenarios.yml
@@ -16,3 +16,10 @@ node_scenarios:
     instance_count: 1
     timeout: 120
     cloud_type: aws
+  - actions:
+      - node_disk_detach_attach_scenario
+    node_name:
+    label_selector:
+    instance_count: 1
+    timeout: 120
+    cloud_type: aws