diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md index 6d81b1dd..d85c08c9 100644 --- a/docs/node_scenarios.md +++ b/docs/node_scenarios.md @@ -4,7 +4,7 @@ The following node chaos scenarios are supported: 1. **node_start_scenario**: Scenario to stop the node instance. 2. **node_stop_scenario**: Scenario to stop the node instance. -3. **node_stop_start_scenario**: Scenario to stop and then start the node instance. Not supported on VMware. +3. **node_stop_start_scenario**: Scenario to stop the node instance for specified duration and then start the node instance. Not supported on VMware. 4. **node_termination_scenario**: Scenario to terminate the node instance. 5. **node_reboot_scenario**: Scenario to reboot the node instance. 6. **stop_kubelet_scenario**: Scenario to stop the kubelet of the node instance. @@ -12,6 +12,7 @@ The following node chaos scenarios are supported: 8. **restart_kubelet_scenario**: Scenario to restart the kubelet of the node instance. 9. **node_crash_scenario**: Scenario to crash the node instance. 10. **stop_start_helper_node_scenario**: Scenario to stop and start the helper node and check service status. +11. **node_disk_detach_attach_scenario**: Scenario to detach node disk for specified duration. **NOTE**: If the node does not recover from the node_crash_scenario injection, reboot the node to get it back to Ready state. @@ -20,6 +21,8 @@ The following node chaos scenarios are supported: , node_reboot_scenario and stop_start_kubelet_scenario are supported on AWS, Azure, OpenStack, BareMetal, GCP , VMware and Alibaba. +**NOTE**: node_disk_detach_attach_scenario is supported only on AWS and cannot detach root disk. 
+ #### AWS diff --git a/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py b/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py index 73d3feec..0602dff7 100644 --- a/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py @@ -36,6 +36,20 @@ def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout): self.helper_node_start_scenario(instance_kill_count, node, timeout) logging.info("helper_node_stop_start_scenario has been successfully injected!") + # Node scenario to detach and attach the disk + def node_disk_detach_attach_scenario(self, instance_kill_count, node, timeout, duration): + logging.info("Starting disk_detach_attach_scenario injection") + disk_attachment_details = self.get_disk_attachment_info(instance_kill_count, node) + if disk_attachment_details: + self.disk_detach_scenario(instance_kill_count, node, timeout) + logging.info("Waiting for %s seconds before attaching the disk" % (duration)) + time.sleep(duration) + self.disk_attach_scenario(instance_kill_count, disk_attachment_details, timeout) + logging.info("node_disk_detach_attach_scenario has been successfully injected!") + else: + logging.error("Node %s has only root disk attached" % (node)) + logging.error("node_disk_detach_attach_scenario failed!") + # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): pass diff --git a/krkn/scenario_plugins/node_actions/aws_node_scenarios.py b/krkn/scenario_plugins/node_actions/aws_node_scenarios.py index c715a3e8..f4784506 100644 --- a/krkn/scenario_plugins/node_actions/aws_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/aws_node_scenarios.py @@ -12,7 +12,8 @@ class AWS: def __init__(self): self.boto_client = boto3.client("ec2") - self.boto_instance = boto3.resource("ec2").Instance("id") + self.boto_resource = boto3.resource("ec2") + self.boto_instance = 
self.boto_resource.Instance("id") # Get the instance ID of the node def get_instance_id(self, node): @@ -179,6 +180,72 @@ def delete_network_acl(self, acl_id): raise RuntimeError() + # Detach volume + def detach_volumes(self, volumes_ids: list): + for volume in volumes_ids: + try: + self.boto_client.detach_volume(VolumeId=volume, Force=True) + except Exception as e: + logging.error( + "Detaching volume %s failed with exception: %s" + % (volume, e) + ) + + # Attach volume + def attach_volume(self, attachment: dict): + try: + if self.get_volume_state(attachment["VolumeId"]) == "in-use": + logging.info( + "Volume %s is already in use." % attachment["VolumeId"] + ) + return + logging.info( + "Attaching the %s volumes to instance %s." + % (attachment["VolumeId"], attachment["InstanceId"]) + ) + self.boto_client.attach_volume( + InstanceId=attachment["InstanceId"], + Device=attachment["Device"], + VolumeId=attachment["VolumeId"] + ) + except Exception as e: + logging.error( + "Failed attaching disk %s to the %s instance. 
" + "Encountered following exception: %s" + % (attachment['VolumeId'], attachment['InstanceId'], e) + ) + raise RuntimeError() + + # Get IDs of node volumes + def get_volumes_ids(self, instance_id: list): + response = self.boto_client.describe_instances(InstanceIds=instance_id) + instance_attachment_details = response["Reservations"][0]["Instances"][0]["BlockDeviceMappings"] + root_volume_device_name = self.get_root_volume_id(instance_id) + volume_ids = [] + for device in instance_attachment_details: + if device["DeviceName"] != root_volume_device_name: + volume_id = device["Ebs"]["VolumeId"] + volume_ids.append(volume_id) + return volume_ids + + # Get volumes attachment details + def get_volume_attachment_details(self, volume_ids: list): + response = self.boto_client.describe_volumes(VolumeIds=volume_ids) + volumes_details = response["Volumes"] + return volumes_details + + # Get root volume + def get_root_volume_id(self, instance_id): + instance_id = instance_id[0] + instance = self.boto_resource.Instance(instance_id) + root_volume_id = instance.root_device_name + return root_volume_id + + # Get volume state + def get_volume_state(self, volume_id: str): + volume = self.boto_resource.Volume(volume_id) + state = volume.state + return state # krkn_lib class aws_node_scenarios(abstract_node_scenarios): @@ -290,3 +357,49 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): logging.error("node_reboot_scenario injection failed!") raise RuntimeError() + + # Get volume attachment info + def get_disk_attachment_info(self, instance_kill_count, node): + for _ in range(instance_kill_count): + try: + logging.info("Obtaining disk attachment information") + instance_id = (self.aws.get_instance_id(node)).split() + volumes_ids = self.aws.get_volumes_ids(instance_id) + if volumes_ids: + vol_attachment_details = self.aws.get_volume_attachment_details( + volumes_ids + ) + return vol_attachment_details + return + except Exception as e: + logging.error( + "Failed to 
obtain disk attachment information of %s node. " + "Encountered following exception: %s." % (node, e) + ) + raise RuntimeError() + + # Node scenario to detach the volume + def disk_detach_scenario(self, instance_kill_count, node, timeout): + for _ in range(instance_kill_count): + try: + logging.info("Starting disk_detach_scenario injection") + instance_id = (self.aws.get_instance_id(node)).split() + volumes_ids = self.aws.get_volumes_ids(instance_id) + logging.info( + "Detaching the %s volumes from instance %s " + % (volumes_ids, node) + ) + self.aws.detach_volumes(volumes_ids) + except Exception as e: + logging.error( + "Failed to detach disk from %s node. Encountered following " + "exception: %s." % (node, e) + ) + logging.debug("") + raise RuntimeError() + + # Node scenario to attach the volume + def disk_attach_scenario(self, instance_kill_count, attachment_details, timeout): + for _ in range(instance_kill_count): + for attachment in attachment_details: + self.aws.attach_volume(attachment["Attachments"][0]) diff --git a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py index cae3e66c..f5b91749 100644 --- a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py +++ b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py @@ -163,7 +163,7 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario): logging.info("action" + str(action)) # Get the scenario specifics for running action nodes run_kill_count = get_yaml_item_value(node_scenario, "runs", 1) - if action == "node_stop_start_scenario": + if action in ("node_stop_start_scenario", "node_disk_detach_attach_scenario"): duration = get_yaml_item_value(node_scenario, "duration", 120) timeout = get_yaml_item_value(node_scenario, "timeout", 120) @@ -200,6 +200,9 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario): node_scenario_object.node_reboot_scenario( 
run_kill_count, single_node, timeout ) + elif action == "node_disk_detach_attach_scenario": + node_scenario_object.node_disk_detach_attach_scenario( + run_kill_count, single_node, timeout, duration) elif action == "stop_start_kubelet_scenario": node_scenario_object.stop_start_kubelet_scenario( run_kill_count, single_node, timeout diff --git a/scenarios/openshift/aws_node_scenarios.yml b/scenarios/openshift/aws_node_scenarios.yml index 95e453f5..76953786 100644 --- a/scenarios/openshift/aws_node_scenarios.yml +++ b/scenarios/openshift/aws_node_scenarios.yml @@ -16,3 +16,10 @@ node_scenarios: instance_count: 1 timeout: 120 cloud_type: aws + - actions: + - node_disk_detach_attach_scenario + node_name: + label_selector: + instance_count: 1 + timeout: 120 + cloud_type: aws \ No newline at end of file