Add node_disk_detach_attach_scenario for aws under node scenarios
Resolves #678

Signed-off-by: jtydlack <[email protected]>

Add functions for aws detach disk scenario

Signed-off-by: jtydlack <[email protected]>

Add detach disk scenario in node scenario

Signed-off-by: jtydlack <[email protected]>

Add disk_detach_attach_scenario in docs

Signed-off-by: jtydlack <[email protected]>
jtydlack committed Dec 10, 2024
1 parent 2ba20fa commit 520ac52
Showing 5 changed files with 143 additions and 3 deletions.
5 changes: 4 additions & 1 deletion docs/node_scenarios.md
@@ -4,14 +4,15 @@ The following node chaos scenarios are supported:

1. **node_start_scenario**: Scenario to start the node instance.
2. **node_stop_scenario**: Scenario to stop the node instance.
3. **node_stop_start_scenario**: Scenario to stop and then start the node instance. Not supported on VMware.
3. **node_stop_start_scenario**: Scenario to stop the node instance for the specified duration and then start it again. Not supported on VMware.
4. **node_termination_scenario**: Scenario to terminate the node instance.
5. **node_reboot_scenario**: Scenario to reboot the node instance.
6. **stop_kubelet_scenario**: Scenario to stop the kubelet of the node instance.
7. **stop_start_kubelet_scenario**: Scenario to stop and start the kubelet of the node instance.
8. **restart_kubelet_scenario**: Scenario to restart the kubelet of the node instance.
9. **node_crash_scenario**: Scenario to crash the node instance.
10. **stop_start_helper_node_scenario**: Scenario to stop and start the helper node and check service status.
11. **node_disk_detach_attach_scenario**: Scenario to detach the node's disks for the specified duration and then re-attach them.


**NOTE**: If the node does not recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
@@ -20,6 +21,8 @@ The following node chaos scenarios are supported:
, node_reboot_scenario and stop_start_kubelet_scenario are supported on AWS, Azure, OpenStack, BareMetal, GCP
, VMware and Alibaba.

**NOTE**: node_disk_detach_attach_scenario is supported only on AWS and cannot detach the root disk.


#### AWS

14 changes: 14 additions & 0 deletions krkn/scenario_plugins/node_actions/abstract_node_scenarios.py
@@ -36,6 +36,20 @@ def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout):
        self.helper_node_start_scenario(instance_kill_count, node, timeout)
        logging.info("helper_node_stop_start_scenario has been successfully injected!")

    # Node scenario to detach and attach the disk
    def node_disk_detach_attach_scenario(self, instance_kill_count, node, timeout, duration):
        logging.info("Starting disk_detach_attach_scenario injection")
        disk_attachment_details = self.get_disk_attachment_info(instance_kill_count, node)
        if disk_attachment_details:
            self.disk_detach_scenario(instance_kill_count, node, timeout)
            logging.info("Waiting for %s seconds before attaching the disk" % (duration))
            time.sleep(duration)
            self.disk_attach_scenario(instance_kill_count, disk_attachment_details, timeout)
            logging.info("node_disk_detach_attach_scenario has been successfully injected!")
        else:
            logging.error("Node %s has only the root disk attached" % (node))
            logging.error("node_disk_detach_attach_scenario failed!")

    # Node scenario to terminate the node
    def node_termination_scenario(self, instance_kill_count, node, timeout):
        pass
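For context, the new abstract method above drives three provider hooks — `get_disk_attachment_info`, `disk_detach_scenario`, and `disk_attach_scenario` — which a concrete cloud class is expected to implement (the AWS versions are added in `aws_node_scenarios.py` below). A minimal sketch of that contract, with a hypothetical class name and stub bodies for illustration only:

```python
# Illustration only: the hooks node_disk_detach_attach_scenario relies on.
# The real implementations for AWS appear later in this commit.
class example_disk_hooks:
    def get_disk_attachment_info(self, instance_kill_count, node):
        """Return attachment details for the node's non-root disks, or None if there are none."""
        raise NotImplementedError

    def disk_detach_scenario(self, instance_kill_count, node, timeout):
        """Force-detach the node's non-root disks."""
        raise NotImplementedError

    def disk_attach_scenario(self, instance_kill_count, attachment_details, timeout):
        """Re-attach the disks using the previously captured attachment details."""
        raise NotImplementedError
```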
115 changes: 114 additions & 1 deletion krkn/scenario_plugins/node_actions/aws_node_scenarios.py
@@ -12,7 +12,8 @@
class AWS:
    def __init__(self):
        self.boto_client = boto3.client("ec2")
        self.boto_instance = boto3.resource("ec2").Instance("id")
        self.boto_resource = boto3.resource("ec2")
        self.boto_instance = self.boto_resource.Instance("id")

    # Get the instance ID of the node
    def get_instance_id(self, node):
@@ -179,6 +180,72 @@ def delete_network_acl(self, acl_id):

            raise RuntimeError()

    # Detach volumes
    def detach_volumes(self, volumes_ids: list):
        for volume in volumes_ids:
            try:
                self.boto_client.detach_volume(VolumeId=volume, Force=True)
            except Exception as e:
                logging.error(
                    "Detaching volume %s failed with exception: %s"
                    % (volume, e)
                )

    # Attach volume
    def attach_volume(self, attachment: dict):
        try:
            if self.get_volume_state(attachment["VolumeId"]) == "in-use":
                logging.info(
                    "Volume %s is already in use." % attachment["VolumeId"]
                )
                return
            logging.info(
                "Attaching volume %s to instance %s."
                % (attachment["VolumeId"], attachment["InstanceId"])
            )
            self.boto_client.attach_volume(
                InstanceId=attachment["InstanceId"],
                Device=attachment["Device"],
                VolumeId=attachment["VolumeId"]
            )
        except Exception as e:
            logging.error(
                "Failed attaching volume %s to instance %s. "
                "Encountered the following exception: %s"
                % (attachment['VolumeId'], attachment['InstanceId'], e)
            )
            raise RuntimeError()

    # Get the IDs of the node's non-root volumes
    def get_volumes_ids(self, instance_id: list):
        response = self.boto_client.describe_instances(InstanceIds=instance_id)
        instance_attachment_details = response["Reservations"][0]["Instances"][0]["BlockDeviceMappings"]
        root_volume_device_name = self.get_root_volume_id(instance_id)
        volume_ids = []
        for device in instance_attachment_details:
            if device["DeviceName"] != root_volume_device_name:
                volume_id = device["Ebs"]["VolumeId"]
                volume_ids.append(volume_id)
        return volume_ids

    # Get volume attachment details
    def get_volume_attachment_details(self, volume_ids: list):
        response = self.boto_client.describe_volumes(VolumeIds=volume_ids)
        volumes_details = response["Volumes"]
        return volumes_details

    # Get the device name of the instance's root volume
    def get_root_volume_id(self, instance_id):
        instance_id = instance_id[0]
        instance = self.boto_resource.Instance(instance_id)
        root_volume_id = instance.root_device_name
        return root_volume_id

    # Get volume state
    def get_volume_state(self, volume_id: str):
        volume = self.boto_resource.Volume(volume_id)
        state = volume.state
        return state

# krkn_lib
class aws_node_scenarios(abstract_node_scenarios):
@@ -290,3 +357,49 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
logging.error("node_reboot_scenario injection failed!")

raise RuntimeError()

    # Get volume attachment info
    def get_disk_attachment_info(self, instance_kill_count, node):
        for _ in range(instance_kill_count):
            try:
                logging.info("Obtaining disk attachment information")
                instance_id = (self.aws.get_instance_id(node)).split()
                volumes_ids = self.aws.get_volumes_ids(instance_id)
                if volumes_ids:
                    vol_attachment_details = self.aws.get_volume_attachment_details(
                        volumes_ids
                    )
                    return vol_attachment_details
                return
            except Exception as e:
                logging.error(
                    "Failed to obtain disk attachment information of %s node. "
                    "Encountered the following exception: %s." % (node, e)
                )
                raise RuntimeError()

    # Node scenario to detach the volumes
    def disk_detach_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            try:
                logging.info("Starting disk_detach_scenario injection")
                instance_id = (self.aws.get_instance_id(node)).split()
                volumes_ids = self.aws.get_volumes_ids(instance_id)
                logging.info(
                    "Detaching volumes %s from instance %s"
                    % (volumes_ids, node)
                )
                self.aws.detach_volumes(volumes_ids)
            except Exception as e:
                logging.error(
                    "Failed to detach disk from %s node. Encountered the following "
                    "exception: %s." % (node, e)
                )
                raise RuntimeError()

    # Node scenario to attach the volumes
    def disk_attach_scenario(self, instance_kill_count, attachment_details, timeout):
        for _ in range(instance_kill_count):
            for attachment in attachment_details:
                self.aws.attach_volume(attachment["Attachments"][0])
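The committed flow relies on the abstract method's fixed `duration` sleep between detach and attach, and only checks `get_volume_state` before re-attaching. If a stronger guarantee is wanted, boto3's EC2 waiters can block until the volume actually reaches the expected state; the helpers below are a sketch of that idea and are not part of this commit (function names are illustrative):

```python
# Hypothetical helpers (not part of this commit): block until a volume reaches the
# expected state instead of relying only on a fixed sleep between detach and attach.
import boto3


def wait_for_volume_available(volume_id: str, timeout: int = 120):
    # Polls every 5 seconds until the volume has finished detaching.
    waiter = boto3.client("ec2").get_waiter("volume_available")
    waiter.wait(
        VolumeIds=[volume_id],
        WaiterConfig={"Delay": 5, "MaxAttempts": max(1, timeout // 5)},
    )


def wait_for_volume_in_use(volume_id: str, timeout: int = 120):
    # Polls every 5 seconds until the volume is attached again.
    waiter = boto3.client("ec2").get_waiter("volume_in_use")
    waiter.wait(
        VolumeIds=[volume_id],
        WaiterConfig={"Delay": 5, "MaxAttempts": max(1, timeout // 5)},
    )
```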
@@ -163,7 +163,7 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario):
logging.info("action" + str(action))
# Get the scenario specifics for running action nodes
run_kill_count = get_yaml_item_value(node_scenario, "runs", 1)
if action == "node_stop_start_scenario":
if action in ("node_stop_start_scenario", "node_disk_detach_attach_scenario"):
duration = get_yaml_item_value(node_scenario, "duration", 120)

timeout = get_yaml_item_value(node_scenario, "timeout", 120)
@@ -200,6 +200,9 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario):
            node_scenario_object.node_reboot_scenario(
                run_kill_count, single_node, timeout
            )
        elif action == "node_disk_detach_attach_scenario":
            node_scenario_object.node_disk_detach_attach_scenario(
                run_kill_count, single_node, timeout, duration)
        elif action == "stop_start_kubelet_scenario":
            node_scenario_object.stop_start_kubelet_scenario(
                run_kill_count, single_node, timeout
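For a rough picture of how the defaults above resolve for the scenario entry added below: `runs` and `duration` are omitted from the YAML, so the runner falls back to 1 and 120 seconds respectively. A self-contained approximation using plain `dict.get` (the real `get_yaml_item_value` helper may handle empty values differently):

```python
# Self-contained approximation of the default handling in run_node above,
# applied to the new scenario entry from scenarios/openshift/aws_node_scenarios.yml.
node_scenario = {
    "actions": ["node_disk_detach_attach_scenario"],
    "instance_count": 1,
    "timeout": 120,
    "cloud_type": "aws",
}

run_kill_count = node_scenario.get("runs", 1)      # "runs" not set -> 1 injection
duration = node_scenario.get("duration", 120)      # "duration" not set -> 120 s detach window
timeout = node_scenario.get("timeout", 120)        # explicitly set to 120 in the YAML

print(run_kill_count, duration, timeout)  # -> 1 120 120
```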
7 changes: 7 additions & 0 deletions scenarios/openshift/aws_node_scenarios.yml
@@ -16,3 +16,10 @@ node_scenarios:
    instance_count: 1
    timeout: 120
    cloud_type: aws
  - actions:
      - node_disk_detach_attach_scenario
    node_name:
    label_selector:
    instance_count: 1
    timeout: 120
    cloud_type: aws
