Add ebs loss scenario #692

Merged — 1 commit, merged on Dec 10, 2024
5 changes: 4 additions & 1 deletion docs/node_scenarios.md
@@ -4,14 +4,15 @@ The following node chaos scenarios are supported:

1. **node_start_scenario**: Scenario to start the node instance.
2. **node_stop_scenario**: Scenario to stop the node instance.
3. **node_stop_start_scenario**: Scenario to stop and then start the node instance. Not supported on VMware.
3. **node_stop_start_scenario**: Scenario to stop the node instance for the specified duration and then start it. Not supported on VMware.
4. **node_termination_scenario**: Scenario to terminate the node instance.
5. **node_reboot_scenario**: Scenario to reboot the node instance.
6. **stop_kubelet_scenario**: Scenario to stop the kubelet of the node instance.
7. **stop_start_kubelet_scenario**: Scenario to stop and start the kubelet of the node instance.
8. **restart_kubelet_scenario**: Scenario to restart the kubelet of the node instance.
9. **node_crash_scenario**: Scenario to crash the node instance.
10. **stop_start_helper_node_scenario**: Scenario to stop and start the helper node and check service status.
11. **node_disk_detach_attach_scenario**: Scenario to detach the node disk for the specified duration and then attach it back.


**NOTE**: If the node does not recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
@@ -20,6 +21,8 @@ The following node chaos scenarios are supported:
, node_reboot_scenario and stop_start_kubelet_scenario are supported on AWS, Azure, OpenStack, BareMetal, GCP
, VMware and Alibaba.

**NOTE**: node_disk_detach_attach_scenario is supported only on AWS and cannot detach the root disk.

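The root-disk restriction follows from how the scenario enumerates volumes: it compares each block-device mapping against the instance's root device name and skips the match. A minimal sketch of that lookup, mirroring the helper logic added in this PR (the instance ID is hypothetical, and boto3 credentials/region are assumed to be configured):

```python
import boto3

ec2_resource = boto3.resource("ec2")
ec2_client = boto3.client("ec2")

instance_id = "i-0123456789abcdef0"  # hypothetical instance

# Root device name, e.g. "/dev/xvda" -- the one disk that is never detached
root_device = ec2_resource.Instance(instance_id).root_device_name

# Collect the volume IDs of every non-root EBS attachment
mappings = ec2_client.describe_instances(InstanceIds=[instance_id])[
    "Reservations"][0]["Instances"][0]["BlockDeviceMappings"]
non_root_volumes = [
    m["Ebs"]["VolumeId"] for m in mappings if m["DeviceName"] != root_device
]
print(non_root_volumes)
```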

#### AWS

14 changes: 14 additions & 0 deletions krkn/scenario_plugins/node_actions/abstract_node_scenarios.py
@@ -36,6 +36,20 @@ def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout):
self.helper_node_start_scenario(instance_kill_count, node, timeout)
logging.info("helper_node_stop_start_scenario has been successfully injected!")

# Node scenario to detach and attach the disk
def node_disk_detach_attach_scenario(self, instance_kill_count, node, timeout, duration):
logging.info("Starting disk_detach_attach_scenario injection")
disk_attachment_details = self.get_disk_attachment_info(instance_kill_count, node)
if disk_attachment_details:
self.disk_detach_scenario(instance_kill_count, node, timeout)
logging.info("Waiting for %s seconds before attaching the disk" % (duration))
time.sleep(duration)
self.disk_attach_scenario(instance_kill_count, disk_attachment_details, timeout)
logging.info("node_disk_detach_attach_scenario has been successfully injected!")
else:
logging.error("Node %s has only root disk attached" % (node))
logging.error("node_disk_detach_attach_scenario failed!")

# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
pass
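Note: the new node_disk_detach_attach_scenario method calls time.sleep, so it presumably relies on an `import time` at the top of abstract_node_scenarios.py, outside the lines shown in this hunk.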
115 changes: 114 additions & 1 deletion krkn/scenario_plugins/node_actions/aws_node_scenarios.py
@@ -12,7 +12,8 @@
class AWS:
def __init__(self):
self.boto_client = boto3.client("ec2")
self.boto_instance = boto3.resource("ec2").Instance("id")
self.boto_resource = boto3.resource("ec2")
self.boto_instance = self.boto_resource.Instance("id")

# Get the instance ID of the node
def get_instance_id(self, node):
@@ -179,6 +180,72 @@ def delete_network_acl(self, acl_id):

raise RuntimeError()

# Detach volume
def detach_volumes(self, volumes_ids: list):
for volume in volumes_ids:
try:
self.boto_client.detach_volume(VolumeId=volume, Force=True)
except Exception as e:
logging.error(
"Detaching volume %s failed with exception: %s"
% (volume, e)
)
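Note on `Force=True`: a forced detach pulls the volume without giving the guest OS a chance to flush or unmount it, which matches the abrupt-failure intent of a chaos scenario but can corrupt in-flight writes.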

# Attach volume
def attach_volume(self, attachment: dict):
try:
if self.get_volume_state(attachment["VolumeId"]) == "in-use":
logging.info(
"Volume %s is already in use." % attachment["VolumeId"]
)
return
logging.info(
"Attaching the %s volumes to instance %s."
% (attachment["VolumeId"], attachment["InstanceId"])
)
self.boto_client.attach_volume(
InstanceId=attachment["InstanceId"],
Device=attachment["Device"],
VolumeId=attachment["VolumeId"]
)
except Exception as e:
logging.error(
"Failed attaching disk %s to the %s instance. "
"Encountered following exception: %s"
% (attachment['VolumeId'], attachment['InstanceId'], e)
)
raise RuntimeError()

# Get IDs of node volumes
def get_volumes_ids(self, instance_id: list):
response = self.boto_client.describe_instances(InstanceIds=instance_id)
instance_attachment_details = response["Reservations"][0]["Instances"][0]["BlockDeviceMappings"]
root_volume_device_name = self.get_root_volume_id(instance_id)
volume_ids = []
for device in instance_attachment_details:
if device["DeviceName"] != root_volume_device_name:
volume_id = device["Ebs"]["VolumeId"]
volume_ids.append(volume_id)
return volume_ids

# Get volumes attachment details
def get_volume_attachment_details(self, volume_ids: list):
response = self.boto_client.describe_volumes(VolumeIds=volume_ids)
volumes_details = response["Volumes"]
return volumes_details

# Get the root volume device name (despite the method name, this returns e.g. /dev/xvda, not a volume ID)
def get_root_volume_id(self, instance_id):
instance_id = instance_id[0]
instance = self.boto_resource.Instance(instance_id)
root_volume_id = instance.root_device_name
return root_volume_id

# Get volume state
def get_volume_state(self, volume_id: str):
volume = self.boto_resource.Volume(volume_id)
state = volume.state
return state
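For reference, a sketch of the data shape these helpers pass around: get_volume_attachment_details returns the "Volumes" list from describe_volumes, and attach_volume consumes one entry's first Attachments record. All values below are hypothetical:

```python
# Hypothetical entry from describe_volumes()["Volumes"], captured while the
# volume is still attached -- once detached, its Attachments list is empty,
# which is why attachment details are saved before the detach step.
volume_details = {
    "VolumeId": "vol-0abc123def456",
    "State": "in-use",
    "Attachments": [
        {
            "VolumeId": "vol-0abc123def456",
            "InstanceId": "i-0123456789abcdef0",
            "Device": "/dev/sdf",
            "State": "attached",
        }
    ],
}

# attach_volume re-attaches the volume to the same instance and device
attachment = volume_details["Attachments"][0]
```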

# krkn_lib
class aws_node_scenarios(abstract_node_scenarios):
@@ -290,3 +357,49 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
logging.error("node_reboot_scenario injection failed!")

raise RuntimeError()

# Get volume attachment info
def get_disk_attachment_info(self, instance_kill_count, node):
for _ in range(instance_kill_count):
try:
logging.info("Obtaining disk attachment information")
instance_id = (self.aws.get_instance_id(node)).split()
volumes_ids = self.aws.get_volumes_ids(instance_id)
if volumes_ids:
vol_attachment_details = self.aws.get_volume_attachment_details(
volumes_ids
)
return vol_attachment_details
return
except Exception as e:
logging.error(
"Failed to obtain disk attachment information of %s node. "
"Encounteres following exception: %s." % (node, e)
)
raise RuntimeError()

# Node scenario to detach the volume
def disk_detach_scenario(self, instance_kill_count, node, timeout):
for _ in range(instance_kill_count):
try:
logging.info("Starting disk_detach_scenario injection")
instance_id = (self.aws.get_instance_id(node)).split()
volumes_ids = self.aws.get_volumes_ids(instance_id)
logging.info(
"Detaching the %s volumes from instance %s "
% (volumes_ids, node)
)
self.aws.detach_volumes(volumes_ids)
except Exception as e:
logging.error(
"Failed to detach disk from %s node. Encountered following"
"exception: %s." % (node, e)
)
logging.debug("")
raise RuntimeError()

# Node scenario to attach the volume
def disk_attach_scenario(self, instance_kill_count, attachment_details, timeout):
for _ in range(instance_kill_count):
for attachment in attachment_details:
self.aws.attach_volume(attachment["Attachments"][0])
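Taken together, these three methods implement a capture → detach → wait → attach cycle. A minimal sketch of that flow, driven directly through the AWS helper class (the node name is hypothetical, and credentials/region are assumed to be configured):

```python
import time

aws = AWS()

# Capture attachment details first: once a volume is detached, its
# Attachments list is empty, so the details must be saved up front.
instance_id = aws.get_instance_id("ip-10-0-1-23.ec2.internal").split()
volume_ids = aws.get_volumes_ids(instance_id)
details = aws.get_volume_attachment_details(volume_ids)

aws.detach_volumes(volume_ids)
time.sleep(120)  # the scenario's "duration"

for volume in details:
    aws.attach_volume(volume["Attachments"][0])
```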
@@ -163,7 +163,7 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario):
logging.info("action" + str(action))
# Get the scenario specifics for running action nodes
run_kill_count = get_yaml_item_value(node_scenario, "runs", 1)
if action == "node_stop_start_scenario":
if action in ("node_stop_start_scenario", "node_disk_detach_attach_scenario"):
duration = get_yaml_item_value(node_scenario, "duration", 120)

timeout = get_yaml_item_value(node_scenario, "timeout", 120)
@@ -200,6 +200,9 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario):
node_scenario_object.node_reboot_scenario(
run_kill_count, single_node, timeout
)
elif action == "node_disk_detach_attach_scenario":
node_scenario_object.node_disk_detach_attach_scenario(
run_kill_count, single_node, timeout, duration)
elif action == "stop_start_kubelet_scenario":
node_scenario_object.stop_start_kubelet_scenario(
run_kill_count, single_node, timeout
7 changes: 7 additions & 0 deletions scenarios/openshift/aws_node_scenarios.yml
@@ -16,3 +16,10 @@ node_scenarios:
instance_count: 1
timeout: 120
cloud_type: aws
- actions:
- node_disk_detach_attach_scenario
node_name:
label_selector:
instance_count: 1
timeout: 120
cloud_type: aws
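Note that this example omits the optional `duration` key; per the run_node change above, get_yaml_item_value(node_scenario, "duration", 120) supplies the fallback, so the disk stays detached for 120 seconds by default.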