From 0e68dedb12dd897d280164b8bdde7b85299e9f46 Mon Sep 17 00:00:00 2001 From: Paige Patton <64206430+paigerube14@users.noreply.github.com> Date: Fri, 1 Nov 2024 15:16:07 -0400 Subject: [PATCH 01/10] adding ibm shut down scenario (#697) rh-pre-commit.version: 2.2.0 rh-pre-commit.check-secrets: ENABLED Signed-off-by: Auto User Signed-off-by: Paige Patton --- docs/cluster_shut_down_scenarios.md | 1 + .../native/node_scenarios/ibmcloud_plugin.py | 11 ++++++++++- .../node_actions/node_actions_scenario_plugin.py | 14 +++++++------- .../shut_down/shut_down_scenario_plugin.py | 3 +++ 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/docs/cluster_shut_down_scenarios.md b/docs/cluster_shut_down_scenarios.md index bb45f0b5..1baed38e 100644 --- a/docs/cluster_shut_down_scenarios.md +++ b/docs/cluster_shut_down_scenarios.md @@ -8,6 +8,7 @@ Current accepted cloud types: * [GCP](cloud_setup.md#gcp) * [AWS](cloud_setup.md#aws) * [Openstack](cloud_setup.md#openstack) +* [IBMCloud](cloud_setup.md#ibmcloud) ``` diff --git a/krkn/scenario_plugins/native/node_scenarios/ibmcloud_plugin.py b/krkn/scenario_plugins/native/node_scenarios/ibmcloud_plugin.py index f7d52921..93635447 100644 --- a/krkn/scenario_plugins/native/node_scenarios/ibmcloud_plugin.py +++ b/krkn/scenario_plugins/native/node_scenarios/ibmcloud_plugin.py @@ -34,7 +34,16 @@ def __init__(self): self.service.set_service_url(service_url) except Exception as e: logging.error("error authenticating" + str(e)) - sys.exit(1) + + + # Get the instance ID of the node + def get_instance_id(self, node_name): + node_list = self.list_instances() + for node in node_list: + if node_name == node["vpc_name"]: + return node["vpc_id"] + logging.error("Couldn't find node with name " + str(node_name) + ", you could try another region") + sys.exit(1) def delete_instance(self, instance_id): """ diff --git a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py index c49afdaf..421f7472 100644 --- a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py +++ b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py @@ -64,23 +64,23 @@ def get_node_scenario_object(self, node_scenario, kubecli: KrknKubernetes): global node_general node_general = True return general_node_scenarios(kubecli) - if node_scenario["cloud_type"] == "aws": + if node_scenario["cloud_type"].lower() == "aws": return aws_node_scenarios(kubecli) - elif node_scenario["cloud_type"] == "gcp": + elif node_scenario["cloud_type"].lower() == "gcp": return gcp_node_scenarios(kubecli) - elif node_scenario["cloud_type"] == "openstack": + elif node_scenario["cloud_type"].lower() == "openstack": from krkn.scenario_plugins.node_actions.openstack_node_scenarios import ( openstack_node_scenarios, ) return openstack_node_scenarios(kubecli) elif ( - node_scenario["cloud_type"] == "azure" + node_scenario["cloud_type"].lower() == "azure" or node_scenario["cloud_type"] == "az" ): return azure_node_scenarios(kubecli) elif ( - node_scenario["cloud_type"] == "alibaba" + node_scenario["cloud_type"].lower() == "alibaba" or node_scenario["cloud_type"] == "alicloud" ): from krkn.scenario_plugins.node_actions.alibaba_node_scenarios import ( @@ -88,7 +88,7 @@ def get_node_scenario_object(self, node_scenario, kubecli: KrknKubernetes): ) return alibaba_node_scenarios(kubecli) - elif node_scenario["cloud_type"] == "bm": + elif node_scenario["cloud_type"].lower() == "bm": from 
krkn.scenario_plugins.node_actions.bm_node_scenarios import ( bm_node_scenarios, ) @@ -99,7 +99,7 @@ def get_node_scenario_object(self, node_scenario, kubecli: KrknKubernetes): node_scenario.get("bmc_password", None), kubecli, ) - elif node_scenario["cloud_type"] == "docker": + elif node_scenario["cloud_type"].lower() == "docker": return docker_node_scenarios(kubecli) else: logging.error( diff --git a/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py b/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py index ea915e32..b81906ff 100644 --- a/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py +++ b/krkn/scenario_plugins/shut_down/shut_down_scenario_plugin.py @@ -13,6 +13,7 @@ from krkn.scenario_plugins.node_actions.az_node_scenarios import Azure from krkn.scenario_plugins.node_actions.gcp_node_scenarios import GCP from krkn.scenario_plugins.node_actions.openstack_node_scenarios import OPENSTACKCLOUD +from krkn.scenario_plugins.native.node_scenarios.ibmcloud_plugin import IbmCloud class ShutDownScenarioPlugin(AbstractScenarioPlugin): @@ -86,6 +87,8 @@ def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes): cloud_object = OPENSTACKCLOUD() elif cloud_type.lower() in ["azure", "az"]: cloud_object = Azure() + elif cloud_type.lower() in ["ibm", "ibmcloud"]: + cloud_object = IbmCloud() else: logging.error( "Cloud type %s is not currently supported for cluster shut down" From 959766254d3dd61c5fdb9d847be7eb754333b44c Mon Sep 17 00:00:00 2001 From: Naga Ravi Chaitanya Elluri Date: Sat, 2 Nov 2024 18:58:46 -0400 Subject: [PATCH 02/10] Update status of the relevant work items under roadmap Signed-off-by: Naga Ravi Chaitanya Elluri --- ROADMAP.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ROADMAP.md b/ROADMAP.md index e60d471c..31af69d3 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -6,10 +6,11 @@ Following are a list of enhancements that we are planning to work on adding supp - [x] [Centralized storage for chaos experiments artifacts](https://github.com/krkn-chaos/krkn/issues/423) - [ ] [Support for causing DNS outages](https://github.com/krkn-chaos/krkn/issues/394) - [x] [Chaos recommender](https://github.com/krkn-chaos/krkn/tree/main/utils/chaos-recommender) to suggest scenarios having probability of impacting the service under test using profiling results -- [ ] Chaos AI integration to improve and automate test coverage +- [] Chaos AI integration to improve test coverage while reducing fault space to save costs and execution time - [x] [Support for pod level network traffic shaping](https://github.com/krkn-chaos/krkn/issues/393) - [ ] [Ability to visualize the metrics that are being captured by Kraken and stored in Elasticsearch](https://github.com/krkn-chaos/krkn/issues/124) -- [ ] Support for running all the scenarios of Kraken on Kubernetes distribution - see https://github.com/krkn-chaos/krkn/issues/185, https://github.com/redhat-chaos/krkn/issues/186 -- [ ] Continue to improve [Chaos Testing Guide](https://krkn-chaos.github.io/krkn) in terms of adding best practices, test environment recommendations and scenarios to make sure the OpenShift platform, as well the applications running on top it, are resilient and performant under chaotic conditions. 
-- [ ] [Switch documentation references to Kubernetes](https://github.com/krkn-chaos/krkn/issues/495) -- [ ] [OCP and Kubernetes functionalities segregation](https://github.com/krkn-chaos/krkn/issues/497) +- [x] Support for running all the scenarios of Kraken on Kubernetes distribution - see https://github.com/krkn-chaos/krkn/issues/185, https://github.com/redhat-chaos/krkn/issues/186 +- [x] Continue to improve [Chaos Testing Guide](https://krkn-chaos.github.io/krkn) in terms of adding best practices, test environment recommendations and scenarios to make sure the OpenShift platform, as well the applications running on top it, are resilient and performant under chaotic conditions. +- [x] [Switch documentation references to Kubernetes](https://github.com/krkn-chaos/krkn/issues/495) +- [x] [OCP and Kubernetes functionalities segregation](https://github.com/krkn-chaos/krkn/issues/497) +- [x] [Krknctl - client for running Krkn scenarios with ease](https://github.com/krkn-chaos/krknctl) From 949f1f09e03d0da9a603d8d3fb827740ebcc74e7 Mon Sep 17 00:00:00 2001 From: Henrick Goldwurm Date: Wed, 6 Nov 2024 12:58:25 -0500 Subject: [PATCH 03/10] Add support for user-provided default network ACL (#731) * Add support for user-provided default network ACL Signed-off-by: henrick * Add logs to notify user when their provided acl is used Signed-off-by: henrick * Update docs to include optional default_acl_id parameter in zone_outage Signed-off-by: henrick --------- Signed-off-by: henrick Co-authored-by: henrick --- docs/zone_outage.md | 2 ++ .../zone_outage/zone_outage_scenario_plugin.py | 18 ++++++++++++++++-- scenarios/openshift/zone_outage.yaml | 1 + 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/docs/zone_outage.md b/docs/zone_outage.md index 019e3fe0..2337baef 100644 --- a/docs/zone_outage.md +++ b/docs/zone_outage.md @@ -13,10 +13,12 @@ zone_outage: # Scenario to create an out duration: 600 # Duration in seconds after which the zone will be back online. vpc_id: # Cluster virtual private network to target. subnet_id: [subnet1, subnet2] # List of subnet-id's to deny both ingress and egress traffic. + default_acl_id: acl-xxxxxxxx # (Optional) ID of an existing network ACL to use instead of creating a new one. If provided, this ACL will not be deleted after the scenario. ``` **NOTE**: vpc_id and subnet_id can be obtained from the cloud web console by selecting one of the instances in the targeted zone ( us-west-2a for example ). **NOTE**: Multiple zones will experience downtime in case of targeting multiple subnets which might have an impact on the cluster health especially if the zones have control plane components deployed. +**NOTE**: default_acl_id can be obtained from the AWS VPC Console by selecting "Network ACLs" from the left sidebar ( the ID will be in the format 'acl-xxxxxxxx' ). Make sure the selected ACL has the desired ingress/egress rules for your outage scenario ( i.e., deny all ). ##### Debugging steps in case of failures In case of failures during the steps which revert back the network acl to allow traffic and bring back the cluster nodes in the zone, the nodes in the particular zone will be in `NotReady` condition. 
Here is how to fix it: diff --git a/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py index c2a83ee5..bce7d051 100644 --- a/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py +++ b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py @@ -29,6 +29,8 @@ def run( subnet_ids = scenario_config["subnet_id"] duration = scenario_config["duration"] cloud_type = scenario_config["cloud_type"] + # Add support for user-provided default network ACL + default_acl_id = scenario_config.get("default_acl_id") ids = {} acl_ids_created = [] @@ -58,7 +60,20 @@ def run( "Network association ids associated with " "the subnet %s: %s" % (subnet_id, network_association_ids) ) - acl_id = cloud_object.create_default_network_acl(vpc_id) + + # Use provided default ACL if available, otherwise create a new one + if default_acl_id: + acl_id = default_acl_id + logging.info( + "Using provided default ACL ID %s - this ACL will not be deleted after the scenario", + default_acl_id + ) + # Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup + else: + acl_id = cloud_object.create_default_network_acl(vpc_id) + logging.info("Created new default ACL %s", acl_id) + acl_ids_created.append(acl_id) + new_association_id = cloud_object.replace_network_acl_association( network_association_ids[0], acl_id ) @@ -66,7 +81,6 @@ def run( # capture the orginal_acl_id, created_acl_id and # new association_id to use during the recovery ids[new_association_id] = original_acl_id - acl_ids_created.append(acl_id) # wait for the specified duration logging.info( diff --git a/scenarios/openshift/zone_outage.yaml b/scenarios/openshift/zone_outage.yaml index a54c000b..216cf020 100644 --- a/scenarios/openshift/zone_outage.yaml +++ b/scenarios/openshift/zone_outage.yaml @@ -3,3 +3,4 @@ zone_outage: # Scenario to create an out duration: 600 # duration in seconds after which the zone will be back online vpc_id: # cluster virtual private network to target subnet_id: [subnet1, subnet2] # List of subnet-id's to deny both ingress and egress traffic + default_acl_id: acl-xxxxxxxx # (Optional) ID of an existing network ACL to use instead of creating a new one. If provided, this ACL will not be deleted after the scenario. 
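For context on the change above: the ACL swap that the zone outage scenario performs can be sketched with plain boto3 EC2 calls. The snippet below is an illustrative sketch, not the plugin's API; the standalone `swap_subnet_acl` helper and its argument names are hypothetical, and it assumes AWS credentials are configured and that `vpc_id`, `subnet_id` and the optional `default_acl_id` come from the scenario config shown in the patch.

```python
# Illustrative sketch only (hypothetical helper, not the plugin's code): it
# approximates the ACL swap behind the zone outage scenario using plain boto3.
import boto3

ec2 = boto3.client("ec2")

def swap_subnet_acl(subnet_id, vpc_id, default_acl_id=None):
    # Find the ACL association currently attached to the subnet.
    acls = ec2.describe_network_acls(
        Filters=[{"Name": "association.subnet-id", "Values": [subnet_id]}]
    )["NetworkAcls"]
    assoc = next(a for a in acls[0]["Associations"] if a["SubnetId"] == subnet_id)
    original_acl_id = assoc["NetworkAclId"]

    if default_acl_id:
        # User-provided ACL: reuse it and never delete it afterwards.
        acl_id, created = default_acl_id, False
    else:
        # A newly created (non-default) network ACL denies all traffic until rules are added.
        acl_id = ec2.create_network_acl(VpcId=vpc_id)["NetworkAcl"]["NetworkAclId"]
        created = True

    new_assoc = ec2.replace_network_acl_association(
        AssociationId=assoc["NetworkAclAssociationId"], NetworkAclId=acl_id
    )["NewAssociationId"]
    # Keep these to restore original_acl_id later and delete acl_id only if created.
    return original_acl_id, new_assoc, created
```

When `default_acl_id` is supplied, only the association needs to be restored on recovery; the user-provided ACL itself is left in place, matching the behaviour this patch adds.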
From 2549c9a146d9ffccedf3de53e36e49204e41d299 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Tue, 12 Nov 2024 11:25:27 +0100 Subject: [PATCH 04/10] bump werkzeug to 3.0.6 to fix cve on krkn-hub baseimage --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ef8d6498..1a9f5dc6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,7 +32,7 @@ requests==2.32.2 service_identity==24.1.0 PyYAML==6.0.1 setuptools==70.0.0 -werkzeug==3.0.3 +werkzeug==3.0.6 wheel==0.42.0 zope.interface==5.4.0 From 491f59d152b336c99e087d139ba5a3c0f75bd712 Mon Sep 17 00:00:00 2001 From: Paige Patton Date: Mon, 4 Nov 2024 16:10:53 -0500 Subject: [PATCH 05/10] few small changes Signed-off-by: Paige Patton --- docs/node_scenarios.md | 2 + .../node_actions/common_node_functions.py | 38 ++-- .../node_actions_scenario_plugin.py | 190 ++++++++++-------- requirements.txt | 2 +- scenarios/openshift/aws_node_scenarios.yml | 17 +- 5 files changed, 144 insertions(+), 105 deletions(-) diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md index 3913c0f1..6d81b1dd 100644 --- a/docs/node_scenarios.md +++ b/docs/node_scenarios.md @@ -57,6 +57,8 @@ kind was primarily designed for testing Kubernetes itself, but may be used for l #### GCP Cloud setup instructions can be found [here](cloud_setup.md#gcp). Sample scenario config can be found [here](https://github.com/krkn-chaos/krkn/blob/main/scenarios/openshift/gcp_node_scenarios.yml). +NOTE: The parallel option is not available for GCP, the api doesn't perform processes at the same time + #### Openstack diff --git a/krkn/scenario_plugins/node_actions/common_node_functions.py b/krkn/scenario_plugins/node_actions/common_node_functions.py index f4e47ae1..ddd78807 100644 --- a/krkn/scenario_plugins/node_actions/common_node_functions.py +++ b/krkn/scenario_plugins/node_actions/common_node_functions.py @@ -8,19 +8,28 @@ node_general = False +def get_node_by_name(node_name_list, kubecli: KrknKubernetes): + killable_nodes = kubecli.list_killable_nodes() + for node_name in node_name_list: + if node_name not in killable_nodes: + logging.info( + f"Node with provided ${node_name} does not exist or the node might " + "be in NotReady state." + ) + return + return node_name_list + + # Pick a random node with specified label selector -def get_node(node_name, label_selector, instance_kill_count, kubecli: KrknKubernetes): - if node_name in kubecli.list_killable_nodes(): - return [node_name] - elif node_name: - logging.info( - "Node with provided node_name does not exist or the node might " - "be in NotReady state." 
- ) - nodes = kubecli.list_killable_nodes(label_selector) +def get_node(label_selector, instance_kill_count, kubecli: KrknKubernetes): + + label_selector_list = label_selector.split(",") + nodes = [] + for label_selector in label_selector_list: + nodes.extend(kubecli.list_killable_nodes(label_selector)) if not nodes: raise Exception("Ready nodes with the provided label selector do not exist") - logging.info("Ready nodes with the label selector %s: %s" % (label_selector, nodes)) + logging.info("Ready nodes with the label selector %s: %s" % (label_selector_list, nodes)) number_of_nodes = len(nodes) if instance_kill_count == number_of_nodes: return nodes @@ -35,22 +44,19 @@ def get_node(node_name, label_selector, instance_kill_count, kubecli: KrknKubern # krkn_lib # Wait until the node status becomes Ready def wait_for_ready_status(node, timeout, kubecli: KrknKubernetes): - resource_version = kubecli.get_node_resource_version(node) - kubecli.watch_node_status(node, "True", timeout, resource_version) + kubecli.watch_node_status(node, "True", timeout) # krkn_lib # Wait until the node status becomes Not Ready def wait_for_not_ready_status(node, timeout, kubecli: KrknKubernetes): - resource_version = kubecli.get_node_resource_version(node) - kubecli.watch_node_status(node, "False", timeout, resource_version) + kubecli.watch_node_status(node, "False", timeout) # krkn_lib # Wait until the node status becomes Unknown def wait_for_unknown_status(node, timeout, kubecli: KrknKubernetes): - resource_version = kubecli.get_node_resource_version(node) - kubecli.watch_node_status(node, "Unknown", timeout, resource_version) + kubecli.watch_node_status(node, "Unknown", timeout) # Get the ip of the cluster node diff --git a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py index 421f7472..486e8a21 100644 --- a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py +++ b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py @@ -1,5 +1,7 @@ import logging import time +from multiprocessing.pool import ThreadPool +from itertools import repeat import yaml from krkn_lib.k8s import KrknKubernetes @@ -120,100 +122,128 @@ def get_node_scenario_object(self, node_scenario, kubecli: KrknKubernetes): def inject_node_scenario( self, action, node_scenario, node_scenario_object, kubecli: KrknKubernetes ): - generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario") - # Get the node scenario configurations - run_kill_count = get_yaml_item_value(node_scenario, "runs", 1) + + # Get the node scenario configurations for setting nodes + instance_kill_count = get_yaml_item_value(node_scenario, "instance_count", 1) node_name = get_yaml_item_value(node_scenario, "node_name", "") label_selector = get_yaml_item_value(node_scenario, "label_selector", "") + parallel_nodes = get_yaml_item_value(node_scenario, "parallel", False) + + # Get the node to apply the scenario + if node_name: + node_name_list = node_name.split(",") + nodes = common_node_functions.get_node_by_name(node_name_list, kubecli) + else: + nodes = common_node_functions.get_node( + label_selector, instance_kill_count, kubecli + ) + + # GCP api doesn't support multiprocessing calls, will only actually run 1 + if parallel_nodes and node_scenario['cloud_type'].lower() is not "gcp": + self.multiprocess_nodes(nodes, node_scenario_object, action, node_scenario) + else: + for single_node in nodes: + self.run_node(single_node, node_scenario_object, action, 
node_scenario) + + def multiprocess_nodes(self, nodes, node_scenario_object, action, node_scenario): + try: + logging.info("parallely call to nodes") + # pool object with number of element + pool = ThreadPool(processes=len(nodes)) + + pool.starmap(self.run_node,zip(nodes, repeat(node_scenario_object), repeat(action), repeat(node_scenario))) + + pool.close() + except Exception as e: + logging.info("Error on pool multiprocessing: " + str(e)) + + + def run_node(self, single_node, node_scenario_object, action, node_scenario): + logging.info("action" + str(action)) + # Get the scenario specifics for running action nodes + run_kill_count = get_yaml_item_value(node_scenario, "runs", 1) if action == "node_stop_start_scenario": duration = get_yaml_item_value(node_scenario, "duration", 120) + timeout = get_yaml_item_value(node_scenario, "timeout", 120) service = get_yaml_item_value(node_scenario, "service", "") ssh_private_key = get_yaml_item_value( node_scenario, "ssh_private_key", "~/.ssh/id_rsa" ) - # Get the node to apply the scenario - if node_name: - node_name_list = node_name.split(",") - else: - node_name_list = [node_name] - for single_node_name in node_name_list: - nodes = common_node_functions.get_node( - single_node_name, label_selector, instance_kill_count, kubecli + generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario") + + if node_general and action not in generic_cloud_scenarios: + logging.info( + "Scenario: " + + action + + " is not set up for generic cloud type, skipping action" ) - for single_node in nodes: - if node_general and action not in generic_cloud_scenarios: - logging.info( - "Scenario: " - + action - + " is not set up for generic cloud type, skipping action" + else: + if action == "node_start_scenario": + node_scenario_object.node_start_scenario( + run_kill_count, single_node, timeout + ) + elif action == "node_stop_scenario": + node_scenario_object.node_stop_scenario( + run_kill_count, single_node, timeout + ) + elif action == "node_stop_start_scenario": + node_scenario_object.node_stop_start_scenario( + run_kill_count, single_node, timeout, duration + ) + elif action == "node_termination_scenario": + node_scenario_object.node_termination_scenario( + run_kill_count, single_node, timeout + ) + elif action == "node_reboot_scenario": + node_scenario_object.node_reboot_scenario( + run_kill_count, single_node, timeout + ) + elif action == "stop_start_kubelet_scenario": + node_scenario_object.stop_start_kubelet_scenario( + run_kill_count, single_node, timeout + ) + elif action == "restart_kubelet_scenario": + node_scenario_object.restart_kubelet_scenario( + run_kill_count, single_node, timeout + ) + elif action == "stop_kubelet_scenario": + node_scenario_object.stop_kubelet_scenario( + run_kill_count, single_node, timeout + ) + elif action == "node_crash_scenario": + node_scenario_object.node_crash_scenario( + run_kill_count, single_node, timeout + ) + elif action == "stop_start_helper_node_scenario": + if node_scenario["cloud_type"] != "openstack": + logging.error( + "Scenario: " + action + " is not supported for " + "cloud type " + + node_scenario["cloud_type"] + + ", skipping action" ) else: - if action == "node_start_scenario": - node_scenario_object.node_start_scenario( - run_kill_count, single_node, timeout - ) - elif action == "node_stop_scenario": - node_scenario_object.node_stop_scenario( - run_kill_count, single_node, timeout - ) - elif action == "node_stop_start_scenario": - node_scenario_object.node_stop_start_scenario( - run_kill_count, 
single_node, timeout, duration - ) - elif action == "node_termination_scenario": - node_scenario_object.node_termination_scenario( - run_kill_count, single_node, timeout - ) - elif action == "node_reboot_scenario": - node_scenario_object.node_reboot_scenario( - run_kill_count, single_node, timeout - ) - elif action == "stop_start_kubelet_scenario": - node_scenario_object.stop_start_kubelet_scenario( - run_kill_count, single_node, timeout - ) - elif action == "restart_kubelet_scenario": - node_scenario_object.restart_kubelet_scenario( - run_kill_count, single_node, timeout - ) - elif action == "stop_kubelet_scenario": - node_scenario_object.stop_kubelet_scenario( - run_kill_count, single_node, timeout - ) - elif action == "node_crash_scenario": - node_scenario_object.node_crash_scenario( - run_kill_count, single_node, timeout - ) - elif action == "stop_start_helper_node_scenario": - if node_scenario["cloud_type"] != "openstack": - logging.error( - "Scenario: " + action + " is not supported for " - "cloud type " - + node_scenario["cloud_type"] - + ", skipping action" - ) - else: - if not node_scenario["helper_node_ip"]: - logging.error("Helper node IP address is not provided") - raise Exception( - "Helper node IP address is not provided" - ) - node_scenario_object.helper_node_stop_start_scenario( - run_kill_count, node_scenario["helper_node_ip"], timeout - ) - node_scenario_object.helper_node_service_status( - node_scenario["helper_node_ip"], - service, - ssh_private_key, - timeout, - ) - else: - logging.info( - "There is no node action that matches %s, skipping scenario" - % action + if not node_scenario["helper_node_ip"]: + logging.error("Helper node IP address is not provided") + raise Exception( + "Helper node IP address is not provided" ) + node_scenario_object.helper_node_stop_start_scenario( + run_kill_count, node_scenario["helper_node_ip"], timeout + ) + node_scenario_object.helper_node_service_status( + node_scenario["helper_node_ip"], + service, + ssh_private_key, + timeout, + ) + else: + logging.info( + "There is no node action that matches %s, skipping scenario" + % action + ) def get_scenario_types(self) -> list[str]: return ["node_scenarios"] diff --git a/requirements.txt b/requirements.txt index 1a9f5dc6..3f69aff1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ google-api-python-client==2.116.0 ibm_cloud_sdk_core==3.18.0 ibm_vpc==0.20.0 jinja2==3.1.4 -krkn-lib==4.0.3 +krkn-lib==4.0.4 lxml==5.1.0 kubernetes==28.1.0 numpy==1.26.4 diff --git a/scenarios/openshift/aws_node_scenarios.yml b/scenarios/openshift/aws_node_scenarios.yml index 57d00c49..9ce36812 100644 --- a/scenarios/openshift/aws_node_scenarios.yml +++ b/scenarios/openshift/aws_node_scenarios.yml @@ -1,13 +1,14 @@ node_scenarios: - - actions: # node chaos scenarios to be injected + - actions: # node chaos scenarios to be injected - node_stop_start_scenario - node_name: # node on which scenario has to be injected; can set multiple names separated by comma - label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection - instance_count: 1 # Number of nodes to perform action/select that match the label selector - runs: 1 # number of times to inject each scenario under actions (will perform on same node each time) - timeout: 360 # duration to wait for completion of node scenario injection - duration: 120 # duration to stop the node before running the start action - cloud_type: aws # cloud type on 
which Kubernetes/OpenShift runs + node_name: # node on which scenario has to be injected; can set multiple names separated by comma + label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection + instance_count: 2 # Number of nodes to perform action/select that match the label selector + runs: 1 # number of times to inject each scenario under actions (will perform on same node each time) + timeout: 360 # duration to wait for completion of node scenario injection + duration: 20 # duration to stop the node before running the start action + cloud_type: aws # cloud type on which Kubernetes/OpenShift runs + parallel: true # Run action on label or node name in parallel or sequential, defaults to sequential - actions: - node_reboot_scenario node_name: From 0ecba410828cead7873b666e6870c37d6e0fbf79 Mon Sep 17 00:00:00 2001 From: Paige Patton Date: Tue, 12 Nov 2024 09:52:38 -0700 Subject: [PATCH 06/10] adding multi label comment --- scenarios/openshift/aws_node_scenarios.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scenarios/openshift/aws_node_scenarios.yml b/scenarios/openshift/aws_node_scenarios.yml index 9ce36812..95e453f5 100644 --- a/scenarios/openshift/aws_node_scenarios.yml +++ b/scenarios/openshift/aws_node_scenarios.yml @@ -2,7 +2,7 @@ node_scenarios: - actions: # node chaos scenarios to be injected - node_stop_start_scenario node_name: # node on which scenario has to be injected; can set multiple names separated by comma - label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection + label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection; can specify multiple by a comma separated list instance_count: 2 # Number of nodes to perform action/select that match the label selector runs: 1 # number of times to inject each scenario under actions (will perform on same node each time) timeout: 360 # duration to wait for completion of node scenario injection From 10ba53574efeccabc313f78ae5bb20978317ca3b Mon Sep 17 00:00:00 2001 From: Paige Patton Date: Thu, 14 Nov 2024 07:25:19 -0700 Subject: [PATCH 07/10] not equal to gcp Signed-off-by: Paige Patton --- .../node_actions/node_actions_scenario_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py index 486e8a21..cae3e66c 100644 --- a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py +++ b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py @@ -140,7 +140,7 @@ def inject_node_scenario( ) # GCP api doesn't support multiprocessing calls, will only actually run 1 - if parallel_nodes and node_scenario['cloud_type'].lower() is not "gcp": + if parallel_nodes and node_scenario['cloud_type'].lower() != "gcp": self.multiprocess_nodes(nodes, node_scenario_object, action, node_scenario) else: for single_node in nodes: From 97035a765cfa3209def1bde5708a7a4a04fd9a96 Mon Sep 17 00:00:00 2001 From: Paige Patton Date: Thu, 21 Nov 2024 14:33:57 -0500 Subject: [PATCH 08/10] adding get node name list changes Signed-off-by: Paige Patton --- .../network_chaos_scenario_plugin.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) 
diff --git a/krkn/scenario_plugins/network_chaos/network_chaos_scenario_plugin.py b/krkn/scenario_plugins/network_chaos/network_chaos_scenario_plugin.py index eaa0719f..7265b756 100644 --- a/krkn/scenario_plugins/network_chaos/network_chaos_scenario_plugin.py +++ b/krkn/scenario_plugins/network_chaos/network_chaos_scenario_plugin.py @@ -42,19 +42,13 @@ def run( test_egress = get_yaml_item_value( test_dict, "egress", {"bandwidth": "100mbit"} ) + if test_node: node_name_list = test_node.split(",") + nodelst = common_node_functions.get_node_by_name(node_name_list, lib_telemetry.get_lib_kubernetes()) else: - node_name_list = [test_node] - nodelst = [] - for single_node_name in node_name_list: - nodelst.extend( - common_node_functions.get_node( - single_node_name, - test_node_label, - test_instance_count, - lib_telemetry.get_lib_kubernetes(), - ) + nodelst = common_node_functions.get_node( + test_node_label, test_instance_count, lib_telemetry.get_lib_kubernetes() ) file_loader = FileSystemLoader( os.path.abspath(os.path.dirname(__file__)) @@ -149,7 +143,10 @@ def run( finally: logging.info("Deleting jobs") self.delete_job(joblst[:], lib_telemetry.get_lib_kubernetes()) - except (RuntimeError, Exception): + except (RuntimeError, Exception) as e: + logging.error( + "NetworkChaosScenarioPlugin exiting due to Exception %s" % e + ) scenario_telemetry.exit_status = 1 return 1 else: From 2ba20fa48357d71dadb17e3198359a8262cb9331 Mon Sep 17 00:00:00 2001 From: Paige Patton Date: Thu, 5 Dec 2024 11:10:06 -0500 Subject: [PATCH 09/10] adding code bock --- docs/network_chaos.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/network_chaos.md b/docs/network_chaos.md index b69817bd..f1d802d4 100644 --- a/docs/network_chaos.md +++ b/docs/network_chaos.md @@ -18,7 +18,7 @@ network_chaos: # Scenario to create an outage ``` ##### Sample scenario config for ingress traffic shaping (using a plugin) -''' +``` - id: network_chaos config: node_interface_name: # Dictionary with key as node name(s) and value as a list of its interfaces to test @@ -35,7 +35,7 @@ network_chaos: # Scenario to create an outage bandwidth: 10mbit wait_duration: 120 test_duration: 60 - ''' +``` Note: For ingress traffic shaping, ensure that your node doesn't have any [IFB](https://wiki.linuxfoundation.org/networking/ifb) interfaces already present. The scenario relies on creating IFBs to do the shaping, and they are deleted at the end of the scenario. 
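The network chaos change above reuses the node-selection helpers reworked earlier in this series (`get_node_by_name` and the multi-selector `get_node`). The following is a rough, hypothetical sketch of that resolution flow; it is not code from the repository, and it assumes only `KrknKubernetes.list_killable_nodes()` from krkn_lib, which the patches themselves call.

```python
# Hypothetical sketch of target resolution after this series: explicit node names
# are validated against killable (Ready) nodes, while label selectors may be a
# comma-separated list whose matches are merged before a subset is chosen.
import random
from krkn_lib.k8s import KrknKubernetes

def resolve_targets(kubecli: KrknKubernetes, node_names: str = "",
                    label_selector: str = "", instance_count: int = 1):
    if node_names:
        requested = node_names.split(",")
        killable = kubecli.list_killable_nodes()
        # Names that are missing or NotReady are dropped here, whereas the helper
        # in the patch logs the problem and returns early.
        return [name for name in requested if name in killable]

    nodes = []
    for selector in label_selector.split(","):
        nodes.extend(kubecli.list_killable_nodes(selector))
    if not nodes:
        raise Exception("Ready nodes with the provided label selector do not exist")
    if instance_count >= len(nodes):
        return nodes
    # Choosing a subset is shown here as a simple random sample.
    return random.sample(nodes, instance_count)
```

With this flow, a value such as `label_selector: node-role.kubernetes.io/worker,node-role.kubernetes.io/infra` pools Ready nodes from both groups before `instance_count` of them are chosen.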
From 0c30d89a1b9506331de2c7eee871d38a4dc96fba Mon Sep 17 00:00:00 2001 From: jtydlack <139967002+jtydlack@users.noreply.github.com> Date: Wed, 28 Aug 2024 10:20:17 +0200 Subject: [PATCH 10/10] Add node_disk_detach_attach_scenario for aws under node scenarios Resolves #678 Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com> Add functions for aws detach disk scenario Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com> Add detach disk scenario in node scenario Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com> Add disk_deatch_attach_scenario in docs Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com> --- docs/node_scenarios.md | 5 +- .../node_actions/abstract_node_scenarios.py | 14 +++ .../node_actions/aws_node_scenarios.py | 115 +++++++++++++++++- .../node_actions_scenario_plugin.py | 5 +- scenarios/openshift/aws_node_scenarios.yml | 7 ++ 5 files changed, 143 insertions(+), 3 deletions(-) diff --git a/docs/node_scenarios.md b/docs/node_scenarios.md index 6d81b1dd..d85c08c9 100644 --- a/docs/node_scenarios.md +++ b/docs/node_scenarios.md @@ -4,7 +4,7 @@ The following node chaos scenarios are supported: 1. **node_start_scenario**: Scenario to stop the node instance. 2. **node_stop_scenario**: Scenario to stop the node instance. -3. **node_stop_start_scenario**: Scenario to stop and then start the node instance. Not supported on VMware. +3. **node_stop_start_scenario**: Scenario to stop the node instance for specified duration and then start the node instance. Not supported on VMware. 4. **node_termination_scenario**: Scenario to terminate the node instance. 5. **node_reboot_scenario**: Scenario to reboot the node instance. 6. **stop_kubelet_scenario**: Scenario to stop the kubelet of the node instance. @@ -12,6 +12,7 @@ The following node chaos scenarios are supported: 8. **restart_kubelet_scenario**: Scenario to restart the kubelet of the node instance. 9. **node_crash_scenario**: Scenario to crash the node instance. 10. **stop_start_helper_node_scenario**: Scenario to stop and start the helper node and check service status. +11. **node_disk_detach_attach_scenario**: Scenario to detach node disk for specified duration. **NOTE**: If the node does not recover from the node_crash_scenario injection, reboot the node to get it back to Ready state. @@ -20,6 +21,8 @@ The following node chaos scenarios are supported: , node_reboot_scenario and stop_start_kubelet_scenario are supported on AWS, Azure, OpenStack, BareMetal, GCP , VMware and Alibaba. +**NOTE**: node_disk_detach_attach_scenario is supported only on AWS and cannot detach root disk. 
+ #### AWS diff --git a/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py b/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py index 73d3feec..0602dff7 100644 --- a/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/abstract_node_scenarios.py @@ -36,6 +36,20 @@ def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout): self.helper_node_start_scenario(instance_kill_count, node, timeout) logging.info("helper_node_stop_start_scenario has been successfully injected!") + # Node scenario to detach and attach the disk + def node_disk_detach_attach_scenario(self, instance_kill_count, node, timeout, duration): + logging.info("Starting disk_detach_attach_scenario injection") + disk_attachment_details = self.get_disk_attachment_info(instance_kill_count, node) + if disk_attachment_details: + self.disk_detach_scenario(instance_kill_count, node, timeout) + logging.info("Waiting for %s seconds before attaching the disk" % (duration)) + time.sleep(duration) + self.disk_attach_scenario(instance_kill_count, disk_attachment_details, timeout) + logging.info("node_disk_detach_attach_scenario has been successfully injected!") + else: + logging.error("Node %s has only root disk attached" % (node)) + logging.error("node_disk_detach_attach_scenario failed!") + # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): pass diff --git a/krkn/scenario_plugins/node_actions/aws_node_scenarios.py b/krkn/scenario_plugins/node_actions/aws_node_scenarios.py index c715a3e8..f4784506 100644 --- a/krkn/scenario_plugins/node_actions/aws_node_scenarios.py +++ b/krkn/scenario_plugins/node_actions/aws_node_scenarios.py @@ -12,7 +12,8 @@ class AWS: def __init__(self): self.boto_client = boto3.client("ec2") - self.boto_instance = boto3.resource("ec2").Instance("id") + self.boto_resource = boto3.resource("ec2") + self.boto_instance = self.boto_resource.Instance("id") # Get the instance ID of the node def get_instance_id(self, node): @@ -179,6 +180,72 @@ def delete_network_acl(self, acl_id): raise RuntimeError() + # Detach volume + def detach_volumes(self, volumes_ids: list): + for volume in volumes_ids: + try: + self.boto_client.detach_volume(VolumeId=volume, Force=True) + except Exception as e: + logging.error( + "Detaching volume %s failed with exception: %s" + % (volume, e) + ) + + # Attach volume + def attach_volume(self, attachment: dict): + try: + if self.get_volume_state(attachment["VolumeId"]) == "in-use": + logging.info( + "Volume %s is already in use." % attachment["VolumeId"] + ) + return + logging.info( + "Attaching the %s volumes to instance %s." + % (attachment["VolumeId"], attachment["InstanceId"]) + ) + self.boto_client.attach_volume( + InstanceId=attachment["InstanceId"], + Device=attachment["Device"], + VolumeId=attachment["VolumeId"] + ) + except Exception as e: + logging.error( + "Failed attaching disk %s to the %s instance. 
" + "Encountered following exception: %s" + % (attachment['VolumeId'], attachment['InstanceId'], e) + ) + raise RuntimeError() + + # Get IDs of node volumes + def get_volumes_ids(self, instance_id: list): + response = self.boto_client.describe_instances(InstanceIds=instance_id) + instance_attachment_details = response["Reservations"][0]["Instances"][0]["BlockDeviceMappings"] + root_volume_device_name = self.get_root_volume_id(instance_id) + volume_ids = [] + for device in instance_attachment_details: + if device["DeviceName"] != root_volume_device_name: + volume_id = device["Ebs"]["VolumeId"] + volume_ids.append(volume_id) + return volume_ids + + # Get volumes attachment details + def get_volume_attachment_details(self, volume_ids: list): + response = self.boto_client.describe_volumes(VolumeIds=volume_ids) + volumes_details = response["Volumes"] + return volumes_details + + # Get root volume + def get_root_volume_id(self, instance_id): + instance_id = instance_id[0] + instance = self.boto_resource.Instance(instance_id) + root_volume_id = instance.root_device_name + return root_volume_id + + # Get volume state + def get_volume_state(self, volume_id: str): + volume = self.boto_resource.Volume(volume_id) + state = volume.state + return state # krkn_lib class aws_node_scenarios(abstract_node_scenarios): @@ -290,3 +357,49 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): logging.error("node_reboot_scenario injection failed!") raise RuntimeError() + + # Get volume attachment info + def get_disk_attachment_info(self, instance_kill_count, node): + for _ in range(instance_kill_count): + try: + logging.info("Obtaining disk attachment information") + instance_id = (self.aws.get_instance_id(node)).split() + volumes_ids = self.aws.get_volumes_ids(instance_id) + if volumes_ids: + vol_attachment_details = self.aws.get_volume_attachment_details( + volumes_ids + ) + return vol_attachment_details + return + except Exception as e: + logging.error( + "Failed to obtain disk attachment information of %s node. " + "Encounteres following exception: %s." % (node, e) + ) + raise RuntimeError() + + # Node scenario to detach the volume + def disk_detach_scenario(self, instance_kill_count, node, timeout): + for _ in range(instance_kill_count): + try: + logging.info("Starting disk_detach_scenario injection") + instance_id = (self.aws.get_instance_id(node)).split() + volumes_ids = self.aws.get_volumes_ids(instance_id) + logging.info( + "Detaching the %s volumes from instance %s " + % (volumes_ids, node) + ) + self.aws.detach_volumes(volumes_ids) + except Exception as e: + logging.error( + "Failed to detach disk from %s node. Encountered following" + "exception: %s." 
% (node, e) + ) + logging.debug("") + raise RuntimeError() + + # Node scenario to attach the volume + def disk_attach_scenario(self, instance_kill_count, attachment_details, timeout): + for _ in range(instance_kill_count): + for attachment in attachment_details: + self.aws.attach_volume(attachment["Attachments"][0]) diff --git a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py index cae3e66c..f5b91749 100644 --- a/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py +++ b/krkn/scenario_plugins/node_actions/node_actions_scenario_plugin.py @@ -163,7 +163,7 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario): logging.info("action" + str(action)) # Get the scenario specifics for running action nodes run_kill_count = get_yaml_item_value(node_scenario, "runs", 1) - if action == "node_stop_start_scenario": + if action in ("node_stop_start_scenario", "node_disk_detach_attach_scenario"): duration = get_yaml_item_value(node_scenario, "duration", 120) timeout = get_yaml_item_value(node_scenario, "timeout", 120) @@ -200,6 +200,9 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario): node_scenario_object.node_reboot_scenario( run_kill_count, single_node, timeout ) + elif action == "node_disk_detach_attach_scenario": + node_scenario_object.node_disk_detach_attach_scenario( + run_kill_count, single_node, timeout, duration) elif action == "stop_start_kubelet_scenario": node_scenario_object.stop_start_kubelet_scenario( run_kill_count, single_node, timeout diff --git a/scenarios/openshift/aws_node_scenarios.yml b/scenarios/openshift/aws_node_scenarios.yml index 95e453f5..76953786 100644 --- a/scenarios/openshift/aws_node_scenarios.yml +++ b/scenarios/openshift/aws_node_scenarios.yml @@ -16,3 +16,10 @@ node_scenarios: instance_count: 1 timeout: 120 cloud_type: aws + - actions: + - node_disk_detach_attach_scenario + node_name: + label_selector: + instance_count: 1 + timeout: 120 + cloud_type: aws \ No newline at end of file
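The disk detach/attach scenario introduced above reduces to two EC2 operations per volume. The snippet below is a hypothetical, self-contained illustration of that cycle using plain boto3; the explicit state waiters are an addition for the example and are not part of the plugin, which keeps its own record of the original attachments, and the code assumes AWS credentials plus a single non-root EBS attachment as input.

```python
# Hypothetical illustration (not the plugin's code) of the detach/attach cycle
# behind node_disk_detach_attach_scenario. The waiters are extra safety added
# for this example; the scenario itself simply sleeps for `duration` in between.
import time

import boto3

ec2 = boto3.client("ec2")

def detach_then_attach(attachment: dict, pause_seconds: int = 120):
    # `attachment` is one entry of a volume's "Attachments" list, e.g.
    # {"VolumeId": "vol-0abc...", "InstanceId": "i-0abc...", "Device": "/dev/xvdb"}
    ec2.detach_volume(VolumeId=attachment["VolumeId"], Force=True)
    # Block until AWS reports the volume as detached ("available").
    ec2.get_waiter("volume_available").wait(VolumeIds=[attachment["VolumeId"]])

    time.sleep(pause_seconds)  # corresponds to the scenario's `duration` setting

    ec2.attach_volume(
        InstanceId=attachment["InstanceId"],
        Device=attachment["Device"],
        VolumeId=attachment["VolumeId"],
    )
    # Block until the volume is attached again ("in-use").
    ec2.get_waiter("volume_in_use").wait(VolumeIds=[attachment["VolumeId"]])
```

Here `pause_seconds` plays the role of the `duration` field in the aws_node_scenarios.yml entry added by this patch.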