diff --git a/docs/zone_outage.md b/docs/zone_outage.md index 019e3fe0..2337baef 100644 --- a/docs/zone_outage.md +++ b/docs/zone_outage.md @@ -13,10 +13,12 @@ zone_outage: # Scenario to create an out duration: 600 # Duration in seconds after which the zone will be back online. vpc_id: # Cluster virtual private network to target. subnet_id: [subnet1, subnet2] # List of subnet-id's to deny both ingress and egress traffic. + default_acl_id: acl-xxxxxxxx # (Optional) ID of an existing network ACL to use instead of creating a new one. If provided, this ACL will not be deleted after the scenario. ``` **NOTE**: vpc_id and subnet_id can be obtained from the cloud web console by selecting one of the instances in the targeted zone ( us-west-2a for example ). **NOTE**: Multiple zones will experience downtime in case of targeting multiple subnets which might have an impact on the cluster health especially if the zones have control plane components deployed. +**NOTE**: default_acl_id can be obtained from the AWS VPC Console by selecting "Network ACLs" from the left sidebar ( the ID will be in the format 'acl-xxxxxxxx' ). Make sure the selected ACL belongs to the VPC given in vpc_id and has the desired ingress/egress rules for your outage scenario ( i.e., deny all ). ##### Debugging steps in case of failures In case of failures during the steps which revert back the network acl to allow traffic and bring back the cluster nodes in the zone, the nodes in the particular zone will be in `NotReady` condition. 
Here is how to fix it: diff --git a/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py index c2a83ee5..bce7d051 100644 --- a/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py +++ b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py @@ -29,6 +29,8 @@ def run( subnet_ids = scenario_config["subnet_id"] duration = scenario_config["duration"] cloud_type = scenario_config["cloud_type"] + # Add support for user-provided default network ACL + default_acl_id = scenario_config.get("default_acl_id") ids = {} acl_ids_created = [] @@ -58,7 +60,20 @@ def run( "Network association ids associated with " "the subnet %s: %s" % (subnet_id, network_association_ids) ) - acl_id = cloud_object.create_default_network_acl(vpc_id) + + # Use provided default ACL if available, otherwise create a new one + if default_acl_id: + acl_id = default_acl_id + logging.info( + "Using provided default ACL ID %s - this ACL will not be deleted after the scenario", + default_acl_id + ) + # Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup + else: + acl_id = cloud_object.create_default_network_acl(vpc_id) + logging.info("Created new default ACL %s", acl_id) + acl_ids_created.append(acl_id) + new_association_id = cloud_object.replace_network_acl_association( network_association_ids[0], acl_id ) @@ -66,7 +81,6 @@ def run( # capture the orginal_acl_id, created_acl_id and # new association_id to use during the recovery ids[new_association_id] = original_acl_id - acl_ids_created.append(acl_id) # wait for the specified duration logging.info( diff --git a/scenarios/openshift/zone_outage.yaml b/scenarios/openshift/zone_outage.yaml index a54c000b..216cf020 100644 --- a/scenarios/openshift/zone_outage.yaml +++ b/scenarios/openshift/zone_outage.yaml @@ -3,3 +3,4 @@ zone_outage: # Scenario to create an out duration: 600 # duration in seconds after which the zone will be 
back online vpc_id: # cluster virtual private network to target subnet_id: [subnet1, subnet2] # List of subnet-id's to deny both ingress and egress traffic + default_acl_id: acl-xxxxxxxx # (Optional) ID of an existing network ACL to use instead of creating a new one. If provided, this ACL will not be deleted after the scenario.