diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c01a50fc..0baa39bc 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -126,7 +126,7 @@ jobs: cat ./CI/results.markdown >> $GITHUB_STEP_SUMMARY echo >> $GITHUB_STEP_SUMMARY - name: Upload CI logs - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ci-logs path: CI/out @@ -140,13 +140,13 @@ jobs: pip install html2text html2text --ignore-images --ignore-links -b 0 htmlcov/index.html >> $GITHUB_STEP_SUMMARY - name: Upload coverage data - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: coverage path: htmlcov if-no-files-found: error - name: Upload json coverage - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: coverage.json path: coverage.json @@ -169,7 +169,7 @@ jobs: path: krkn-lib-docs ssh-key: ${{ secrets.KRKN_LIB_DOCS_PRIV_KEY }} - name: Download json coverage - uses: actions/download-artifact@v4.1.7 + uses: actions/download-artifact@v4 with: name: coverage.json - name: Set up Python diff --git a/CI/tests/test_telemetry.sh b/CI/tests/test_telemetry.sh index e1f83bf3..dc551061 100644 --- a/CI/tests/test_telemetry.sh +++ b/CI/tests/test_telemetry.sh @@ -26,7 +26,6 @@ function functional_test_telemetry { RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"` $AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files echo "checking if telemetry files are uploaded on s3" - cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 ) cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded" && exit 1 ) cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 ) cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 ) diff --git a/docs/SLOs_validation.md b/docs/SLOs_validation.md index 09905931..dbe9f8d7 100644 --- a/docs/SLOs_validation.md +++ b/docs/SLOs_validation.md @@ -38,11 +38,11 @@ A couple of [alert profiles](https://github.com/redhat-chaos/krkn/tree/main/conf severity: critical ``` -Kube-burner supports setting the severity for the alerts with each one having different effects: +Krkn supports setting the severity for the alerts with each one having different effects: ``` info: Prints an info message with the alarm description to stdout. By default all expressions have this severity. warning: Prints a warning message with the alarm description to stdout. 
-error: Prints a error message with the alarm description to stdout and makes kube-burner rc = 1 +error: Prints an error message with the alarm description to stdout and sets Krkn rc = 1 critical: Prints a fatal message with the alarm description to stdout and exits execution inmediatly with rc != 0 ``` diff --git a/docs/index.md b/docs/index.md index 36f7a3bb..feb5869d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -11,7 +11,7 @@ * [Scenarios](#scenarios) * [Test Environment Recommendations - how and where to run chaos tests](#test-environment-recommendations---how-and-where-to-run-chaos-tests) * [Chaos testing in Practice](#chaos-testing-in-practice) - * [OpenShift oraganization](#openshift-organization) + * [OpenShift organization](#openshift-organization) * [startx-lab](#startx-lab) diff --git a/krkn/scenario_plugins/native/network/ingress_shaping.py b/krkn/scenario_plugins/native/network/ingress_shaping.py index cf74828f..2aa9d84f 100644 --- a/krkn/scenario_plugins/native/network/ingress_shaping.py +++ b/krkn/scenario_plugins/native/network/ingress_shaping.py @@ -18,17 +18,14 @@ @dataclass class NetworkScenarioConfig: - node_interface_name: typing.Dict[ - str, typing.List[str] - ] = field( + node_interface_name: typing.Dict[str, typing.List[str]] = field( default=None, metadata={ "name": "Node Interface Name", - "description": - "Dictionary with node names as key and values as a list of " - "their test interfaces. " - "Required if label_selector is not set.", - } + "description": "Dictionary with node names as key and values as a list of " + "their test interfaces. " + "Required if label_selector is not set.", + }, ) label_selector: typing.Annotated[ typing.Optional[str], validation.min(1) ] = field( default=None, metadata={ "name": "Label selector", - "description": - "Kubernetes label selector for the target nodes. " - "Required if node_interface_name is not set.\n" - "See https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ " # noqa - "for details.", - } + "description": "Kubernetes label selector for the target nodes. " + "Required if node_interface_name is not set.\n" + "See https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ " # noqa + "for details.", + }, ) - test_duration: typing.Annotated[ - typing.Optional[int], - validation.min(1) - ] = field( + test_duration: typing.Annotated[typing.Optional[int], validation.min(1)] = field( default=120, metadata={ "name": "Test duration", - "description": - "Duration for which each step of the ingress chaos testing " - "is to be performed.", + "description": "Duration for which each step of the ingress chaos testing " + "is to be performed.", }, ) - wait_duration: typing.Annotated[ - typing.Optional[int], - validation.min(1) - ] = field( + wait_duration: typing.Annotated[typing.Optional[int], validation.min(1)] = field( default=30, metadata={ "name": "Wait Duration", - "description": - "Wait duration for finishing a test and its cleanup." - "Ensure that it is significantly greater than wait_duration" - } + "description": "Wait duration for finishing a test and its cleanup." 
+ "Ensure that it is significantly greater than wait_duration", + }, ) - instance_count: typing.Annotated[ - typing.Optional[int], - validation.min(1) - ] = field( + instance_count: typing.Annotated[typing.Optional[int], validation.min(1)] = field( default=1, metadata={ "name": "Instance Count", - "description": - "Number of nodes to perform action/select that match " - "the label selector.", - } + "description": "Number of nodes to perform action/select that match " + "the label selector.", + }, ) kubeconfig_path: typing.Optional[str] = field( default=None, metadata={ "name": "Kubeconfig path", - "description": - "Path to your Kubeconfig file. Defaults to ~/.kube/config.\n" - "See https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ " # noqa - "for details.", - } + "description": "Path to your Kubeconfig file. Defaults to ~/.kube/config.\n" + "See https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ " # noqa + "for details.", + }, ) execution_type: typing.Optional[str] = field( - default='parallel', + default="parallel", metadata={ "name": "Execution Type", - "description": - "The order in which the ingress filters are applied. " - "Execution type can be 'serial' or 'parallel'" - } + "description": "The order in which the ingress filters are applied. " + "Execution type can be 'serial' or 'parallel'", + }, ) network_params: typing.Dict[str, str] = field( default=None, metadata={ "name": "Network Parameters", - "description": - "The network filters that are applied on the interface. " - "The currently supported filters are latency, " - "loss and bandwidth" - } + "description": "The network filters that are applied on the interface. " + "The currently supported filters are latency, " + "loss and bandwidth", + }, ) kraken_config: typing.Optional[str] = field( - default='', + default="", metadata={ "name": "Kraken Config", - "description": - "Path to the config file of Kraken. " - "Set this field if you wish to publish status onto Cerberus" - } + "description": "Path to the config file of Kraken. 
" + "Set this field if you wish to publish status onto Cerberus", + }, ) @@ -132,33 +112,30 @@ class NetworkScenarioSuccessOutput: filter_direction: str = field( metadata={ "name": "Filter Direction", - "description": - "Direction in which the traffic control filters are applied " - "on the test interfaces" + "description": "Direction in which the traffic control filters are applied " + "on the test interfaces", } ) test_interfaces: typing.Dict[str, typing.List[str]] = field( metadata={ "name": "Test Interfaces", - "description": - "Dictionary of nodes and their interfaces on which " - "the chaos experiment was performed" + "description": "Dictionary of nodes and their interfaces on which " + "the chaos experiment was performed", } ) network_parameters: typing.Dict[str, str] = field( metadata={ "name": "Network Parameters", - "description": - "The network filters that are applied on the interfaces" + "description": "The network filters that are applied on the interfaces", } ) execution_type: str = field( metadata={ "name": "Execution Type", - "description": "The order in which the filters are applied" + "description": "The order in which the filters are applied", } ) @@ -168,18 +145,13 @@ class NetworkScenarioErrorOutput: error: str = field( metadata={ "name": "Error", - "description": - "Error message when there is a run-time error during " - "the execution of the scenario" + "description": "Error message when there is a run-time error during " + "the execution of the scenario", } ) -def get_default_interface( - node: str, - pod_template, - cli: CoreV1Api -) -> str: +def get_default_interface(node: str, pod_template, cli: CoreV1Api) -> str: """ Function that returns a random interface from a node @@ -210,9 +182,9 @@ def get_default_interface( logging.error("Exception occurred while executing command in pod") sys.exit(1) - routes = output.split('\n') + routes = output.split("\n") for route in routes: - if 'default' in route: + if "default" in route: default_route = route break @@ -226,10 +198,7 @@ def get_default_interface( def verify_interface( - input_interface_list: typing.List[str], - node: str, - pod_template, - cli: CoreV1Api + input_interface_list: typing.List[str], node: str, pod_template, cli: CoreV1Api ) -> typing.List[str]: """ Function that verifies whether a list of interfaces is present in the node. 
@@ -258,22 +227,15 @@ def verify_interface( try: if input_interface_list == []: cmd = ["ip", "r"] - output = kube_helper.exec_cmd_in_pod( - cli, - cmd, - "fedtools", - "default" - ) + output = kube_helper.exec_cmd_in_pod(cli, cmd, "fedtools", "default") if not output: - logging.error( - "Exception occurred while executing command in pod" - ) + logging.error("Exception occurred while executing command in pod") sys.exit(1) - routes = output.split('\n') + routes = output.split("\n") for route in routes: - if 'default' in route: + if "default" in route: default_route = route break @@ -281,20 +243,13 @@ def verify_interface( else: cmd = ["ip", "-br", "addr", "show"] - output = kube_helper.exec_cmd_in_pod( - cli, - cmd, - "fedtools", - "default" - ) + output = kube_helper.exec_cmd_in_pod(cli, cmd, "fedtools", "default") if not output: - logging.error( - "Exception occurred while executing command in pod" - ) + logging.error("Exception occurred while executing command in pod") sys.exit(1) - interface_ip = output.split('\n') + interface_ip = output.split("\n") node_interface_list = [ interface.split()[0] for interface in interface_ip[:-1] ] @@ -302,12 +257,12 @@ def verify_interface( for interface in input_interface_list: if interface not in node_interface_list: logging.error( - "Interface %s not found in node %s interface list %s" % - (interface, node, node_interface_list) + "Interface %s not found in node %s interface list %s" + % (interface, node, node_interface_list) ) raise Exception( - "Interface %s not found in node %s interface list %s" % - (interface, node, node_interface_list) + "Interface %s not found in node %s interface list %s" + % (interface, node, node_interface_list) ) finally: logging.info("Deleteing pod to query interface on node") @@ -321,9 +276,8 @@ def get_node_interfaces( label_selector: str, instance_count: int, pod_template, - cli: CoreV1Api + cli: CoreV1Api, ) -> typing.Dict[str, typing.List[str]]: - """ Function that is used to process the input dictionary with the nodes and its test interfaces. @@ -364,11 +318,7 @@ def get_node_interfaces( nodes = kube_helper.get_node(None, label_selector, instance_count, cli) node_interface_dict = {} for node in nodes: - node_interface_dict[node] = get_default_interface( - node, - pod_template, - cli - ) + node_interface_dict[node] = get_default_interface(node, pod_template, cli) else: node_name_list = node_interface_dict.keys() filtered_node_list = [] @@ -395,9 +345,8 @@ def apply_ingress_filter( batch_cli: BatchV1Api, cli: CoreV1Api, create_interfaces: bool = True, - param_selector: str = 'all' + param_selector: str = "all", ) -> str: - """ Function that applies the filters to shape incoming traffic to the provided node's interfaces. 
@@ -438,22 +387,18 @@ def apply_ingress_filter( """ network_params = cfg.network_params - if param_selector != 'all': + if param_selector != "all": network_params = {param_selector: cfg.network_params[param_selector]} if create_interfaces: create_virtual_interfaces(cli, interface_list, node, pod_template) exec_cmd = get_ingress_cmd( - interface_list, network_params, duration=cfg.test_duration - ) + interface_list, network_params, duration=cfg.test_duration + ) logging.info("Executing %s on node %s" % (exec_cmd, node)) job_body = yaml.safe_load( - job_template.render( - jobname=str(hash(node))[:5], - nodename=node, - cmd=exec_cmd - ) + job_template.render(jobname=str(hash(node))[:5], nodename=node, cmd=exec_cmd) ) api_response = kube_helper.create_job(batch_cli, job_body) @@ -464,10 +409,7 @@ def apply_ingress_filter( def create_virtual_interfaces( - cli: CoreV1Api, - interface_list: typing.List[str], - node: str, - pod_template + cli: CoreV1Api, interface_list: typing.List[str], node: str, pod_template ) -> None: """ Function that creates a privileged pod and uses it to create @@ -488,25 +430,20 @@ def create_virtual_interfaces( - The YAML template used to instantiate a pod to create virtual interfaces on the node """ - pod_body = yaml.safe_load( - pod_template.render(nodename=node) - ) + pod_body = yaml.safe_load(pod_template.render(nodename=node)) kube_helper.create_pod(cli, pod_body, "default", 300) logging.info( "Creating {0} virtual interfaces on node {1} using a pod".format( - len(interface_list), - node + len(interface_list), node ) ) - create_ifb(cli, len(interface_list), 'modtools') + create_ifb(cli, len(interface_list), "modtools") logging.info("Deleting pod used to create virtual interfaces") kube_helper.delete_pod(cli, "modtools", "default") def delete_virtual_interfaces( - cli: CoreV1Api, - node_list: typing.List[str], - pod_template + cli: CoreV1Api, node_list: typing.List[str], pod_template ): """ Function that creates a privileged pod and uses it to delete all @@ -529,14 +466,10 @@ def delete_virtual_interfaces( """ for node in node_list: - pod_body = yaml.safe_load( - pod_template.render(nodename=node) - ) + pod_body = yaml.safe_load(pod_template.render(nodename=node)) kube_helper.create_pod(cli, pod_body, "default", 300) - logging.info( - "Deleting all virtual interfaces on node {0}".format(node) - ) - delete_ifb(cli, 'modtools') + logging.info("Deleting all virtual interfaces on node {0}".format(node)) + delete_ifb(cli, "modtools") kube_helper.delete_pod(cli, "modtools", "default") @@ -546,21 +479,13 @@ def create_ifb(cli: CoreV1Api, number: int, pod_name: str): Makes use of modprobe commands """ - exec_command = [ - 'chroot', '/host', - 'modprobe', 'ifb', 'numifbs=' + str(number) - ] - kube_helper.exec_cmd_in_pod(cli, exec_command, pod_name, 'default') + exec_command = ["chroot", "/host", "modprobe", "ifb", "numifbs=" + str(number)] + kube_helper.exec_cmd_in_pod(cli, exec_command, pod_name, "default") for i in range(0, number): - exec_command = ['chroot', '/host', 'ip', 'link', 'set', 'dev'] - exec_command += ['ifb' + str(i), 'up'] - kube_helper.exec_cmd_in_pod( - cli, - exec_command, - pod_name, - 'default' - ) + exec_command = ["chroot", "/host", "ip", "link", "set", "dev"] + exec_command += ["ifb" + str(i), "up"] + kube_helper.exec_cmd_in_pod(cli, exec_command, pod_name, "default") def delete_ifb(cli: CoreV1Api, pod_name: str): @@ -569,8 +494,8 @@ def delete_ifb(cli: CoreV1Api, pod_name: str): Makes use of modprobe command """ - exec_command = ['chroot', '/host', 
'modprobe', '-r', 'ifb'] - kube_helper.exec_cmd_in_pod(cli, exec_command, pod_name, 'default') + exec_command = ["chroot", "/host", "modprobe", "-r", "ifb"] + kube_helper.exec_cmd_in_pod(cli, exec_command, pod_name, "default") def get_job_pods(cli: CoreV1Api, api_response): @@ -591,18 +516,14 @@ def get_job_pods(cli: CoreV1Api, api_response): controllerUid = api_response.metadata.labels["controller-uid"] pod_label_selector = "controller-uid=" + controllerUid pods_list = kube_helper.list_pods( - cli, - label_selector=pod_label_selector, - namespace="default" + cli, label_selector=pod_label_selector, namespace="default" ) return pods_list[0] def wait_for_job( - batch_cli: BatchV1Api, - job_list: typing.List[str], - timeout: int = 300 + batch_cli: BatchV1Api, job_list: typing.List[str], timeout: int = 300 ) -> None: """ Function that waits for a list of jobs to finish within a time period @@ -625,13 +546,11 @@ def wait_for_job( for job_name in job_list: try: api_response = kube_helper.get_job_status( - batch_cli, - job_name, - namespace="default" + batch_cli, job_name, namespace="default" ) if ( - api_response.status.succeeded is not None or - api_response.status.failed is not None + api_response.status.succeeded is not None + or api_response.status.failed is not None ): count += 1 job_list.remove(job_name) @@ -645,11 +564,7 @@ def wait_for_job( time.sleep(5) -def delete_jobs( - cli: CoreV1Api, - batch_cli: BatchV1Api, - job_list: typing.List[str] -): +def delete_jobs(cli: CoreV1Api, batch_cli: BatchV1Api, job_list: typing.List[str]): """ Function that deletes jobs @@ -667,38 +582,28 @@ def delete_jobs( for job_name in job_list: try: api_response = kube_helper.get_job_status( - batch_cli, - job_name, - namespace="default" + batch_cli, job_name, namespace="default" ) if api_response.status.failed is not None: pod_name = get_job_pods(cli, api_response) - pod_stat = kube_helper.read_pod( - cli, - name=pod_name, - namespace="default" - ) + pod_stat = kube_helper.read_pod(cli, name=pod_name, namespace="default") logging.error(pod_stat.status.container_statuses) pod_log_response = kube_helper.get_pod_log( - cli, - name=pod_name, - namespace="default" + cli, name=pod_name, namespace="default" ) pod_log = pod_log_response.data.decode("utf-8") logging.error(pod_log) except Exception as e: logging.warn("Exception in getting job status: %s" % str(e)) api_response = kube_helper.delete_job( - batch_cli, - name=job_name, - namespace="default" + batch_cli, name=job_name, namespace="default" ) def get_ingress_cmd( interface_list: typing.List[str], network_parameters: typing.Dict[str, str], - duration: int = 300 + duration: int = 300, ): """ Function that returns the commands to the ingress traffic shaping on @@ -736,9 +641,7 @@ def get_ingress_cmd( for i, interface in enumerate(interface_list): if not interface_pattern.match(interface): - logging.error( - "Interface name can only consist of alphanumeric characters" - ) + logging.error("Interface name can only consist of alphanumeric characters") raise Exception( "Interface '{0}' does not match the required regex pattern :" r" ^[a-z0-9\-\@\_]+$".format(interface) @@ -752,33 +655,23 @@ def get_ingress_cmd( "follow the regex pattern ^ifb[0-9]+$".format(ifb_name) ) - tc_set += "tc qdisc add dev {0} handle ffff: ingress;".format( - interface - ) + tc_set += "tc qdisc add dev {0} handle ffff: ingress;".format(interface) tc_set += "tc filter add dev {0} parent ffff: protocol ip u32 match u32 0 0 action mirred egress redirect dev {1};".format( # noqa - interface, - 
ifb_name + interface, ifb_name ) tc_set = "{0} tc qdisc add dev {1} root netem".format(tc_set, ifb_name) tc_unset = "{0} tc qdisc del dev {1} root ;".format(tc_unset, ifb_name) - tc_unset += "tc qdisc del dev {0} handle ffff: ingress;".format( - interface - ) + tc_unset += "tc qdisc del dev {0} handle ffff: ingress;".format(interface) tc_ls = "{0} tc qdisc ls dev {1} ;".format(tc_ls, ifb_name) for parameter in network_parameters.keys(): tc_set += " {0} {1} ".format( - param_map[parameter], - network_parameters[parameter] + param_map[parameter], network_parameters[parameter] ) tc_set += ";" exec_cmd = "{0} {1} sleep {2};{3} sleep 20;{4}".format( - tc_set, - tc_ls, - duration, - tc_unset, - tc_ls + tc_set, tc_ls, duration, tc_unset, tc_ls ) return exec_cmd @@ -790,17 +683,14 @@ def get_ingress_cmd( description="Applies filters to ihe ingress side of node(s) interfaces", outputs={ "success": NetworkScenarioSuccessOutput, - "error": NetworkScenarioErrorOutput + "error": NetworkScenarioErrorOutput, }, ) -def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[ - str, - typing.Union[ - NetworkScenarioSuccessOutput, - NetworkScenarioErrorOutput - ] +def network_chaos( + cfg: NetworkScenarioConfig, +) -> typing.Tuple[ + str, typing.Union[NetworkScenarioSuccessOutput, NetworkScenarioErrorOutput] ]: - """ Function that performs the ingress network chaos scenario based on the provided configuration @@ -826,12 +716,10 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[ cfg.label_selector, cfg.instance_count, pod_interface_template, - cli + cli, ) except Exception: - return "error", NetworkScenarioErrorOutput( - format_exc() - ) + return "error", NetworkScenarioErrorOutput(format_exc()) job_list = [] publish = False if cfg.kraken_config: @@ -840,16 +728,12 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[ with open(cfg.kraken_config, "r") as f: config = yaml.full_load(f) except Exception: - logging.error( - "Error reading Kraken config from %s" % cfg.kraken_config - ) - return "error", NetworkScenarioErrorOutput( - format_exc() - ) + logging.error("Error reading Kraken config from %s" % cfg.kraken_config) + return "error", NetworkScenarioErrorOutput(format_exc()) publish = True try: - if cfg.execution_type == 'parallel': + if cfg.execution_type == "parallel": for node in node_interface_dict: job_list.append( apply_ingress_filter( @@ -859,22 +743,19 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[ pod_module_template, job_template, batch_cli, - cli + cli, ) ) logging.info("Waiting for parallel job to finish") start_time = int(time.time()) - wait_for_job(batch_cli, job_list[:], cfg.test_duration+100) + wait_for_job(batch_cli, job_list[:], cfg.test_duration + 100) end_time = int(time.time()) if publish: cerberus.publish_kraken_status( - config, - failed_post_scenarios, - start_time, - end_time + config, failed_post_scenarios, start_time, end_time ) - elif cfg.execution_type == 'serial': + elif cfg.execution_type == "serial": create_interfaces = True for param in cfg.network_params: for node in node_interface_dict: @@ -888,50 +769,39 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[ batch_cli, cli, create_interfaces=create_interfaces, - param_selector=param + param_selector=param, ) ) logging.info("Waiting for serial job to finish") start_time = int(time.time()) - wait_for_job(batch_cli, job_list[:], cfg.test_duration+100) + wait_for_job(batch_cli, job_list[:], cfg.test_duration + 100) logging.info("Deleting jobs") delete_jobs(cli, 
batch_cli, job_list[:]) job_list = [] - logging.info( - "Waiting for wait_duration : %ss" % cfg.wait_duration - ) + logging.info("Waiting for wait_duration : %ss" % cfg.wait_duration) time.sleep(cfg.wait_duration) end_time = int(time.time()) if publish: cerberus.publish_kraken_status( - config, - failed_post_scenarios, - start_time, - end_time + config, failed_post_scenarios, start_time, end_time ) create_interfaces = False else: return "error", NetworkScenarioErrorOutput( - "Invalid execution type - serial and parallel are " - "the only accepted types" - ) + "Invalid execution type - serial and parallel are " + "the only accepted types" + ) return "success", NetworkScenarioSuccessOutput( filter_direction="ingress", test_interfaces=node_interface_dict, network_parameters=cfg.network_params, - execution_type=cfg.execution_type + execution_type=cfg.execution_type, ) except Exception as e: logging.error("Network Chaos exiting due to Exception - %s" % e) - return "error", NetworkScenarioErrorOutput( - format_exc() - ) + return "error", NetworkScenarioErrorOutput(format_exc()) finally: - delete_virtual_interfaces( - cli, - node_interface_dict.keys(), - pod_module_template - ) + delete_virtual_interfaces(cli, node_interface_dict.keys(), pod_module_template) logging.info("Deleting jobs(if any)") delete_jobs(cli, batch_cli, job_list[:]) diff --git a/krkn/scenario_plugins/native/pod_network_outage/pod_network_outage_plugin.py b/krkn/scenario_plugins/native/pod_network_outage/pod_network_outage_plugin.py index 410a58b7..87d53464 100755 --- a/krkn/scenario_plugins/native/pod_network_outage/pod_network_outage_plugin.py +++ b/krkn/scenario_plugins/native/pod_network_outage/pod_network_outage_plugin.py @@ -42,8 +42,7 @@ def get_test_pods( pod names (string) in the namespace """ pods_list = [] - pods_list = kubecli.list_pods( - label_selector=pod_label, namespace=namespace) + pods_list = kubecli.list_pods(label_selector=pod_label, namespace=namespace) if pod_name and pod_name not in pods_list: raise Exception("pod name not found in namespace ") elif pod_name and pod_name in pods_list: @@ -92,8 +91,7 @@ def delete_jobs(kubecli: KrknKubernetes, job_list: typing.List[str]): for job_name in job_list: try: - api_response = kubecli.get_job_status( - job_name, namespace="default") + api_response = kubecli.get_job_status(job_name, namespace="default") if api_response.status.failed is not None: pod_name = get_job_pods(kubecli, api_response) pod_stat = kubecli.read_pod(name=pod_name, namespace="default") @@ -131,8 +129,7 @@ def wait_for_job( while count != job_len: for job_name in job_list: try: - api_response = kubecli.get_job_status( - job_name, namespace="default") + api_response = kubecli.get_job_status(job_name, namespace="default") if ( api_response.status.succeeded is not None or api_response.status.failed is not None @@ -149,8 +146,7 @@ def wait_for_job( time.sleep(5) -def get_bridge_name(cli: ApiextensionsV1Api, - custom_obj: CustomObjectsApi) -> str: +def get_bridge_name(cli: ApiextensionsV1Api, custom_obj: CustomObjectsApi) -> str: """ Function that gets OVS bridge present in node. 
@@ -328,16 +324,13 @@ def apply_ingress_policy( create_virtual_interfaces(kubecli, len(ips), node, pod_template) for count, pod_ip in enumerate(set(ips)): - pod_inf = get_pod_interface( - node, pod_ip, pod_template, bridge_name, kubecli) + pod_inf = get_pod_interface(node, pod_ip, pod_template, bridge_name, kubecli) exec_cmd = get_ingress_cmd( test_execution, pod_inf, mod, count, network_params, duration ) - logging.info("Executing %s on pod %s in node %s" % - (exec_cmd, pod_ip, node)) + logging.info("Executing %s on pod %s in node %s" % (exec_cmd, pod_ip, node)) job_body = yaml.safe_load( - job_template.render(jobname=mod + str(pod_ip), - nodename=node, cmd=exec_cmd) + job_template.render(jobname=mod + str(pod_ip), nodename=node, cmd=exec_cmd) ) job_list.append(job_body["metadata"]["name"]) api_response = kubecli.create_job(job_body) @@ -405,16 +398,13 @@ def apply_net_policy( job_list = [] for pod_ip in set(ips): - pod_inf = get_pod_interface( - node, pod_ip, pod_template, bridge_name, kubecli) + pod_inf = get_pod_interface(node, pod_ip, pod_template, bridge_name, kubecli) exec_cmd = get_egress_cmd( test_execution, pod_inf, mod, network_params, duration ) - logging.info("Executing %s on pod %s in node %s" % - (exec_cmd, pod_ip, node)) + logging.info("Executing %s on pod %s in node %s" % (exec_cmd, pod_ip, node)) job_body = yaml.safe_load( - job_template.render(jobname=mod + str(pod_ip), - nodename=node, cmd=exec_cmd) + job_template.render(jobname=mod + str(pod_ip), nodename=node, cmd=exec_cmd) ) job_list.append(job_body["metadata"]["name"]) api_response = kubecli.create_job(job_body) @@ -456,18 +446,16 @@ def get_ingress_cmd( Returns: str: ingress filter """ - ifb_dev = 'ifb{0}'.format(count) + ifb_dev = "ifb{0}".format(count) tc_set = tc_unset = tc_ls = "" param_map = {"latency": "delay", "loss": "loss", "bandwidth": "rate"} tc_set = "tc qdisc add dev {0} ingress ;".format(test_interface) tc_set = "{0} tc filter add dev {1} ingress matchall action mirred egress redirect dev {2} ;".format( - tc_set, test_interface, ifb_dev) - tc_set = "{0} tc qdisc replace dev {1} root netem".format( - tc_set, ifb_dev) - tc_unset = "{0} tc qdisc del dev {1} root ;".format( - tc_unset, ifb_dev) - tc_unset = "{0} tc qdisc del dev {1} ingress".format( - tc_unset, test_interface) + tc_set, test_interface, ifb_dev + ) + tc_set = "{0} tc qdisc replace dev {1} root netem".format(tc_set, ifb_dev) + tc_unset = "{0} tc qdisc del dev {1} root ;".format(tc_unset, ifb_dev) + tc_unset = "{0} tc qdisc del dev {1} ingress".format(tc_unset, test_interface) tc_ls = "{0} tc qdisc ls dev {1} ;".format(tc_ls, ifb_dev) if execution == "parallel": for val in vallst.keys(): @@ -475,8 +463,7 @@ def get_ingress_cmd( tc_set += ";" else: tc_set += " {0} {1} ;".format(param_map[mod], vallst[mod]) - exec_cmd = "{0} {1} sleep {2};{3}".format( - tc_set, tc_ls, duration, tc_unset) + exec_cmd = "{0} {1} sleep {2};{3}".format(tc_set, tc_ls, duration, tc_unset) return exec_cmd @@ -512,10 +499,8 @@ def get_egress_cmd( """ tc_set = tc_unset = tc_ls = "" param_map = {"latency": "delay", "loss": "loss", "bandwidth": "rate"} - tc_set = "{0} tc qdisc replace dev {1} root netem".format( - tc_set, test_interface) - tc_unset = "{0} tc qdisc del dev {1} root ;".format( - tc_unset, test_interface) + tc_set = "{0} tc qdisc replace dev {1} root netem".format(tc_set, test_interface) + tc_unset = "{0} tc qdisc del dev {1} root ;".format(tc_unset, test_interface) tc_ls = "{0} tc qdisc ls dev {1} ;".format(tc_ls, test_interface) if execution == "parallel": 
for val in vallst.keys(): @@ -523,17 +508,13 @@ def get_egress_cmd( tc_set += ";" else: tc_set += " {0} {1} ;".format(param_map[mod], vallst[mod]) - exec_cmd = "{0} {1} sleep {2};{3}".format( - tc_set, tc_ls, duration, tc_unset) + exec_cmd = "{0} {1} sleep {2};{3}".format(tc_set, tc_ls, duration, tc_unset) return exec_cmd def create_virtual_interfaces( - kubecli: KrknKubernetes, - nummber: int, - node: str, - pod_template + kubecli: KrknKubernetes, nummber: int, node: str, pod_template ) -> None: """ Function that creates a privileged pod and uses it to create @@ -554,25 +535,18 @@ def create_virtual_interfaces( - The YAML template used to instantiate a pod to create virtual interfaces on the node """ - pod_body = yaml.safe_load( - pod_template.render(nodename=node) - ) + pod_body = yaml.safe_load(pod_template.render(nodename=node)) kubecli.create_pod(pod_body, "default", 300) logging.info( - "Creating {0} virtual interfaces on node {1} using a pod".format( - nummber, - node - ) + "Creating {0} virtual interfaces on node {1} using a pod".format(nummber, node) ) - create_ifb(kubecli, nummber, 'modtools') + create_ifb(kubecli, nummber, "modtools") logging.info("Deleting pod used to create virtual interfaces") kubecli.delete_pod("modtools", "default") def delete_virtual_interfaces( - kubecli: KrknKubernetes, - node_list: typing.List[str], - pod_template + kubecli: KrknKubernetes, node_list: typing.List[str], pod_template ): """ Function that creates a privileged pod and uses it to delete all @@ -595,14 +569,10 @@ def delete_virtual_interfaces( """ for node in node_list: - pod_body = yaml.safe_load( - pod_template.render(nodename=node) - ) + pod_body = yaml.safe_load(pod_template.render(nodename=node)) kubecli.create_pod(pod_body, "default", 300) - logging.info( - "Deleting all virtual interfaces on node {0}".format(node) - ) - delete_ifb(kubecli, 'modtools') + logging.info("Deleting all virtual interfaces on node {0}".format(node)) + delete_ifb(kubecli, "modtools") kubecli.delete_pod("modtools", "default") @@ -612,24 +582,14 @@ def create_ifb(kubecli: KrknKubernetes, number: int, pod_name: str): Makes use of modprobe commands """ - exec_command = [ - '/host', - 'modprobe', 'ifb', 'numifbs=' + str(number) - ] - kubecli.exec_cmd_in_pod( - exec_command, - pod_name, - 'default', - base_command="chroot") + exec_command = ["/host", "modprobe", "ifb", "numifbs=" + str(number)] + kubecli.exec_cmd_in_pod(exec_command, pod_name, "default", base_command="chroot") for i in range(0, number): - exec_command = ['/host', 'ip', 'link', 'set', 'dev'] - exec_command += ['ifb' + str(i), 'up'] + exec_command = ["/host", "ip", "link", "set", "dev"] + exec_command += ["ifb" + str(i), "up"] kubecli.exec_cmd_in_pod( - exec_command, - pod_name, - 'default', - base_command="chroot" + exec_command, pod_name, "default", base_command="chroot" ) @@ -639,17 +599,11 @@ def delete_ifb(kubecli: KrknKubernetes, pod_name: str): Makes use of modprobe command """ - exec_command = ['/host', 'modprobe', '-r', 'ifb'] - kubecli.exec_cmd_in_pod( - exec_command, - pod_name, - 'default', - base_command="chroot") + exec_command = ["/host", "modprobe", "-r", "ifb"] + kubecli.exec_cmd_in_pod(exec_command, pod_name, "default", base_command="chroot") -def list_bridges( - node: str, pod_template, kubecli: KrknKubernetes -) -> typing.List[str]: +def list_bridges(node: str, pod_template, kubecli: KrknKubernetes) -> typing.List[str]: """ Function that returns a list of bridges on the node @@ -787,7 +741,7 @@ def get_pod_interface( find_ip = 
f"external-ids:ip_addresses={ip}/23" else: find_ip = f"external-ids:ip={ip}" - + cmd = [ "/host", "ovs-vsctl", @@ -797,24 +751,20 @@ def get_pod_interface( "interface", find_ip, ] - + output = kubecli.exec_cmd_in_pod( cmd, "modtools", "default", base_command="chroot" ) if not output: - cmd= [ - "/host", - "ip", - "addr", - "show" - ] + cmd = ["/host", "ip", "addr", "show"] output = kubecli.exec_cmd_in_pod( - cmd, "modtools", "default", base_command="chroot") + cmd, "modtools", "default", base_command="chroot" + ) for if_str in output.split("\n"): - if re.search(ip,if_str): - inf = if_str.split(' ')[-1] + if re.search(ip, if_str): + inf = if_str.split(" ")[-1] else: - inf = output + inf = output finally: logging.info("Deleting pod to query interface on node") kubecli.delete_pod("modtools", "default") @@ -927,11 +877,11 @@ class InputParams: }, ) - kraken_config: typing.Optional[str] = field( + kraken_config: typing.Dict[str, typing.Any] = field( default=None, metadata={ "name": "Kraken Config", - "description": "Path to the config file of Kraken. " + "description": "Kraken config file dictionary " "Set this field if you wish to publish status onto Cerberus", }, ) @@ -1043,14 +993,6 @@ def pod_outage( publish = False if params.kraken_config: - failed_post_scenarios = "" - try: - with open(params.kraken_config, "r") as f: - config = yaml.full_load(f) - except Exception: - logging.error("Error reading Kraken config from %s" % - params.kraken_config) - return "error", PodOutageErrorOutput(format_exc()) publish = True for i in params.direction: @@ -1106,7 +1048,7 @@ def pod_outage( end_time = int(time.time()) if publish: cerberus.publish_kraken_status( - config, failed_post_scenarios, start_time, end_time + params.kraken_config, "", start_time, end_time ) return "success", PodOutageSuccessOutput( @@ -1116,8 +1058,7 @@ def pod_outage( egress_ports=params.egress_ports, ) except Exception as e: - logging.error( - "Pod network outage scenario exiting due to Exception - %s" % e) + logging.error("Pod network outage scenario exiting due to Exception - %s" % e) return "error", PodOutageErrorOutput(format_exc()) finally: logging.info("Deleting jobs(if any)") @@ -1179,11 +1120,11 @@ class EgressParams: }, ) - kraken_config: typing.Optional[str] = field( + kraken_config: typing.Dict[str, typing.Any] = field( default=None, metadata={ "name": "Kraken Config", - "description": "Path to the config file of Kraken. 
" + "description": "Krkn config file dictionary " "Set this field if you wish to publish status onto Cerberus", }, ) @@ -1276,8 +1217,7 @@ class PodEgressNetShapingErrorOutput: def pod_egress_shaping( params: EgressParams, ) -> typing.Tuple[ - str, typing.Union[PodEgressNetShapingSuccessOutput, - PodEgressNetShapingErrorOutput] + str, typing.Union[PodEgressNetShapingSuccessOutput, PodEgressNetShapingErrorOutput] ]: """ Function that performs egress pod traffic shaping based @@ -1302,14 +1242,6 @@ def pod_egress_shaping( publish = False if params.kraken_config: - failed_post_scenarios = "" - try: - with open(params.kraken_config, "r") as f: - config = yaml.full_load(f) - except Exception: - logging.error("Error reading Kraken config from %s" % - params.kraken_config) - return "error", PodEgressNetShapingErrorOutput(format_exc()) publish = True try: @@ -1344,30 +1276,30 @@ def pod_egress_shaping( for mod in mod_lst: for node, ips in node_dict.items(): - job_list.extend( apply_net_policy( - mod, - node, - ips, - job_template, - pod_module_template, - params.network_params, - params.test_duration, - br_name, - kubecli, - params.execution_type, - )) + job_list.extend( + apply_net_policy( + mod, + node, + ips, + job_template, + pod_module_template, + params.network_params, + params.test_duration, + br_name, + kubecli, + params.execution_type, + ) + ) if params.execution_type == "serial": logging.info("Waiting for serial job to finish") start_time = int(time.time()) - wait_for_job(job_list[:], kubecli, - params.test_duration + 20) - logging.info("Waiting for wait_duration %s" % - params.test_duration) + wait_for_job(job_list[:], kubecli, params.test_duration + 20) + logging.info("Waiting for wait_duration %s" % params.test_duration) time.sleep(params.test_duration) end_time = int(time.time()) if publish: cerberus.publish_kraken_status( - config, failed_post_scenarios, start_time, end_time + params.kraken_config, "", start_time, end_time ) if params.execution_type == "parallel": break @@ -1380,7 +1312,7 @@ def pod_egress_shaping( end_time = int(time.time()) if publish: cerberus.publish_kraken_status( - config, failed_post_scenarios, start_time, end_time + params.kraken_config, "", start_time, end_time ) return "success", PodEgressNetShapingSuccessOutput( @@ -1389,8 +1321,7 @@ def pod_egress_shaping( execution_type=params.execution_type, ) except Exception as e: - logging.error( - "Pod network Shaping scenario exiting due to Exception - %s" % e) + logging.error("Pod network Shaping scenario exiting due to Exception - %s" % e) return "error", PodEgressNetShapingErrorOutput(format_exc()) finally: logging.info("Deleting jobs(if any)") @@ -1452,7 +1383,7 @@ class IngressParams: }, ) - kraken_config: typing.Optional[str] = field( + kraken_config: typing.Dict[str, typing.Any] = field( default=None, metadata={ "name": "Kraken Config", @@ -1549,8 +1480,8 @@ class PodIngressNetShapingErrorOutput: def pod_ingress_shaping( params: IngressParams, ) -> typing.Tuple[ - str, typing.Union[PodIngressNetShapingSuccessOutput, - PodIngressNetShapingErrorOutput] + str, + typing.Union[PodIngressNetShapingSuccessOutput, PodIngressNetShapingErrorOutput], ]: """ Function that performs ingress pod traffic shaping based @@ -1575,14 +1506,6 @@ def pod_ingress_shaping( publish = False if params.kraken_config: - failed_post_scenarios = "" - try: - with open(params.kraken_config, "r") as f: - config = yaml.full_load(f) - except Exception: - logging.error("Error reading Kraken config from %s" % - params.kraken_config) - return 
"error", PodIngressNetShapingErrorOutput(format_exc()) publish = True try: @@ -1617,30 +1540,30 @@ def pod_ingress_shaping( for mod in mod_lst: for node, ips in node_dict.items(): - job_list.extend(apply_ingress_policy( - mod, - node, - ips, - job_template, - pod_module_template, - params.network_params, - params.test_duration, - br_name, - kubecli, - params.execution_type, - )) + job_list.extend( + apply_ingress_policy( + mod, + node, + ips, + job_template, + pod_module_template, + params.network_params, + params.test_duration, + br_name, + kubecli, + params.execution_type, + ) + ) if params.execution_type == "serial": logging.info("Waiting for serial job to finish") start_time = int(time.time()) - wait_for_job(job_list[:], kubecli, - params.test_duration + 20) - logging.info("Waiting for wait_duration %s" % - params.test_duration) + wait_for_job(job_list[:], kubecli, params.test_duration + 20) + logging.info("Waiting for wait_duration %s" % params.test_duration) time.sleep(params.test_duration) end_time = int(time.time()) if publish: cerberus.publish_kraken_status( - config, failed_post_scenarios, start_time, end_time + params.kraken_config, "", start_time, end_time ) if params.execution_type == "parallel": break @@ -1653,7 +1576,7 @@ def pod_ingress_shaping( end_time = int(time.time()) if publish: cerberus.publish_kraken_status( - config, failed_post_scenarios, start_time, end_time + params.kraken_config, "", start_time, end_time ) return "success", PodIngressNetShapingSuccessOutput( @@ -1662,14 +1585,9 @@ def pod_ingress_shaping( execution_type=params.execution_type, ) except Exception as e: - logging.error( - "Pod network Shaping scenario exiting due to Exception - %s" % e) + logging.error("Pod network Shaping scenario exiting due to Exception - %s" % e) return "error", PodIngressNetShapingErrorOutput(format_exc()) finally: - delete_virtual_interfaces( - kubecli, - node_dict.keys(), - pod_module_template - ) + delete_virtual_interfaces(kubecli, node_dict.keys(), pod_module_template) logging.info("Deleting jobs(if any)") delete_jobs(kubecli, job_list[:]) diff --git a/krkn/scenario_plugins/pvc/pvc_scenario_plugin.py b/krkn/scenario_plugins/pvc/pvc_scenario_plugin.py index d842e955..731f02b5 100644 --- a/krkn/scenario_plugins/pvc/pvc_scenario_plugin.py +++ b/krkn/scenario_plugins/pvc/pvc_scenario_plugin.py @@ -29,6 +29,9 @@ def run( pvc_name = get_yaml_item_value(scenario_config, "pvc_name", "") pod_name = get_yaml_item_value(scenario_config, "pod_name", "") namespace = get_yaml_item_value(scenario_config, "namespace", "") + block_size = get_yaml_item_value( + scenario_config, "block_size", "102400" + ) target_fill_percentage = get_yaml_item_value( scenario_config, "fill_percentage", "50" ) @@ -176,10 +179,39 @@ def run( start_time = int(time.time()) # Create temp file in the PVC full_path = "%s/%s" % (str(mount_path), str(file_name)) - command = "fallocate -l $((%s*1024)) %s" % ( - str(file_size_kb), - str(full_path), + + fallocate = lib_telemetry.get_lib_kubernetes().exec_cmd_in_pod( + ["command -v fallocate"], + pod_name, + namespace, + container_name, + ) + + dd = lib_telemetry.get_lib_kubernetes().exec_cmd_in_pod( + ["command -v dd"], + pod_name, + namespace, + container_name, ) + + if fallocate: + command = "fallocate -l $((%s*1024)) %s" % ( + str(file_size_kb), + str(full_path), + ) + elif dd is not None: + block_size = int(block_size) + blocks = int(file_size_kb / int(block_size / 1024)) + logging.warning( + "fallocate not found, using dd, it may take longer based on the amount 
of data, please wait..." + ) + command = f"dd if=/dev/urandom of={str(full_path)} bs={str(block_size)} count={str(blocks)} oflag=direct" + else: + logging.error( + "failed to locate required binaries fallocate or dd to execute the scenario" + ) + return 1 + logging.debug("Create temp file in the PVC command:\n %s" % command) lib_telemetry.get_lib_kubernetes().exec_cmd_in_pod( [command], @@ -214,45 +246,6 @@ def run( ) return 1 - # Calculate file size - file_size_kb = int( - (float(target_fill_percentage / 100) * float(pvc_capacity_kb)) - - float(pvc_used_kb) - ) - logging.debug("File size: %s KB" % file_size_kb) - - file_name = "kraken.tmp" - logging.info( - "Creating %s file, %s KB size, in pod %s at %s (ns %s)" - % ( - str(file_name), - str(file_size_kb), - str(pod_name), - str(mount_path), - str(namespace), - ) - ) - - start_time = int(time.time()) - # Create temp file in the PVC - full_path = "%s/%s" % (str(mount_path), str(file_name)) - command = "fallocate -l $((%s*1024)) %s" % ( - str(file_size_kb), - str(full_path), - ) - logging.debug("Create temp file in the PVC command:\n %s" % command) - lib_telemetry.get_lib_kubernetes().exec_cmd_in_pod( - [command], pod_name, namespace, container_name - ) - - # Check if file is created - command = "ls -lh %s" % (str(mount_path)) - logging.debug("Check file is created command:\n %s" % command) - response = lib_telemetry.get_lib_kubernetes().exec_cmd_in_pod( - [command], pod_name, namespace, container_name - ) - logging.info("\n" + str(response)) - if str(file_name).lower() in str(response).lower(): logging.info( "Waiting for the specified duration in the config: %ss" % duration ) diff --git a/requirements.txt b/requirements.txt index d6712c6d..ef8d6498 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ google-api-python-client==2.116.0 ibm_cloud_sdk_core==3.18.0 ibm_vpc==0.20.0 jinja2==3.1.4 -krkn-lib==4.0.2 +krkn-lib==4.0.3 lxml==5.1.0 kubernetes==28.1.0 numpy==1.26.4 diff --git a/run_kraken.py b/run_kraken.py index 7e21576e..752b55f3 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -628,7 +628,7 @@ def main(cfg) -> int: junit_testcase_xml = get_junit_test_case( success=True if retval == 0 else False, time=int(junit_endtime - junit_start_time), - test_suite_name="krkn-test-suite", + test_suite_name="chaos-krkn", test_case_description=options.junit_testcase, test_stdout=tee_handler.get_output(), test_version=options.junit_testcase_version, diff --git a/scenarios/openshift/pvc_scenario.yaml b/scenarios/openshift/pvc_scenario.yaml index daa3e297..54bdb93b 100644 --- a/scenarios/openshift/pvc_scenario.yaml +++ b/scenarios/openshift/pvc_scenario.yaml @@ -4,3 +4,4 @@ pvc_scenario: namespace: # Namespace where the PVC is fill_percentage: 50 # Target percentage to fill up the cluster, value must be higher than current percentage, valid values are between 0 and 99 duration: 60 # Duration in seconds for the fault + block_size: 102400 # used only by dd if fallocate not present in the container diff --git a/utils/arcaflow/ocp-chaos/README.md b/utils/arcaflow/ocp-chaos/README.md new file mode 100644 index 00000000..e645fbaf --- /dev/null +++ b/utils/arcaflow/ocp-chaos/README.md @@ -0,0 +1,304 @@ +# OpenShift Shenanigans + +## Workflow Description + +Given a target OpenShift cluster, this workflow executes a +[kube-burner plugin](https://github.com/redhat-performance/arcaflow-plugin-kube-burner) +workflow to place a load on the cluster, repeatedly removes a targeted pod at a given time frequency with the [kill-pod 
plugin](https://github.com/krkn-chaos/arcaflow-plugin-kill-pod), +and runs a [stress-ng](https://github.com/ColinIanKing/stress-ng) CPU workload on the cluster. +Target your OpenShift cluster with the appropriate `kubeconfig` file, and add its file path as +the value for `kubernetes_target.kubeconfig_path` in the input file. Any combination of subworkflows can be disabled in the input file by setting either `cpu_hog_enabled`, `pod_chaos_enabled`, or `kubeburner_enabled` to `false`. + + +## Files + +- [`workflow.yaml`](workflow.yaml) -- Defines the workflow input schema, the plugins to run + and their data relationships, and the output to present to the user +- [`input.yaml`](input.yaml) -- The input parameters that the user provides for running + the workflow +- [`config.yaml`](config.yaml) -- Global config parameters that are passed to the Arcaflow + engine +- [`cpu-hog.yaml`](subworkflows/cpu-hog.yaml) -- The StressNG workload on the CPU. +- [`kubeburner.yaml`](subworkflows/kubeburner.yaml) -- The KubeBurner workload for the Kubernetes Cluster API. +- [`pod-chaos.yaml`](subworkflows/pod-chaos.yaml) -- The Kill Pod workflow for the Kubernetes infrastructure pods. + +## Running the Workflow + +### Workflow Dependencies + +Install Python `3.9` or later. + +First, add the path to your Python interpreter to `config.yaml` as the value +for `pythonPath` as shown here. A common choice for users working in +distributions of Linux operating systems is `/usr/bin/python`. Second, add a +directory to which your Arcaflow process will have write access as the +value for `workdir`; `/tmp` is a common choice because your process will likely be able to write to it. + +```yaml +deployers: + python: + pythonPath: /usr/bin/python + workdir: /tmp +``` + +To use this Python interpreter with our `kill-pod` plugin, go to the `deploy` section of the `kill_pod` step in [`pod-chaos.yaml`](subworkflows/pod-chaos.yaml). You can use the same `pythonPath` and `workdir` that you used in +your `config.yaml`. + +```yaml +deploy: + deployer_name: python + modulePullPolicy: Always + pythonPath: /usr/bin/python + workdir: /tmp +``` + +Download a Go binary of the latest version of the Arcaflow engine from: https://github.com/arcalot/arcaflow-engine/releases. + +#### OpenShift Target + +Target your desired OpenShift cluster by setting the `kubeconfig_path` variable for each subworkflow's parameter list in [`input.yaml`](input.yaml). + +#### Kube-Burner Plugin + +The `kube-burner` plugin generates and reports the UUID to which the +`kube-burner` data is associated in your search database. The `uuidgen` +workflow step uses the `arcaflow-plugin-utilities` `uuid` plugin step to +randomly generate a UUID for you. + +### Workflow Execution + +Run the workflow: +``` +$ export WFPATH= +$ arcaflow --context ${WFPATH} --input input.yaml --config config.yaml --workflow workflow.yaml +``` + +## Workflow Diagram +This diagram shows the complete end-to-end workflow logic. 
+
+### Main Workflow
+
+```mermaid
+%% Mermaid markdown workflow
+flowchart LR
+%% Success path
+input-->steps.cpu_hog_wf.enabling
+input-->steps.cpu_hog_wf.execute
+input-->steps.kubeburner_wf.enabling
+input-->steps.kubeburner_wf.execute
+input-->steps.pod_chaos_wf.enabling
+input-->steps.pod_chaos_wf.execute
+outputs.workflow_success.cpu_hog-->outputs.workflow_success
+outputs.workflow_success.cpu_hog.disabled-->outputs.workflow_success.cpu_hog
+outputs.workflow_success.cpu_hog.enabled-->outputs.workflow_success.cpu_hog
+outputs.workflow_success.kubeburner-->outputs.workflow_success
+outputs.workflow_success.kubeburner.disabled-->outputs.workflow_success.kubeburner
+outputs.workflow_success.kubeburner.enabled-->outputs.workflow_success.kubeburner
+outputs.workflow_success.pod_chaos-->outputs.workflow_success
+outputs.workflow_success.pod_chaos.disabled-->outputs.workflow_success.pod_chaos
+outputs.workflow_success.pod_chaos.enabled-->outputs.workflow_success.pod_chaos
+steps.cpu_hog_wf.closed-->steps.cpu_hog_wf.closed.result
+steps.cpu_hog_wf.disabled-->steps.cpu_hog_wf.disabled.output
+steps.cpu_hog_wf.disabled.output-->outputs.workflow_success.cpu_hog.disabled
+steps.cpu_hog_wf.enabling-->steps.cpu_hog_wf.closed
+steps.cpu_hog_wf.enabling-->steps.cpu_hog_wf.disabled
+steps.cpu_hog_wf.enabling-->steps.cpu_hog_wf.enabling.resolved
+steps.cpu_hog_wf.enabling-->steps.cpu_hog_wf.execute
+steps.cpu_hog_wf.execute-->steps.cpu_hog_wf.outputs
+steps.cpu_hog_wf.outputs-->steps.cpu_hog_wf.outputs.success
+steps.cpu_hog_wf.outputs.success-->outputs.workflow_success.cpu_hog.enabled
+steps.kubeburner_wf.closed-->steps.kubeburner_wf.closed.result
+steps.kubeburner_wf.disabled-->steps.kubeburner_wf.disabled.output
+steps.kubeburner_wf.disabled.output-->outputs.workflow_success.kubeburner.disabled
+steps.kubeburner_wf.enabling-->steps.kubeburner_wf.closed
+steps.kubeburner_wf.enabling-->steps.kubeburner_wf.disabled
+steps.kubeburner_wf.enabling-->steps.kubeburner_wf.enabling.resolved
+steps.kubeburner_wf.enabling-->steps.kubeburner_wf.execute
+steps.kubeburner_wf.execute-->steps.kubeburner_wf.outputs
+steps.kubeburner_wf.outputs-->steps.kubeburner_wf.outputs.success
+steps.kubeburner_wf.outputs.success-->outputs.workflow_success.kubeburner.enabled
+steps.pod_chaos_wf.closed-->steps.pod_chaos_wf.closed.result
+steps.pod_chaos_wf.disabled-->steps.pod_chaos_wf.disabled.output
+steps.pod_chaos_wf.disabled.output-->outputs.workflow_success.pod_chaos.disabled
+steps.pod_chaos_wf.enabling-->steps.pod_chaos_wf.closed
+steps.pod_chaos_wf.enabling-->steps.pod_chaos_wf.disabled
+steps.pod_chaos_wf.enabling-->steps.pod_chaos_wf.enabling.resolved
+steps.pod_chaos_wf.enabling-->steps.pod_chaos_wf.execute
+steps.pod_chaos_wf.execute-->steps.pod_chaos_wf.outputs
+steps.pod_chaos_wf.outputs-->steps.pod_chaos_wf.outputs.success
+steps.pod_chaos_wf.outputs.success-->outputs.workflow_success.pod_chaos.enabled
+%% Error path
+steps.cpu_hog_wf.execute-->steps.cpu_hog_wf.failed
+steps.cpu_hog_wf.failed-->steps.cpu_hog_wf.failed.error
+steps.kubeburner_wf.execute-->steps.kubeburner_wf.failed
+steps.kubeburner_wf.failed-->steps.kubeburner_wf.failed.error
+steps.pod_chaos_wf.execute-->steps.pod_chaos_wf.failed
+steps.pod_chaos_wf.failed-->steps.pod_chaos_wf.failed.error
+%% Mermaid end
+```
+
+### Pod Chaos Workflow
+
+```mermaid
+%% Mermaid markdown workflow
+flowchart LR
+%% Success path
+input-->steps.kill_pod.starting
+steps.kill_pod.cancelled-->steps.kill_pod.closed
+steps.kill_pod.cancelled-->steps.kill_pod.outputs
+steps.kill_pod.closed-->steps.kill_pod.closed.result
+steps.kill_pod.deploy-->steps.kill_pod.closed
+steps.kill_pod.deploy-->steps.kill_pod.starting
+steps.kill_pod.disabled-->steps.kill_pod.disabled.output
+steps.kill_pod.enabling-->steps.kill_pod.closed
+steps.kill_pod.enabling-->steps.kill_pod.disabled
+steps.kill_pod.enabling-->steps.kill_pod.enabling.resolved
+steps.kill_pod.enabling-->steps.kill_pod.starting
+steps.kill_pod.outputs-->steps.kill_pod.outputs.success
+steps.kill_pod.outputs.success-->outputs.success
+steps.kill_pod.running-->steps.kill_pod.closed
+steps.kill_pod.running-->steps.kill_pod.outputs
+steps.kill_pod.starting-->steps.kill_pod.closed
+steps.kill_pod.starting-->steps.kill_pod.running
+steps.kill_pod.starting-->steps.kill_pod.starting.started
+%% Error path
+steps.kill_pod.cancelled-->steps.kill_pod.crashed
+steps.kill_pod.cancelled-->steps.kill_pod.deploy_failed
+steps.kill_pod.crashed-->steps.kill_pod.crashed.error
+steps.kill_pod.deploy-->steps.kill_pod.deploy_failed
+steps.kill_pod.deploy_failed-->steps.kill_pod.deploy_failed.error
+steps.kill_pod.enabling-->steps.kill_pod.crashed
+steps.kill_pod.outputs-->steps.kill_pod.outputs.error
+steps.kill_pod.running-->steps.kill_pod.crashed
+steps.kill_pod.starting-->steps.kill_pod.crashed
+%% Mermaid end
+```
+
+### StressNG (CPU Hog) Workflow
+
+```mermaid
+%% Mermaid markdown workflow
+flowchart LR
+%% Success path
+input-->steps.kubeconfig.starting
+input-->steps.stressng.deploy
+input-->steps.stressng.starting
+steps.kubeconfig.cancelled-->steps.kubeconfig.closed
+steps.kubeconfig.cancelled-->steps.kubeconfig.outputs
+steps.kubeconfig.closed-->steps.kubeconfig.closed.result
+steps.kubeconfig.deploy-->steps.kubeconfig.closed
+steps.kubeconfig.deploy-->steps.kubeconfig.starting
+steps.kubeconfig.disabled-->steps.kubeconfig.disabled.output
+steps.kubeconfig.enabling-->steps.kubeconfig.closed
+steps.kubeconfig.enabling-->steps.kubeconfig.disabled
+steps.kubeconfig.enabling-->steps.kubeconfig.enabling.resolved
+steps.kubeconfig.enabling-->steps.kubeconfig.starting
+steps.kubeconfig.outputs-->steps.kubeconfig.outputs.success
+steps.kubeconfig.outputs.success-->steps.stressng.deploy
+steps.kubeconfig.running-->steps.kubeconfig.closed
+steps.kubeconfig.running-->steps.kubeconfig.outputs
+steps.kubeconfig.starting-->steps.kubeconfig.closed
+steps.kubeconfig.starting-->steps.kubeconfig.running
+steps.kubeconfig.starting-->steps.kubeconfig.starting.started
+steps.stressng.cancelled-->steps.stressng.closed
+steps.stressng.cancelled-->steps.stressng.outputs
+steps.stressng.closed-->steps.stressng.closed.result
+steps.stressng.deploy-->steps.stressng.closed
+steps.stressng.deploy-->steps.stressng.starting
+steps.stressng.disabled-->steps.stressng.disabled.output
+steps.stressng.enabling-->steps.stressng.closed
+steps.stressng.enabling-->steps.stressng.disabled
+steps.stressng.enabling-->steps.stressng.enabling.resolved
+steps.stressng.enabling-->steps.stressng.starting
+steps.stressng.outputs-->steps.stressng.outputs.success
+steps.stressng.outputs.success-->outputs.success
+steps.stressng.running-->steps.stressng.closed
+steps.stressng.running-->steps.stressng.outputs
+steps.stressng.starting-->steps.stressng.closed
+steps.stressng.starting-->steps.stressng.running
+steps.stressng.starting-->steps.stressng.starting.started
+%% Error path
+steps.kubeconfig.cancelled-->steps.kubeconfig.crashed
+steps.kubeconfig.cancelled-->steps.kubeconfig.deploy_failed
+steps.kubeconfig.crashed-->steps.kubeconfig.crashed.error
+steps.kubeconfig.deploy-->steps.kubeconfig.deploy_failed
+steps.kubeconfig.deploy_failed-->steps.kubeconfig.deploy_failed.error
+steps.kubeconfig.enabling-->steps.kubeconfig.crashed
+steps.kubeconfig.outputs-->steps.kubeconfig.outputs.error
+steps.kubeconfig.running-->steps.kubeconfig.crashed
+steps.kubeconfig.starting-->steps.kubeconfig.crashed
+steps.stressng.cancelled-->steps.stressng.crashed
+steps.stressng.cancelled-->steps.stressng.deploy_failed
+steps.stressng.crashed-->steps.stressng.crashed.error
+steps.stressng.deploy-->steps.stressng.deploy_failed
+steps.stressng.deploy_failed-->steps.stressng.deploy_failed.error
+steps.stressng.enabling-->steps.stressng.crashed
+steps.stressng.outputs-->steps.stressng.outputs.error
+steps.stressng.running-->steps.stressng.crashed
+steps.stressng.starting-->steps.stressng.crashed
+%% Mermaid end
+```
+
+### Kube-Burner Workflow
+
+```mermaid
+%% Mermaid markdown workflow
+flowchart LR
+%% Success path
+input-->steps.kubeburner.starting
+steps.kubeburner.cancelled-->steps.kubeburner.closed
+steps.kubeburner.cancelled-->steps.kubeburner.outputs
+steps.kubeburner.closed-->steps.kubeburner.closed.result
+steps.kubeburner.deploy-->steps.kubeburner.closed
+steps.kubeburner.deploy-->steps.kubeburner.starting
+steps.kubeburner.disabled-->steps.kubeburner.disabled.output
+steps.kubeburner.enabling-->steps.kubeburner.closed
+steps.kubeburner.enabling-->steps.kubeburner.disabled
+steps.kubeburner.enabling-->steps.kubeburner.enabling.resolved
+steps.kubeburner.enabling-->steps.kubeburner.starting
+steps.kubeburner.outputs-->steps.kubeburner.outputs.success
+steps.kubeburner.outputs.success-->outputs.success
+steps.kubeburner.running-->steps.kubeburner.closed
+steps.kubeburner.running-->steps.kubeburner.outputs
+steps.kubeburner.starting-->steps.kubeburner.closed
+steps.kubeburner.starting-->steps.kubeburner.running
+steps.kubeburner.starting-->steps.kubeburner.starting.started
+steps.uuidgen.cancelled-->steps.uuidgen.closed
+steps.uuidgen.cancelled-->steps.uuidgen.outputs
+steps.uuidgen.closed-->steps.uuidgen.closed.result
+steps.uuidgen.deploy-->steps.uuidgen.closed
+steps.uuidgen.deploy-->steps.uuidgen.starting
+steps.uuidgen.disabled-->steps.uuidgen.disabled.output
+steps.uuidgen.enabling-->steps.uuidgen.closed
+steps.uuidgen.enabling-->steps.uuidgen.disabled
+steps.uuidgen.enabling-->steps.uuidgen.enabling.resolved
+steps.uuidgen.enabling-->steps.uuidgen.starting
+steps.uuidgen.outputs-->steps.uuidgen.outputs.success
+steps.uuidgen.outputs.success-->steps.kubeburner.starting
+steps.uuidgen.running-->steps.uuidgen.closed
+steps.uuidgen.running-->steps.uuidgen.outputs
+steps.uuidgen.starting-->steps.uuidgen.closed
+steps.uuidgen.starting-->steps.uuidgen.running
+steps.uuidgen.starting-->steps.uuidgen.starting.started
+%% Error path
+steps.kubeburner.cancelled-->steps.kubeburner.crashed
+steps.kubeburner.cancelled-->steps.kubeburner.deploy_failed
+steps.kubeburner.crashed-->steps.kubeburner.crashed.error
+steps.kubeburner.deploy-->steps.kubeburner.deploy_failed
+steps.kubeburner.deploy_failed-->steps.kubeburner.deploy_failed.error
+steps.kubeburner.enabling-->steps.kubeburner.crashed
+steps.kubeburner.outputs-->steps.kubeburner.outputs.error
+steps.kubeburner.running-->steps.kubeburner.crashed
+steps.kubeburner.starting-->steps.kubeburner.crashed
+steps.uuidgen.cancelled-->steps.uuidgen.crashed
+steps.uuidgen.cancelled-->steps.uuidgen.deploy_failed
+steps.uuidgen.crashed-->steps.uuidgen.crashed.error
+steps.uuidgen.deploy-->steps.uuidgen.deploy_failed
+steps.uuidgen.deploy_failed-->steps.uuidgen.deploy_failed.error
+steps.uuidgen.enabling-->steps.uuidgen.crashed
+steps.uuidgen.outputs-->steps.uuidgen.outputs.error
+steps.uuidgen.running-->steps.uuidgen.crashed
+steps.uuidgen.starting-->steps.uuidgen.crashed
+%% Mermaid end
+```
+
diff --git a/utils/arcaflow/ocp-chaos/config.yaml b/utils/arcaflow/ocp-chaos/config.yaml
new file mode 100644
index 00000000..31ef455e
--- /dev/null
+++ b/utils/arcaflow/ocp-chaos/config.yaml
@@ -0,0 +1,18 @@
+---
+deployers:
+  image:
+    deployer_name: podman
+    deployment:
+      imagePullPolicy: IfNotPresent
+  python:
+    deployer_name: python
+    modulePullPolicy: Always
+    pythonPath: /usr/bin/python
+    workdir: /tmp
+log:
+  level: debug
+logged_outputs:
+  error:
+    level: debug
+  success:
+    level: debug
diff --git a/utils/arcaflow/ocp-chaos/input.yaml b/utils/arcaflow/ocp-chaos/input.yaml
new file mode 100644
index 00000000..a48c2101
--- /dev/null
+++ b/utils/arcaflow/ocp-chaos/input.yaml
@@ -0,0 +1,41 @@
+kubernetes_target:
+  kubeconfig_path:
+cpu_hog_enabled: true
+pod_chaos_enabled: true
+kubeburner_enabled: true
+
+kubeburner_list:
+  - kubeburner:
+      kubeconfig: 'given later in workflow by kubeconfig plugin'
+      workload: 'cluster-density'
+      qps: 20
+      burst: 20
+      log_level: 'info'
+      timeout: '1m'
+      iterations: 1
+      churn: 'true'
+      churn_duration: 1s
+      churn_delay: 1s
+      churn_percent: 10
+      alerting: 'true'
+      gc: 'true'
+
+pod_chaos_list:
+  - namespace_pattern: ^openshift-etcd$
+    label_selector: k8s-app=etcd
+    kill: 1
+    krkn_pod_recovery_time: 1
+
+cpu_hog_list:
+  - namespace: default
+    # set the node selector as a key-value pair, e.g.
+    # node_selector:
+    #   kubernetes.io/hostname: kind-worker2
+    node_selector: {}
+    stressng_params:
+      timeout: 1
+      stressors:
+        - stressor: cpu
+          workers: 1
+          cpu-load: 20
+          cpu-method: all
diff --git a/utils/arcaflow/ocp-chaos/subworkflows/cpu-hog.yaml b/utils/arcaflow/ocp-chaos/subworkflows/cpu-hog.yaml
new file mode 100644
index 00000000..db7718a8
--- /dev/null
+++ b/utils/arcaflow/ocp-chaos/subworkflows/cpu-hog.yaml
@@ -0,0 +1,75 @@
+version: v0.2.0
+input:
+  root: CpuHog__KubernetesTarget
+  objects:
+    CpuHog__KubernetesTarget:
+      id: CpuHog__KubernetesTarget
+      properties:
+        constant:
+          type:
+            type_id: ref
+            id: KubernetesTarget
+        item:
+          type:
+            type_id: ref
+            id: CpuHog
+    KubernetesTarget:
+      id: KubernetesTarget
+      properties:
+        kubeconfig_path:
+          type:
+            type_id: string
+    CpuHog:
+      id: CpuHog
+      properties:
+        namespace:
+          display:
+            description: The namespace where the container will be deployed
+            name: Namespace
+          type:
+            type_id: string
+          required: true
+        node_selector:
+          display:
+            description: Kubernetes node selector identifying where the plugin must be deployed
+          type:
+            type_id: map
+            values:
+              type_id: string
+            keys:
+              type_id: string
+          required: true
+        stressng_params:
+          type:
+            type_id: ref
+            id: StressNGParams
+            namespace: $.steps.stressng.starting.inputs.input
+
+steps:
+  kubeconfig:
+    plugin:
+      src: quay.io/arcalot/arcaflow-plugin-kubeconfig:0.3.1
+      deployment_type: image
+    input:
+      kubeconfig: !expr 'readFile($.input.constant.kubeconfig_path)'
+  stressng:
+    plugin:
+      src: quay.io/arcalot/arcaflow-plugin-stressng:0.8.0
+      deployment_type: image
+    step: workload
+    input: !expr $.input.item.stressng_params
+    deploy:
+      deployer_name: kubernetes
+      connection: !expr $.steps.kubeconfig.outputs.success.connection
+      pod:
+        metadata:
+          namespace: !expr $.input.item.namespace
+          labels:
+            arcaflow: stressng
+        spec:
+          nodeSelector: !expr $.input.item.node_selector
+          pluginContainer:
+            imagePullPolicy: Always
+
+outputs:
+  success: !expr $.steps.stressng.outputs.success
diff --git a/utils/arcaflow/ocp-chaos/subworkflows/kubeburner.yaml b/utils/arcaflow/ocp-chaos/subworkflows/kubeburner.yaml
new file mode 100644
index 00000000..e1d9b50a
--- /dev/null
+++ b/utils/arcaflow/ocp-chaos/subworkflows/kubeburner.yaml
@@ -0,0 +1,54 @@
+version: v0.2.0
+input:
+  root: KubeBurner__KubernetesTarget
+  objects:
+    KubeBurner__KubernetesTarget:
+      id: KubeBurner__KubernetesTarget
+      properties:
+        constant:
+          type:
+            type_id: ref
+            id: KubernetesTarget
+        item:
+          type:
+            type_id: ref
+            id: KubeBurner
+    KubernetesTarget:
+      id: KubernetesTarget
+      properties:
+        kubeconfig_path:
+          type:
+            type_id: string
+    KubeBurner:
+      id: KubeBurner
+      properties:
+        kubeburner:
+          type:
+            type_id: ref
+            id: KubeBurnerInputParams
+            namespace: $.steps.kubeburner.starting.inputs.input
+
+steps:
+  uuidgen:
+    plugin:
+      deployment_type: image
+      src: quay.io/arcalot/arcaflow-plugin-utilities:0.6.0
+    step: uuid
+    input: {}
+  kubeburner:
+    plugin:
+      deployment_type: image
+      src: quay.io/redhat-performance/arcaflow-plugin-kube-burner:latest
+    step: kube-burner
+    input:
+      kubeconfig: !expr 'readFile($.input.constant.kubeconfig_path)'
+      uuid: !expr $.steps.uuidgen.outputs.success.uuid
+      workload: !expr $.input.item.kubeburner.workload
+      iterations: !expr $.input.item.kubeburner.iterations
+      churn: !expr $.input.item.kubeburner.churn
+      churn_duration: !expr $.input.item.kubeburner.churn_duration
+      churn_delay: !expr $.input.item.kubeburner.churn_delay
+
+outputs:
+  success:
+    burner: !expr $.steps.kubeburner.outputs.success
diff --git a/utils/arcaflow/ocp-chaos/subworkflows/pod-chaos.yaml b/utils/arcaflow/ocp-chaos/subworkflows/pod-chaos.yaml
new file mode 100644
index 00000000..5c28da9d
--- /dev/null
+++ b/utils/arcaflow/ocp-chaos/subworkflows/pod-chaos.yaml
@@ -0,0 +1,108 @@
+version: v0.2.0
+input:
+  root: KillPodConfig__KubernetesTarget
+  objects:
+    KillPodConfig__KubernetesTarget:
+      id: KillPodConfig__KubernetesTarget
+      properties:
+        constant:
+          type:
+            type_id: ref
+            id: KubernetesTarget
+        item:
+          type:
+            type_id: ref
+            id: KillPodConfig
+    KubernetesTarget:
+      id: KubernetesTarget
+      properties:
+        kubeconfig_path:
+          type:
+            type_id: string
+    KillPodConfig:
+      id: KillPodConfig
+      properties:
+        backoff:
+          default: '1'
+          display:
+            description: How many seconds to wait between checks for the target
+              pod status.
+            name: Backoff
+          required: false
+          type:
+            type_id: integer
+        kill:
+          default: '1'
+          display:
+            description: How many pods should we attempt to kill?
+            name: Number of pods to kill
+          required: false
+          type:
+            min: 1
+            type_id: integer
+        krkn_pod_recovery_time:
+          default: '60'
+          display:
+            description: The expected recovery time of the pod (used by Krkn to
+              monitor the pod lifecycle)
+            name: Recovery Time
+          required: false
+          type:
+            type_id: integer
+        label_selector:
+          display:
+            description: 'Kubernetes label selector for the target pods. Required
+              if name_pattern is not set.
+              See https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+              for details.'
+            name: Label selector
+          required: false
+          required_if_not:
+            - name_pattern
+          type:
+            type_id: string
+        name_pattern:
+          display:
+            description: Regular expression for target pods. Required if label_selector
+              is not set.
+            name: Name pattern
+          required: false
+          required_if_not:
+            - label_selector
+          type:
+            type_id: pattern
+        namespace_pattern:
+          display:
+            description: Regular expression for target pod namespaces.
+            name: Namespace pattern
+          required: true
+          type:
+            type_id: pattern
+        timeout:
+          default: '180'
+          display:
+            description: Timeout to wait for the target pod(s) to be removed in
+              seconds.
+            name: Timeout
+          required: false
+          type:
+            type_id: integer
+
+steps:
+  kill_pod:
+    step: kill-pods
+    plugin:
+      deployment_type: python
+      src: arcaflow-plugin-kill-pod@git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git@a9f87f88d8e7763d111613bd8b2c7862fc49624f
+    input:
+      namespace_pattern: !expr $.input.item.namespace_pattern
+      label_selector: !expr $.input.item.label_selector
+      kubeconfig_path: !expr $.input.constant.kubeconfig_path
+    deploy:
+      deployer_name: python
+      modulePullPolicy: Always
+      pythonPath: /usr/bin/python
+      workdir: /tmp
+
+outputs:
+  success: !expr $.steps.kill_pod.outputs.success
diff --git a/utils/arcaflow/ocp-chaos/workflow.yaml b/utils/arcaflow/ocp-chaos/workflow.yaml
new file mode 100644
index 00000000..c7eedead
--- /dev/null
+++ b/utils/arcaflow/ocp-chaos/workflow.yaml
@@ -0,0 +1,73 @@
+version: v0.2.0
+input:
+  root: RootObject
+  objects:
+    KubernetesTarget:
+      id: KubernetesTarget
+      properties:
+        kubeconfig_path:
+          type:
+            type_id: string
+    RootObject:
+      id: RootObject
+      properties:
+        cpu_hog_enabled:
+          type:
+            type_id: bool
+        pod_chaos_enabled:
+          type:
+            type_id: bool
+        kubeburner_enabled:
+          type:
+            type_id: bool
+        kubernetes_target:
+          type:
+            type_id: ref
+            id: KubernetesTarget
+        kubeburner_list:
+          type:
+            type_id: list
+            items:
+              type_id: ref
+              id: KubeBurner
+              namespace: $.steps.kubeburner_wf.execute.inputs.items
+        pod_chaos_list:
+          type:
+            type_id: list
+            items:
+              type_id: ref
+              id: KillPodConfig
+              namespace: $.steps.pod_chaos_wf.execute.inputs.items
+        cpu_hog_list:
+          type:
+            type_id: list
+            items:
+              type_id: ref
+              id: CpuHog
+              namespace: $.steps.cpu_hog_wf.execute.inputs.items
+
+steps:
+  kubeburner_wf:
+    kind: foreach
+    items: !expr 'bindConstants($.input.kubeburner_list, $.input.kubernetes_target)'
+    workflow: subworkflows/kubeburner.yaml
+    parallelism: 1
+    enabled: !expr $.input.kubeburner_enabled
+  pod_chaos_wf:
+    kind: foreach
+    items: !expr 'bindConstants($.input.pod_chaos_list, $.input.kubernetes_target)'
+    workflow: subworkflows/pod-chaos.yaml
+    parallelism: 1
+    enabled: !expr $.input.pod_chaos_enabled
+  cpu_hog_wf:
+    kind: foreach
+    items: !expr 'bindConstants($.input.cpu_hog_list, $.input.kubernetes_target)'
+    workflow: subworkflows/cpu-hog.yaml
+    parallelism: 1
+    enabled: !expr $.input.cpu_hog_enabled
+
+outputs:
+  workflow_success:
+    kubeburner: !ordisabled $.steps.kubeburner_wf.outputs.success
+    pod_chaos: !ordisabled $.steps.pod_chaos_wf.outputs.success
+    cpu_hog: !ordisabled $.steps.cpu_hog_wf.outputs.success