diff --git a/CI/config/common_test_config.yaml b/CI/config/common_test_config.yaml
index 733ff358..f1e87a7d 100644
--- a/CI/config/common_test_config.yaml
+++ b/CI/config/common_test_config.yaml
@@ -50,3 +50,15 @@ telemetry:
     oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
     events_backup: True # enables/disables cluster events collection
     telemetry_group: "funtests"
+elastic:
+    enable_elastic: True
+    collect_metrics: False
+    collect_alerts: False
+    verify_certs: False
+    elastic_url: "https://192.168.39.196" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
+    elastic_port: 32766
+    username: "elastic"
+    password: "test"
+    metrics_index: "krkn-metrics"
+    alerts_index: "krkn-alerts"
+    telemetry_index: "krkn-telemetry"
diff --git a/CI/tests/test_service_hijacking.sh b/CI/tests/test_service_hijacking.sh
index 8b779418..fedb75ca 100644
--- a/CI/tests/test_service_hijacking.sh
+++ b/CI/tests/test_service_hijacking.sh
@@ -42,7 +42,14 @@ function functional_test_service_hijacking {
   python3 -m coverage run -a run_kraken.py -c CI/config/service_hijacking.yaml > /dev/null 2>&1 &
   PID=$!
   #Waiting the hijacking to have effect
-  while [ `curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php` == 404 ]; do echo "waiting scenario to kick in."; sleep 1; done;
+  COUNTER=0
+  while [ `curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php` == 404 ]
+  do
+    echo "waiting for scenario to kick in."
+    sleep 1
+    COUNTER=$((COUNTER+1))
+    [ $COUNTER -eq 100 ] && echo "maximum number of retries reached, test failed" && exit 1
+  done
   #Checking Step 1 GET on /list/index.php
   OUT_GET="`curl -X GET -s $SERVICE_URL/list/index.php`"
diff --git a/config/config.yaml b/config/config.yaml
index 3e918f7e..24c2323c 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -55,12 +55,27 @@ cerberus:
 performance_monitoring:
     deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
     repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
-    prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
+    prometheus_url: '' # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
     prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
     uuid: # uuid for the run is generated by default if not set
     enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
+    enable_metrics: False
     alert_profile: config/alerts.yaml # Path or URL to alert profile with the prometheus queries
+    metrics_profile: config/metrics.yaml
     check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos
+elastic:
+    enable_elastic: False
+    collect_metrics: False
+    collect_alerts: False
+    verify_certs: False
+    elastic_url: "" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
+    elastic_port: 32766
+    username: "elastic"
+    password: "test"
+    metrics_index: "krkn-metrics"
+    alerts_index: "krkn-alerts"
+    telemetry_index: "krkn-telemetry"
+
 tunings:
     wait_duration: 60 # Duration to wait between each chaos scenario
     iterations: 1 # Number of times to execute the scenarios
@@ -94,9 +109,7 @@ telemetry:
     - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
     oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
     events_backup: True # enables/disables cluster events collection
-elastic:
-    elastic_url: "" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
-    elastic_index: "" # Elastic search index pattern to post results to
+
diff --git a/kraken/prometheus/client.py b/kraken/prometheus/client.py
index 6052502d..5a50c48d 100644
--- a/kraken/prometheus/client.py
+++ b/kraken/prometheus/client.py
@@ -1,16 +1,30 @@
+from __future__ import annotations
+
 import datetime
 import os.path
-from typing import Optional
+from typing import Optional, List, Dict, Any
 import urllib3
 import logging
 import sys
 import yaml
+from krkn_lib.elastic.krkn_elastic import KrknElastic
+from krkn_lib.models.elastic.models import ElasticAlert
 from krkn_lib.models.krkn import ChaosRunAlertSummary, ChaosRunAlert
 from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
+
+
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

-def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
+def alerts(prom_cli: KrknPrometheus,
+           elastic: KrknElastic,
+           run_uuid,
+           start_time,
+           end_time,
+           alert_profile,
+           elastic_collect_alerts,
+           elastic_alerts_index
+           ):

     if alert_profile is None or os.path.exists(alert_profile) is False:
         logging.error(f"{alert_profile} alert profile does not exist")
@@ -20,7 +34,7 @@ def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
         profile_yaml = yaml.safe_load(profile)
         if not isinstance(profile_yaml, list):
             logging.error(f"{alert_profile} wrong file format, alert profile must be "
-                          f"a valid yaml file containing a list of items with 3 properties: "
+                          f"a valid yaml file containing a list of items with at least 3 properties: "
                           f"expr, description, severity" )
             sys.exit(1)

@@ -28,9 +42,20 @@
         if list(alert.keys()).sort() != ["expr", "description", "severity"].sort():
             logging.error(f"wrong alert {alert}, skipping")

-        prom_cli.process_alert(alert,
+        processed_alert = prom_cli.process_alert(alert,
                                datetime.datetime.fromtimestamp(start_time),
                                datetime.datetime.fromtimestamp(end_time))
+        if processed_alert[0] and processed_alert[1] and elastic_collect_alerts and elastic:
+            elastic_alert = ElasticAlert(run_uuid=run_uuid,
+                                         severity=alert["severity"],
+                                         alert=processed_alert[1],
+                                         created_at=datetime.datetime.fromtimestamp(processed_alert[0])
+                                         )
+            result = elastic.push_alert(elastic_alert, elastic_alerts_index)
+            if result == -1:
+                logging.error("failed to save alert on ElasticSearch")
+

 
 def critical_alerts(prom_cli: KrknPrometheus,
@@ -86,3 +111,57 @@ def critical_alerts(prom_cli: KrknPrometheus,

     if not firing_alerts:
         logging.info("No critical alerts are firing!!")
+
+
+def metrics(prom_cli: KrknPrometheus,
+            elastic: KrknElastic,
+            run_uuid,
+            start_time,
+            end_time,
+            metrics_profile,
+            elastic_collect_metrics,
+            elastic_metrics_index
+            ) -> list[dict[str, list[tuple[int, float]] | str]]:
+    metrics_list: list[dict[str, list[tuple[int, float]] | str]] = []
+    if metrics_profile is None or os.path.exists(metrics_profile) is False:
+        logging.error(f"{metrics_profile} metrics profile does not exist")
+        sys.exit(1)
+    with open(metrics_profile) as profile:
+        profile_yaml = yaml.safe_load(profile)
+        if not profile_yaml.get("metrics") or not isinstance(profile_yaml["metrics"], list):
+            logging.error(f"{metrics_profile} wrong file format, metrics profile must be "
+                          f"a valid yaml file containing a list of items with 3 properties: "
+                          f"query, metricName, instant")
+            sys.exit(1)
+
+    for metric_query in profile_yaml["metrics"]:
+        if sorted(metric_query.keys()) != sorted(["query", "metricName", "instant"]):
+            logging.error(f"wrong metric {metric_query}, skipping")
+            continue
+        metrics_result = prom_cli.process_prom_query_in_range(
+            metric_query["query"],
+            start_time=datetime.datetime.fromtimestamp(start_time),
+            end_time=datetime.datetime.fromtimestamp(end_time)
+        )
+
+        metric = {"name": metric_query["metricName"], "values": []}
+        for returned_metric in metrics_result:
+            if "values" in returned_metric:
+                for value in returned_metric["values"]:
+                    try:
+                        metric["values"].append((value[0], float(value[1])))
+                    except ValueError:
+                        pass
+        metrics_list.append(metric)
+
+    if elastic_collect_metrics and elastic:
+        result = elastic.upload_metrics_to_elasticsearch(run_uuid=run_uuid, index=elastic_metrics_index, raw_data=metrics_list)
+        if result == -1:
+            logging.error("failed to save metrics on ElasticSearch")
+
+    return metrics_list
diff --git a/requirements.txt b/requirements.txt
index b24ab259..6aa670c0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,9 +15,10 @@ google-api-python-client==2.116.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
 jinja2==3.1.4
-krkn-lib==2.1.9
+krkn-lib==3.0.0
 lxml==5.1.0
 kubernetes==28.1.0
+numpy==1.26.4
 oauth2client==4.1.3
 pandas==2.2.0
 openshift-client==1.0.21
diff --git a/run_kraken.py b/run_kraken.py
index b1889b49..376928c1 100644
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -10,6 +10,8 @@
 import uuid
 import time

+from krkn_lib.elastic.krkn_elastic import KrknElastic
+from krkn_lib.models.elastic import ElasticChaosRunTelemetry
 from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary
 from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
 import kraken.time_actions.common_time_functions as time_actions
@@ -30,7 +32,6 @@
 from kraken import plugins, syn_flood
 from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.ocp import KrknOpenshift
-from krkn_lib.telemetry.elastic import KrknElastic
 from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
 from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
 from krkn_lib.models.telemetry import ChaosRunTelemetry
@@ -94,14 +95,61 @@ def main(cfg) -> int:
     enable_alerts = get_yaml_item_value(
         config["performance_monitoring"], "enable_alerts", False
     )
+    enable_metrics = get_yaml_item_value(
+        config["performance_monitoring"], "enable_metrics", False
+    )
+    # elastic search
+    enable_elastic = get_yaml_item_value(
+        config["elastic"], "enable_elastic", False
+    )
+    elastic_collect_metrics = get_yaml_item_value(
+        config["elastic"], "collect_metrics", False
+    )
+
+    elastic_collect_alerts = get_yaml_item_value(
+        config["elastic"], "collect_alerts", False
+    )
+
+    elastic_url = get_yaml_item_value(
+        config["elastic"], "elastic_url", ""
+    )
+
+    elastic_verify_certs = get_yaml_item_value(
+        config["elastic"], "verify_certs", False
+    )
+
+    elastic_port = get_yaml_item_value(
+        config["elastic"], "elastic_port", 32766
+    )
+
+    elastic_username = get_yaml_item_value(
+        config["elastic"], "username", ""
+    )
+    elastic_password = get_yaml_item_value(
+        config["elastic"], "password", ""
+    )
+
+    elastic_metrics_index = get_yaml_item_value(
+        config["elastic"], "metrics_index", "krkn-metrics"
+    )
+
+    elastic_alerts_index = get_yaml_item_value(
+        config["elastic"], "alerts_index", "krkn-alerts"
+    )
+
+    elastic_telemetry_index = get_yaml_item_value(
+        config["elastic"], "telemetry_index", "krkn-telemetry"
+    )
+
+
     alert_profile = config["performance_monitoring"].get("alert_profile")
+    metrics_profile = config["performance_monitoring"].get("metrics_profile")
     check_critical_alerts = get_yaml_item_value(
         config["performance_monitoring"], "check_critical_alerts", False
     )
     telemetry_api_url = config["telemetry"].get("api_url")
-    elastic_config = get_yaml_item_value(config,"elastic",{})
-    elastic_url = get_yaml_item_value(elastic_config,"elastic_url","")
-    elastic_index = get_yaml_item_value(elastic_config,"elastic_index","")
+
     # Initialize clients
     if (not os.path.isfile(kubeconfig_path) and
@@ -167,7 +215,7 @@ def main(cfg) -> int:
             cv = ""
             if distribution == "openshift":
                 cv = ocpcli.get_clusterversion_string()
-            if prometheus_url is None:
+            if not prometheus_url:
                 try:
                     connection_data = ocpcli.get_prometheus_api_connection_data()
                     if connection_data:
@@ -189,9 +237,16 @@ def main(cfg) -> int:
     # KrknTelemetry init
     telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
     telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
-    telemetry_elastic = KrknElastic(safe_logger,elastic_url)
+    elastic_search = None
+    if enable_elastic:
+        elastic_search = KrknElastic(safe_logger,
+                                     elastic_url,
+                                     elastic_port,
+                                     elastic_verify_certs,
+                                     elastic_username,
+                                     elastic_password
+                                     )
     summary = ChaosRunAlertSummary()
-    if enable_alerts or check_critical_alerts:
+    if enable_metrics or enable_alerts or check_critical_alerts:
         prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)

     logging.info("Server URL: %s" % kubecli.get_host())
@@ -400,7 +455,12 @@ def main(cfg) -> int:
         decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json()))
         chaos_output.telemetry = decoded_chaos_run_telemetry
         logging.info(f"Chaos data:\n{chaos_output.to_json()}")
-        telemetry_elastic.upload_data_to_elasticsearch(decoded_chaos_run_telemetry.to_json(), elastic_index)
+        if enable_elastic:
+            elastic_telemetry = ElasticChaosRunTelemetry(chaos_run_telemetry=decoded_chaos_run_telemetry)
+            result = elastic_search.push_telemetry(elastic_telemetry, elastic_telemetry_index)
+            if result == -1:
+                safe_logger.error(f"failed to save telemetry on elastic search: {chaos_output.to_json()}")
+
         if config["telemetry"]["enabled"]:
             logging.info(f'telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/files/'
                          f'{(config["telemetry"]["telemetry_group"] if config["telemetry"]["telemetry_group"] else "default")}/'
@@ -451,14 +511,28 @@ def main(cfg) -> int:
         if alert_profile:
             prometheus_plugin.alerts(
                 prometheus,
+                elastic_search,
+                run_uuid,
                 start_time,
                 end_time,
                 alert_profile,
+                elastic_collect_alerts,
+                elastic_alerts_index
             )
+
         else:
             logging.error("Alert profile is not defined")
-            #sys.exit(1)
             return 1
+    if enable_metrics:
+        prometheus_plugin.metrics(prometheus,
+                                  elastic_search,
+                                  run_uuid,
+                                  start_time,
+                                  end_time,
+                                  metrics_profile,
+                                  elastic_collect_metrics,
+                                  elastic_metrics_index)

     if post_critical_alerts > 0:
         logging.error("Critical alerts are firing, please check; exiting")