From 5164a230f4eaab5a82a3fbe7846ea0e24fdf2803 Mon Sep 17 00:00:00 2001 From: roman huesler Date: Sun, 10 Dec 2023 07:35:27 +0100 Subject: [PATCH] feat: improve metrics error handling --- f2soperator/operation/operator/balancer.go | 9 ++-- f2soperator/services/prometheus/prometheus.go | 2 +- .../templates/grafana/grafana-dashboards.yaml | 50 ++++++++++++++++--- .../prometheus/prometheus-configmap.yaml | 2 +- 4 files changed, 48 insertions(+), 15 deletions(-) diff --git a/f2soperator/operation/operator/balancer.go b/f2soperator/operation/operator/balancer.go index d78567f..1c0d2ab 100644 --- a/f2soperator/operation/operator/balancer.go +++ b/f2soperator/operation/operator/balancer.go @@ -129,12 +129,11 @@ func scaleDeployments() { functions := configuration.ActiveConfiguration.Functions for _, function := range functions.Items { var resultScale int - currentAvailableReplicas, err := prometheus.ReadCurrentPrometheusMetricValue(&configuration.ActiveConfiguration, fmt.Sprintf("kube_deployment_status_replicas_available{functionname=\"%s\"}", function.Name)) - logging.Error(fmt.Sprintf("%s", err)) - requiredContainers, err := prometheus.ReadCurrentPrometheusMetricValue(&configuration.ActiveConfiguration, fmt.Sprintf("job:function_containers_required:containers{functionname=\"%s\"}", function.Name)) - logging.Error(fmt.Sprintf("%s", err)) - if err != nil { + currentAvailableReplicas, availableReplicasErr := prometheus.ReadCurrentPrometheusMetricValue(&configuration.ActiveConfiguration, fmt.Sprintf("kube_deployment_status_replicas_available{functionname=\"%s\"}", function.Name)) + requiredContainers, requiredContainersErr := prometheus.ReadCurrentPrometheusMetricValue(&configuration.ActiveConfiguration, fmt.Sprintf("job:function_containers_required:containers{functionname=\"%s\"}", function.Name)) + if availableReplicasErr != nil || requiredContainersErr != nil { // no invocations / metrics => scale to minimum + logging.Error("there was an error when trying to read metric [kube_deployment_status_replicas_available] or [job:function_containers_required:containers]. setting result scale to 0") resultScale = 0 } else { resultScale = int(math.Ceil(requiredContainers)) diff --git a/f2soperator/services/prometheus/prometheus.go b/f2soperator/services/prometheus/prometheus.go index 44f9318..de9a62b 100644 --- a/f2soperator/services/prometheus/prometheus.go +++ b/f2soperator/services/prometheus/prometheus.go @@ -131,7 +131,7 @@ func ReadCurrentPrometheusMetricValue(config *configuration.F2SConfiguration, qu // Check if any metric result exists if len(promResponse.Data.Result) == 0 { - return 0.0, fmt.Errorf("metric not found") + return 0.0, fmt.Errorf("metric not found. query string: %s", queryString) } // Extract the metric value diff --git a/helm/templates/grafana/grafana-dashboards.yaml b/helm/templates/grafana/grafana-dashboards.yaml index 33db8fd..263a881 100644 --- a/helm/templates/grafana/grafana-dashboards.yaml +++ b/helm/templates/grafana/grafana-dashboards.yaml @@ -50,7 +50,7 @@ data: "mode": "absolute", "steps": [ { - "color": "dark-purple", + "color": "#e91e61", "value": null } ] @@ -102,7 +102,7 @@ data: { "id": "color", "value": { - "fixedColor": "super-light-purple", + "fixedColor": "#e81e6142", "mode": "fixed" } } @@ -135,7 +135,7 @@ data: { "datasource": "Loki", "editorMode": "builder", - "expr": "{namespace=\"f2s\", pod=~\"$operator\", pod=~\"f2s.+\", component=~\"$component\", level=~\"$loglevel\"} |~ `$searchtext`", + "expr": "{namespace=\"f2s\", pod=~\"$operator\", pod=~\"f2s.+\", component=~\"$component\", level=~\"$loglevel\", type=~\"$type\"} |~ `$searchtext`", "queryType": "range", "refId": "A" } @@ -184,7 +184,7 @@ data: { "allValue": ".+", "current": { - "selected": true, + "selected": false, "text": [ "All" ], @@ -219,10 +219,10 @@ data: "current": { "selected": true, "text": [ - "operator" + "configuration" ], "value": [ - "operator" + "configuration" ] }, "datasource": { @@ -250,7 +250,7 @@ data: { "allValue": ".+", "current": { - "selected": true, + "selected": false, "text": [ "All" ], @@ -280,6 +280,40 @@ data: "sort": 0, "type": "query" }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": [ + "event", + "log" + ], + "value": [ + "event", + "log" + ] + }, + "hide": 0, + "includeAll": false, + "multi": true, + "name": "type", + "options": [ + { + "selected": true, + "text": "log", + "value": "log" + }, + { + "selected": true, + "text": "event", + "value": "event" + } + ], + "query": "log,event", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, { "current": { "selected": false, @@ -302,7 +336,7 @@ data: ] }, "time": { - "from": "now-5m", + "from": "now-6h", "to": "now" }, "timepicker": {}, diff --git a/helm/templates/prometheus/prometheus-configmap.yaml b/helm/templates/prometheus/prometheus-configmap.yaml index bdc5b5d..37c093d 100644 --- a/helm/templates/prometheus/prometheus-configmap.yaml +++ b/helm/templates/prometheus/prometheus-configmap.yaml @@ -59,7 +59,7 @@ data: # calculated number of containers needed to perform the incoming requests - record: job:function_containers_required:containers - expr: ceil(sum(f2sscaling_function_incoming_request_rate) by (functionname) / sum(job:function_capacity_average:reqpersec) by (functionname)) + expr: ceil(sum(f2sscaling_function_incoming_request_rate) by (functionname) / sum(job:function_capacity_average:reqpersec) by (functionname)) or vector(0) prometheus.yml: |- global: scrape_interval: 5s