Skip to content

Commit

Permalink
feat: improve metrics error handling
Browse files (browse the repository at this point in the history)
  • Loading branch information
butschi84 committed Dec 10, 2023
1 parent a2b1caf commit 5164a23
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 15 deletions.
9 changes: 4 additions & 5 deletions f2soperator/operation/operator/balancer.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,11 @@ func scaleDeployments() {
functions := configuration.ActiveConfiguration.Functions
for _, function := range functions.Items {
var resultScale int
currentAvailableReplicas, err := prometheus.ReadCurrentPrometheusMetricValue(&configuration.ActiveConfiguration, fmt.Sprintf("kube_deployment_status_replicas_available{functionname=\"%s\"}", function.Name))
logging.Error(fmt.Sprintf("%s", err))
requiredContainers, err := prometheus.ReadCurrentPrometheusMetricValue(&configuration.ActiveConfiguration, fmt.Sprintf("job:function_containers_required:containers{functionname=\"%s\"}", function.Name))
logging.Error(fmt.Sprintf("%s", err))
if err != nil {
currentAvailableReplicas, availableReplicasErr := prometheus.ReadCurrentPrometheusMetricValue(&configuration.ActiveConfiguration, fmt.Sprintf("kube_deployment_status_replicas_available{functionname=\"%s\"}", function.Name))
requiredContainers, requiredContainersErr := prometheus.ReadCurrentPrometheusMetricValue(&configuration.ActiveConfiguration, fmt.Sprintf("job:function_containers_required:containers{functionname=\"%s\"}", function.Name))
if availableReplicasErr != nil || requiredContainersErr != nil {
// no invocations / metrics => scale to minimum
logging.Error("there was an error when trying to read metric [kube_deployment_status_replicas_available] or [job:function_containers_required:containers]. setting result scale to 0")
resultScale = 0
} else {
resultScale = int(math.Ceil(requiredContainers))
Expand Down
2 changes: 1 addition & 1 deletion f2soperator/services/prometheus/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ func ReadCurrentPrometheusMetricValue(config *configuration.F2SConfiguration, qu

// Check if any metric result exists
if len(promResponse.Data.Result) == 0 {
return 0.0, fmt.Errorf("metric not found")
return 0.0, fmt.Errorf("metric not found. query string: %s", queryString)
}

// Extract the metric value
Expand Down
50 changes: 42 additions & 8 deletions helm/templates/grafana/grafana-dashboards.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ data:
"mode": "absolute",
"steps": [
{
"color": "dark-purple",
"color": "#e91e61",
"value": null
}
]
Expand Down Expand Up @@ -102,7 +102,7 @@ data:
{
"id": "color",
"value": {
"fixedColor": "super-light-purple",
"fixedColor": "#e81e6142",
"mode": "fixed"
}
}
Expand Down Expand Up @@ -135,7 +135,7 @@ data:
{
"datasource": "Loki",
"editorMode": "builder",
"expr": "{namespace=\"f2s\", pod=~\"$operator\", pod=~\"f2s.+\", component=~\"$component\", level=~\"$loglevel\"} |~ `$searchtext`",
"expr": "{namespace=\"f2s\", pod=~\"$operator\", pod=~\"f2s.+\", component=~\"$component\", level=~\"$loglevel\", type=~\"$type\"} |~ `$searchtext`",
"queryType": "range",
"refId": "A"
}
Expand Down Expand Up @@ -184,7 +184,7 @@ data:
{
"allValue": ".+",
"current": {
"selected": true,
"selected": false,
"text": [
"All"
],
Expand Down Expand Up @@ -219,10 +219,10 @@ data:
"current": {
"selected": true,
"text": [
"operator"
"configuration"
],
"value": [
"operator"
"configuration"
]
},
"datasource": {
Expand Down Expand Up @@ -250,7 +250,7 @@ data:
{
"allValue": ".+",
"current": {
"selected": true,
"selected": false,
"text": [
"All"
],
Expand Down Expand Up @@ -280,6 +280,40 @@ data:
"sort": 0,
"type": "query"
},
{
"allValue": ".+",
"current": {
"selected": true,
"text": [
"event",
"log"
],
"value": [
"event",
"log"
]
},
"hide": 0,
"includeAll": false,
"multi": true,
"name": "type",
"options": [
{
"selected": true,
"text": "log",
"value": "log"
},
{
"selected": true,
"text": "event",
"value": "event"
}
],
"query": "log,event",
"queryValue": "",
"skipUrlSync": false,
"type": "custom"
},
{
"current": {
"selected": false,
Expand All @@ -302,7 +336,7 @@ data:
]
},
"time": {
"from": "now-5m",
"from": "now-6h",
"to": "now"
},
"timepicker": {},
Expand Down
2 changes: 1 addition & 1 deletion helm/templates/prometheus/prometheus-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ data:
# calculated number of containers needed to handle the incoming requests
- record: job:function_containers_required:containers
expr: ceil(sum(f2sscaling_function_incoming_request_rate) by (functionname) / sum(job:function_capacity_average:reqpersec) by (functionname))
expr: ceil(sum(f2sscaling_function_incoming_request_rate) by (functionname) / sum(job:function_capacity_average:reqpersec) by (functionname)) or vector(0)
prometheus.yml: |-
global:
scrape_interval: 5s
Expand Down

0 comments on commit 5164a23

Please sign in to comment.