From 98ecc9f2a53afbd4d6f2324c3e3e01c322cd245c Mon Sep 17 00:00:00 2001
From: Piotr <17101802+thampiotr@users.noreply.github.com>
Date: Thu, 23 Nov 2023 17:55:13 +0000
Subject: [PATCH] Improve visibility of slow component evaluations & failed
 components in general (#5843)

* Improve visibility of slow component evaluations

* rename metric

* Add alerts and dash

* fix-name

* fix queries to > 0

* changelog

* goimport
---
 CHANGELOG.md                                  |  3 ++
 operations/agent-flow-mixin/alerts.libsonnet  |  1 +
 .../alerts/controller.libsonnet               | 22 +++++++++++
 .../dashboards/controller.libsonnet           | 26 ++++++++++++-
 pkg/flow/internal/controller/loader.go        |  7 +---
 pkg/flow/internal/controller/metrics.go       | 39 ++++++++++++++++---
 .../grafana-agent-mixin/debugging.libsonnet   |  1 +
 7 files changed, 87 insertions(+), 12 deletions(-)
 create mode 100644 operations/agent-flow-mixin/alerts/controller.libsonnet

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 45c74f523c4f..3b21fbe8fa33 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,9 @@ Main (unreleased)
 
 - Grafana Agent on Windows now automatically restarts on failure. (@hainenber)
 
+- Added metrics, alerts and dashboard visualisations to help diagnose issues
+  with unhealthy components and components that take too long to evaluate. (@thampiotr)
+
 ### Bugfixes
 
 - Permit `X-Faro-Session-ID` header in CORS requests for the `faro.receiver`
diff --git a/operations/agent-flow-mixin/alerts.libsonnet b/operations/agent-flow-mixin/alerts.libsonnet
index 92555dc89d5e..d1473e605ed4 100644
--- a/operations/agent-flow-mixin/alerts.libsonnet
+++ b/operations/agent-flow-mixin/alerts.libsonnet
@@ -2,6 +2,7 @@
   prometheusAlerts+: {
     groups+: [
       (import './alerts/clustering.libsonnet'),
+      (import './alerts/controller.libsonnet'),
     ],
   },
 }
diff --git a/operations/agent-flow-mixin/alerts/controller.libsonnet b/operations/agent-flow-mixin/alerts/controller.libsonnet
new file mode 100644
index 000000000000..b6451db76292
--- /dev/null
+++ b/operations/agent-flow-mixin/alerts/controller.libsonnet
@@ -0,0 +1,22 @@
+local alert = import './utils/alert.jsonnet';
+
+alert.newGroup(
+  'agent_controller',
+  [
+    // Component evaluations are taking too long, which can lead to e.g. stale targets.
+    alert.newRule(
+      'SlowComponentEvaluations',
+      'sum by (cluster, namespace, component_id) (rate(agent_component_evaluation_slow_seconds[10m])) > 0',
+      'Flow component evaluations are taking too long.',
+      '15m',
+    ),
+
+    // Unhealthy components detected.
+    alert.newRule(
+      'UnhealthyComponents',
+      'sum(agent_component_controller_running_components{health_type!="healthy"}) > 0',
+      'Unhealthy Flow components detected.',
+      '15m',
+    ),
+  ]
+)
diff --git a/operations/agent-flow-mixin/dashboards/controller.libsonnet b/operations/agent-flow-mixin/dashboards/controller.libsonnet
index 1dcfcd3c9ab3..3876d612a77b 100644
--- a/operations/agent-flow-mixin/dashboards/controller.libsonnet
+++ b/operations/agent-flow-mixin/dashboards/controller.libsonnet
@@ -233,6 +233,28 @@ local filename = 'agent-flow-controller.json';
         ])
       ),
 
+      // Slow components evaluation time %
+      (
+        panel.new(title='Slow components evaluation times', type='timeseries') +
+        panel.withUnit('percentunit') +
+        panel.withDescription(|||
+          The percentage of time spent evaluating 'slow' components - components that took longer than 1 minute to evaluate.
+
+          Ideally, no component should take more than 1 minute to evaluate. The components displayed in this chart
+          may be a sign of a problem with the pipeline.
+        |||) +
+        panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) +
+        panel.withQueries([
+          panel.newQuery(
+            expr=|||
+              sum by (component_id) (rate(agent_component_evaluation_slow_seconds{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))
+              / scalar(sum(rate(agent_component_evaluation_seconds_sum{cluster="$cluster", namespace="$namespace"}[$__rate_interval])))
+            |||,
+            legendFormat='{{component_id}}',
+          ),
+        ])
+      ),
+
       // Component evaluation histogram
       (
         panel.newHeatmap('Component evaluation histogram') +
@@ -242,7 +264,7 @@ local filename = 'agent-flow-controller.json';
           The goal is to design your config so that evaluations take as little
           time as possible; under 100ms is a good goal.
         |||) +
-        panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) +
+        panel.withPosition({ x: 0, y: 22, w: 8, h: 10 }) +
         panel.withQueries([
           panel.newQuery(
             expr='sum by (le) (increase(agent_component_evaluation_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',
@@ -261,7 +283,7 @@ local filename = 'agent-flow-controller.json';
           The goal is to design your config so that most of the time components do not
           queue for long; under 10ms is a good goal.
         |||) +
-        panel.withPosition({ x: 0, y: 22, w: 8, h: 10 }) +
+        panel.withPosition({ x: 8, y: 22, w: 8, h: 10 }) +
         panel.withQueries([
           panel.newQuery(
             expr='sum by (le) (increase(agent_component_dependencies_wait_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',
diff --git a/pkg/flow/internal/controller/loader.go b/pkg/flow/internal/controller/loader.go
index a0954f82333f..d44a2a5062eb 100644
--- a/pkg/flow/internal/controller/loader.go
+++ b/pkg/flow/internal/controller/loader.go
@@ -149,9 +149,7 @@ func (l *Loader) Apply(args map[string]any, componentBlocks []*ast.BlockStmt, co
 	defer func() {
 		span.SetStatus(codes.Ok, "")
 
-		duration := time.Since(start)
-		level.Info(logger).Log("msg", "finished complete graph evaluation", "duration", duration)
-		l.cm.componentEvaluationTime.Observe(duration.Seconds())
+		level.Info(logger).Log("msg", "finished complete graph evaluation", "duration", time.Since(start))
 	}()
 
 	l.cache.ClearModuleExports()
@@ -234,7 +232,6 @@ func (l *Loader) Apply(args map[string]any, componentBlocks []*ast.BlockStmt, co
 	l.graph = &newGraph
 	l.cache.SyncIDs(componentIDs)
 	l.blocks = componentBlocks
-	l.cm.componentEvaluationTime.Observe(time.Since(start).Seconds())
 	if l.globals.OnExportsChange != nil && l.cache.ExportChangeIndex() != l.moduleExportIndex {
 		l.moduleExportIndex = l.cache.ExportChangeIndex()
 		l.globals.OnExportsChange(l.cache.CreateModuleExports())
@@ -652,8 +649,8 @@ func (l *Loader) concurrentEvalFn(n dag.Node, spanCtx context.Context, tracer tr
 
 	defer func() {
 		duration := time.Since(start)
+		l.cm.onComponentEvaluationDone(n.NodeID(), duration)
 		level.Info(l.log).Log("msg", "finished node evaluation", "node_id", n.NodeID(), "duration", duration)
-		l.cm.componentEvaluationTime.Observe(duration.Seconds())
 	}()
 
 	var err error
diff --git a/pkg/flow/internal/controller/metrics.go b/pkg/flow/internal/controller/metrics.go
index ff6c427d7e96..1c5a558ccc1b 100644
--- a/pkg/flow/internal/controller/metrics.go
+++ b/pkg/flow/internal/controller/metrics.go
@@ -1,20 +1,32 @@
 package controller
 
 import (
+	"fmt"
+	"time"
+
 	"github.com/prometheus/client_golang/prometheus"
 )
 
 // controllerMetrics contains the metrics for components controller
 type controllerMetrics struct {
-	controllerEvaluation    prometheus.Gauge
-	componentEvaluationTime prometheus.Histogram
-	dependenciesWaitTime    prometheus.Histogram
-	evaluationQueueSize     prometheus.Gauge
+	controllerEvaluation        prometheus.Gauge
+	componentEvaluationTime     prometheus.Histogram
+	dependenciesWaitTime        prometheus.Histogram
+	evaluationQueueSize         prometheus.Gauge
+	slowComponentThreshold      time.Duration
+	slowComponentEvaluationTime *prometheus.CounterVec
 }
 
 // newControllerMetrics inits the metrics for the components controller
 func newControllerMetrics(id string) *controllerMetrics {
-	cm := &controllerMetrics{}
+	cm := &controllerMetrics{
+		slowComponentThreshold: 1 * time.Minute,
+	}
+
+	// The evaluation time becomes particularly problematic in the range of 30s+, so add more buckets
+	// that can help spot issues in that range.
+	// Use the following buckets: 5ms, 25ms, 100ms, 500ms, 1s, 5s, 10s, 30s, 1m, 2m, 5m, 10m
+	evaluationTimesBuckets := []float64{.005, .025, .1, .5, 1, 5, 10, 30, 60, 120, 300, 600}
 
 	cm.controllerEvaluation = prometheus.NewGauge(prometheus.GaugeOpts{
 		Name:        "agent_component_controller_evaluating",
@@ -27,6 +39,7 @@ func newControllerMetrics(id string) *controllerMetrics {
 			Name:        "agent_component_evaluation_seconds",
 			Help:        "Time spent performing component evaluation",
 			ConstLabels: map[string]string{"controller_id": id},
+			Buckets:     evaluationTimesBuckets,
 		},
 	)
 	cm.dependenciesWaitTime = prometheus.NewHistogram(
@@ -34,6 +47,7 @@ func newControllerMetrics(id string) *controllerMetrics {
 			Name:        "agent_component_dependencies_wait_seconds",
 			Help:        "Time spent by components waiting to be evaluated after their dependency is updated.",
 			ConstLabels: map[string]string{"controller_id": id},
+			Buckets:     evaluationTimesBuckets,
 		},
 	)
 
@@ -43,14 +57,28 @@ func newControllerMetrics(id string) *controllerMetrics {
 		ConstLabels: map[string]string{"controller_id": id},
 	})
 
+	cm.slowComponentEvaluationTime = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Name:        "agent_component_evaluation_slow_seconds",
+		Help:        fmt.Sprintf("Number of seconds spent evaluating components that take longer than %v to evaluate", cm.slowComponentThreshold),
+		ConstLabels: map[string]string{"controller_id": id},
+	}, []string{"component_id"})
+
 	return cm
 }
 
+func (cm *controllerMetrics) onComponentEvaluationDone(name string, duration time.Duration) {
+	cm.componentEvaluationTime.Observe(duration.Seconds())
+	if duration >= cm.slowComponentThreshold {
+		cm.slowComponentEvaluationTime.WithLabelValues(name).Add(duration.Seconds())
+	}
+}
+
 func (cm *controllerMetrics) Collect(ch chan<- prometheus.Metric) {
 	cm.componentEvaluationTime.Collect(ch)
 	cm.controllerEvaluation.Collect(ch)
 	cm.dependenciesWaitTime.Collect(ch)
 	cm.evaluationQueueSize.Collect(ch)
+	cm.slowComponentEvaluationTime.Collect(ch)
 }
 
 func (cm *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
@@ -58,6 +86,7 @@ func (cm *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
 	cm.controllerEvaluation.Describe(ch)
 	cm.dependenciesWaitTime.Describe(ch)
 	cm.evaluationQueueSize.Describe(ch)
+	cm.slowComponentEvaluationTime.Describe(ch)
 }
 
 type controllerCollector struct {
diff --git a/production/grafana-agent-mixin/debugging.libsonnet b/production/grafana-agent-mixin/debugging.libsonnet
index be2ab983c8ef..711184d333f7 100644
--- a/production/grafana-agent-mixin/debugging.libsonnet
+++ b/production/grafana-agent-mixin/debugging.libsonnet
@@ -125,3 +125,4 @@ local g = import 'grafana-builder/grafana.libsonnet';
       ),
   },
 }
+