Skip to content

Commit

Permalink
Improve visibility of slow component evaluations & failed components …
Browse files Browse the repository at this point in the history
…in general (grafana#5843)

* Improve visibility of slow component evaluations

* rename metric

* Add alerts and dash

* fix-name

* fix queries to > 0

* changelog

* goimport
  • Loading branch information
thampiotr authored and hainenber committed Nov 25, 2023
1 parent a2654f8 commit 98ecc9f
Show file tree
Hide file tree
Showing 7 changed files with 87 additions and 12 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ Main (unreleased)

- Grafana Agent on Windows now automatically restarts on failure. (@hainenber)

- Added metrics, alerts and dashboard visualisations to help diagnose issues
with unhealthy components and components that take too long to evaluate. (@thampiotr)

### Bugfixes

- Permit `X-Faro-Session-ID` header in CORS requests for the `faro.receiver`
Expand Down
1 change: 1 addition & 0 deletions operations/agent-flow-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
prometheusAlerts+: {
groups+: [
(import './alerts/clustering.libsonnet'),
(import './alerts/controller.libsonnet'),
],
},
}
22 changes: 22 additions & 0 deletions operations/agent-flow-mixin/alerts/controller.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Alerting rules for the Flow component controller.
// NOTE(review): this import uses a '.jsonnet' extension while the rest of the
// mixin appears to use '.libsonnet' — confirm the utils file name on disk.
local alert = import './utils/alert.jsonnet';

// Component evaluations that exceed the slow-evaluation threshold can lead to
// stale data downstream, e.g. out-of-date scrape targets.
local slowEvaluations = alert.newRule(
  'SlowComponentEvaluations',
  'sum by (cluster, namespace, component_id) (rate(agent_component_evaluation_slow_seconds[10m])) > 0',
  'Flow component evaluations are taking too long.',
  '15m',
);

// Any component reporting a non-healthy state should be investigated.
local unhealthyComponents = alert.newRule(
  'UnhealthyComponents',
  'sum(agent_component_controller_running_components{health_type!="healthy"}) > 0',
  'Unhealthy Flow components detected.',
  '15m',
);

alert.newGroup('agent_controller', [slowEvaluations, unhealthyComponents])
26 changes: 24 additions & 2 deletions operations/agent-flow-mixin/dashboards/controller.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,28 @@ local filename = 'agent-flow-controller.json';
])
),

// Slow components evaluation time %
//
// Timeseries panel showing, per component, the share of overall component
// evaluation time that was spent in "slow" evaluations (those counted by the
// agent_component_evaluation_slow_seconds counter). Rendered as 'percentunit',
// so a value near 1 means evaluation time is dominated by that component.
// NOTE(review): the |||-delimited text blocks below lost their original
// indentation in this view; the strings themselves must not be altered.
(
panel.new(title='Slow components evaluation times', type='timeseries') +
panel.withUnit('percentunit') +
panel.withDescription(|||
The percentage of time spent evaluating 'slow' components - components that took longer than 1 minute to evaluate.
Ideally, no component should take more than 1 minute to evaluate. The components displayed in this chart
may be a sign of a problem with the pipeline.
|||) +
panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) +
panel.withQueries([
// Per-component slow-evaluation seconds divided by the scalar total of all
// evaluation seconds, yielding each component's fraction of evaluation time.
panel.newQuery(
expr=|||
sum by (component_id) (rate(agent_component_evaluation_slow_seconds{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))
/ scalar(sum(rate(agent_component_evaluation_seconds_sum{cluster="$cluster", namespace="$namespace"}[$__rate_interval])))
|||,
legendFormat='{{component_id}}',
),
])
),

// Component evaluation histogram
(
panel.newHeatmap('Component evaluation histogram') +
Expand All @@ -242,7 +264,7 @@ local filename = 'agent-flow-controller.json';
The goal is to design your config so that evaluations take as little
time as possible; under 100ms is a good goal.
|||) +
panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) +
panel.withPosition({ x: 0, y: 22, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr='sum by (le) (increase(agent_component_evaluation_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',
Expand All @@ -261,7 +283,7 @@ local filename = 'agent-flow-controller.json';
The goal is to design your config so that most of the time components do not
queue for long; under 10ms is a good goal.
|||) +
panel.withPosition({ x: 0, y: 22, w: 8, h: 10 }) +
panel.withPosition({ x: 8, y: 22, w: 8, h: 10 }) +
panel.withQueries([
panel.newQuery(
expr='sum by (le) (increase(agent_component_dependencies_wait_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',
Expand Down
7 changes: 2 additions & 5 deletions pkg/flow/internal/controller/loader.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,7 @@ func (l *Loader) Apply(args map[string]any, componentBlocks []*ast.BlockStmt, co
defer func() {
span.SetStatus(codes.Ok, "")

duration := time.Since(start)
level.Info(logger).Log("msg", "finished complete graph evaluation", "duration", duration)
l.cm.componentEvaluationTime.Observe(duration.Seconds())
level.Info(logger).Log("msg", "finished complete graph evaluation", "duration", time.Since(start))
}()

l.cache.ClearModuleExports()
Expand Down Expand Up @@ -234,7 +232,6 @@ func (l *Loader) Apply(args map[string]any, componentBlocks []*ast.BlockStmt, co
l.graph = &newGraph
l.cache.SyncIDs(componentIDs)
l.blocks = componentBlocks
l.cm.componentEvaluationTime.Observe(time.Since(start).Seconds())
if l.globals.OnExportsChange != nil && l.cache.ExportChangeIndex() != l.moduleExportIndex {
l.moduleExportIndex = l.cache.ExportChangeIndex()
l.globals.OnExportsChange(l.cache.CreateModuleExports())
Expand Down Expand Up @@ -652,8 +649,8 @@ func (l *Loader) concurrentEvalFn(n dag.Node, spanCtx context.Context, tracer tr

defer func() {
duration := time.Since(start)
l.cm.onComponentEvaluationDone(n.NodeID(), duration)
level.Info(l.log).Log("msg", "finished node evaluation", "node_id", n.NodeID(), "duration", duration)
l.cm.componentEvaluationTime.Observe(duration.Seconds())
}()

var err error
Expand Down
39 changes: 34 additions & 5 deletions pkg/flow/internal/controller/metrics.go
Original file line number Diff line number Diff line change
@@ -1,20 +1,32 @@
package controller

import (
"fmt"
"time"

"github.com/prometheus/client_golang/prometheus"
)

// controllerMetrics contains the metrics for components controller
type controllerMetrics struct {
controllerEvaluation prometheus.Gauge
componentEvaluationTime prometheus.Histogram
dependenciesWaitTime prometheus.Histogram
evaluationQueueSize prometheus.Gauge
controllerEvaluation prometheus.Gauge
componentEvaluationTime prometheus.Histogram
dependenciesWaitTime prometheus.Histogram
evaluationQueueSize prometheus.Gauge
slowComponentThreshold time.Duration
slowComponentEvaluationTime *prometheus.CounterVec
}

// newControllerMetrics inits the metrics for the components controller
func newControllerMetrics(id string) *controllerMetrics {
cm := &controllerMetrics{}
cm := &controllerMetrics{
slowComponentThreshold: 1 * time.Minute,
}

// The evaluation time becomes particularly problematic in the range of 30s+, so add more buckets
// that can help spot issues in that range.
// Use the following buckets: 5ms, 25ms, 100ms, 500ms, 1s, 5s, 10s, 30s, 1m, 2m, 5m, 10m
evaluationTimesBuckets := []float64{.005, .025, .1, .5, 1, 5, 10, 30, 60, 120, 300, 600}

cm.controllerEvaluation = prometheus.NewGauge(prometheus.GaugeOpts{
Name: "agent_component_controller_evaluating",
Expand All @@ -27,13 +39,15 @@ func newControllerMetrics(id string) *controllerMetrics {
Name: "agent_component_evaluation_seconds",
Help: "Time spent performing component evaluation",
ConstLabels: map[string]string{"controller_id": id},
Buckets: evaluationTimesBuckets,
},
)
cm.dependenciesWaitTime = prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "agent_component_dependencies_wait_seconds",
Help: "Time spent by components waiting to be evaluated after their dependency is updated.",
ConstLabels: map[string]string{"controller_id": id},
Buckets: evaluationTimesBuckets,
},
)

Expand All @@ -43,21 +57,36 @@ func newControllerMetrics(id string) *controllerMetrics {
ConstLabels: map[string]string{"controller_id": id},
})

cm.slowComponentEvaluationTime = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "agent_component_evaluation_slow_seconds",
Help: fmt.Sprintf("Number of seconds spent evaluating components that take longer than %v to evaluate", cm.slowComponentThreshold),
ConstLabels: map[string]string{"controller_id": id},
}, []string{"component_id"})

return cm
}

// onComponentEvaluationDone records metrics for one completed component
// evaluation: every evaluation is observed in the duration histogram, and
// evaluations at or above the slow-component threshold also accumulate their
// duration into the per-component slow-evaluation counter.
func (cm *controllerMetrics) onComponentEvaluationDone(name string, duration time.Duration) {
	seconds := duration.Seconds()
	cm.componentEvaluationTime.Observe(seconds)
	if duration < cm.slowComponentThreshold {
		return
	}
	cm.slowComponentEvaluationTime.WithLabelValues(name).Add(seconds)
}

// Collect implements prometheus.Collector by forwarding the collection
// request to every metric owned by the controller.
func (cm *controllerMetrics) Collect(ch chan<- prometheus.Metric) {
	for _, collector := range []prometheus.Collector{
		cm.componentEvaluationTime,
		cm.controllerEvaluation,
		cm.dependenciesWaitTime,
		cm.evaluationQueueSize,
		cm.slowComponentEvaluationTime,
	} {
		collector.Collect(ch)
	}
}

// Describe implements prometheus.Collector by forwarding the description
// request to every metric owned by the controller.
func (cm *controllerMetrics) Describe(ch chan<- *prometheus.Desc) {
	for _, collector := range []prometheus.Collector{
		cm.componentEvaluationTime,
		cm.controllerEvaluation,
		cm.dependenciesWaitTime,
		cm.evaluationQueueSize,
		cm.slowComponentEvaluationTime,
	} {
		collector.Describe(ch)
	}
}

type controllerCollector struct {
Expand Down
1 change: 1 addition & 0 deletions production/grafana-agent-mixin/debugging.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,4 @@ local g = import 'grafana-builder/grafana.libsonnet';
),
},
}

0 comments on commit 98ecc9f

Please sign in to comment.