Skip to content

Commit

Permalink
rules: Add new RuleEvaluationTimeSum field to groups (#15672)
Browse files Browse the repository at this point in the history
* feat(ruler): Add new `RuleEvaluationTimeSum` field to groups
Coupled with a metric: `rule_group_last_rule_duration_sum_seconds`

This will give us more observability into how fast a group runs with or without concurrency

Signed-off-by: Julien Duchesne <[email protected]>

* Update rules/group.go

Co-authored-by: gotjosh <[email protected]>
Signed-off-by: Julien Duchesne <[email protected]>
Signed-off-by: Julien Duchesne <[email protected]>

* Apply suggestions from code review

Co-authored-by: gotjosh <[email protected]>
Signed-off-by: Julien Duchesne <[email protected]>
Signed-off-by: Julien Duchesne <[email protected]>

* Remove `in seconds`. A duration is a duration

Signed-off-by: Julien Duchesne <[email protected]>

---------

Signed-off-by: Julien Duchesne <[email protected]>
Signed-off-by: Julien Duchesne <[email protected]>
Co-authored-by: gotjosh <[email protected]>
  • Loading branch information
julienduchesne and gotjosh committed Dec 17, 2024
1 parent 0a0a416 commit 49582ed
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 18 deletions.
62 changes: 48 additions & 14 deletions rules/group.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,21 @@ import (

// Group is a set of rules that have a logical relation.
type Group struct {
name string
file string
interval time.Duration
queryOffset *time.Duration
limit int
rules []Rule
sourceTenants []string
seriesInPreviousEval []map[string]labels.Labels // One per Rule.
staleSeries []labels.Labels
opts *ManagerOptions
mtx sync.Mutex
evaluationTime time.Duration
lastEvaluation time.Time // Wall-clock time of most recent evaluation.
lastEvalTimestamp time.Time // Time slot used for most recent evaluation.
name string
file string
interval time.Duration
queryOffset *time.Duration
limit int
rules []Rule
sourceTenants []string
seriesInPreviousEval []map[string]labels.Labels // One per Rule.
staleSeries []labels.Labels
opts *ManagerOptions
mtx sync.Mutex
evaluationTime time.Duration // Time it took to evaluate the group.
evaluationRuleTimeSum time.Duration // Sum of time it took to evaluate each rule in the group.
lastEvaluation time.Time // Wall-clock time of most recent evaluation.
lastEvalTimestamp time.Time // Time slot used for most recent evaluation.

shouldRestore bool

Expand Down Expand Up @@ -119,6 +120,7 @@ func NewGroup(o GroupOptions) *Group {
metrics.EvalFailures.WithLabelValues(key)
metrics.GroupLastEvalTime.WithLabelValues(key)
metrics.GroupLastDuration.WithLabelValues(key)
metrics.GroupLastRuleDurationSum.WithLabelValues(key)
metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules)))
metrics.GroupSamples.WithLabelValues(key)
metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
Expand Down Expand Up @@ -380,6 +382,28 @@ func (g *Group) setEvaluationTime(dur time.Duration) {
g.evaluationTime = dur
}

// GetRuleEvaluationTimeSum returns the sum of the time it took to evaluate each rule in the group irrespective of concurrency.
func (g *Group) GetRuleEvaluationTimeSum() time.Duration {
g.mtx.Lock()
defer g.mtx.Unlock()
return g.evaluationRuleTimeSum
}

// updateRuleEvaluationTimeSum updates evaluationRuleTimeSum which is the sum of the time it took to evaluate each rule in the group irrespective of concurrency.
// It collects the times from the rules themselves.
func (g *Group) updateRuleEvaluationTimeSum() {
var sum time.Duration
for _, rule := range g.rules {
sum += rule.GetEvaluationDuration()
}

g.metrics.GroupLastRuleDurationSum.WithLabelValues(GroupKey(g.file, g.name)).Set(sum.Seconds())

g.mtx.Lock()
defer g.mtx.Unlock()
g.evaluationRuleTimeSum = sum
}

// GetLastEvaluation returns the time the last evaluation of the rule group took place.
func (g *Group) GetLastEvaluation() time.Time {
g.mtx.Lock()
Expand Down Expand Up @@ -916,6 +940,7 @@ type Metrics struct {
GroupInterval *prometheus.GaugeVec
GroupLastEvalTime *prometheus.GaugeVec
GroupLastDuration *prometheus.GaugeVec
GroupLastRuleDurationSum *prometheus.GaugeVec
GroupLastRestoreDuration *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec
Expand Down Expand Up @@ -994,6 +1019,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
},
[]string{"rule_group"},
),
GroupLastRuleDurationSum: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "rule_group_last_rule_duration_sum_seconds",
Help: "The sum of time in seconds it took to evaluate each rule in the group regardless of concurrency. This should be higher than the group duration if rules are evaluated concurrently.",
},
[]string{"rule_group"},
),
GroupLastRestoreDuration: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Expand Down Expand Up @@ -1031,6 +1064,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
m.GroupInterval,
m.GroupLastEvalTime,
m.GroupLastDuration,
m.GroupLastRuleDurationSum,
m.GroupLastRestoreDuration,
m.GroupRules,
m.GroupSamples,
Expand Down
1 change: 1 addition & 0 deletions rules/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ func DefaultEvalIterationFunc(ctx context.Context, g *Group, evalTimestamp time.
timeSinceStart := time.Since(start)

g.metrics.IterationDuration.Observe(timeSinceStart.Seconds())
g.updateRuleEvaluationTimeSum()
g.setEvaluationTime(timeSinceStart)
g.setLastEvaluation(start)
g.setLastEvalTimestamp(evalTimestamp)
Expand Down
12 changes: 8 additions & 4 deletions rules/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2222,14 +2222,16 @@ func TestAsyncRuleEvaluation(t *testing.T) {
require.Len(t, group.rules, ruleCount)

start := time.Now()
group.Eval(ctx, start)
DefaultEvalIterationFunc(ctx, group, start)

// Never expect more than 1 inflight query at a time.
require.EqualValues(t, 1, maxInflight.Load())
// Each rule should take at least 1 second to execute sequentially.
require.GreaterOrEqual(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds())
// Each rule produces one vector.
require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples))
// Group duration is higher than the sum of rule durations (group overhead).
require.GreaterOrEqual(t, group.GetEvaluationTime(), group.GetRuleEvaluationTimeSum())
}
})

Expand Down Expand Up @@ -2260,7 +2262,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
require.Len(t, group.rules, ruleCount)

start := time.Now()
group.Eval(ctx, start)
DefaultEvalIterationFunc(ctx, group, start)

// Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals.
require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load())
Expand Down Expand Up @@ -2298,7 +2300,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
require.Len(t, group.rules, ruleCount)

start := time.Now()
group.Eval(ctx, start)
DefaultEvalIterationFunc(ctx, group, start)

// Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals.
require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load())
Expand Down Expand Up @@ -2337,14 +2339,16 @@ func TestAsyncRuleEvaluation(t *testing.T) {

start := time.Now()

group.Eval(ctx, start)
DefaultEvalIterationFunc(ctx, group, start)

// Max inflight can be up to MaxConcurrentEvals concurrent evals, since there is sufficient concurrency to run all rules at once.
require.LessOrEqual(t, int64(maxInflight.Load()), opts.MaxConcurrentEvals)
// Some rules should execute concurrently so should complete quicker.
require.Less(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds())
// Each rule produces one vector.
require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples))
// Group duration is less than the sum of rule durations
require.Less(t, group.GetEvaluationTime(), group.GetRuleEvaluationTimeSum())
}
})
}
Expand Down

0 comments on commit 49582ed

Please sign in to comment.