Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(Backport to r321) rules: Add new RuleEvaluationTimeSum field to groups (#15672) #807

Merged
merged 1 commit into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 48 additions & 14 deletions rules/group.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,20 +44,21 @@ import (

// Group is a set of rules that have a logical relation.
type Group struct {
name string
file string
interval time.Duration
queryOffset *time.Duration
limit int
rules []Rule
sourceTenants []string
seriesInPreviousEval []map[string]labels.Labels // One per Rule.
staleSeries []labels.Labels
opts *ManagerOptions
mtx sync.Mutex
evaluationTime time.Duration
lastEvaluation time.Time // Wall-clock time of most recent evaluation.
lastEvalTimestamp time.Time // Time slot used for most recent evaluation.
name string
file string
interval time.Duration
queryOffset *time.Duration
limit int
rules []Rule
sourceTenants []string
seriesInPreviousEval []map[string]labels.Labels // One per Rule.
staleSeries []labels.Labels
opts *ManagerOptions
mtx sync.Mutex
evaluationTime time.Duration // Time it took to evaluate the group.
evaluationRuleTimeSum time.Duration // Sum of time it took to evaluate each rule in the group.
lastEvaluation time.Time // Wall-clock time of most recent evaluation.
lastEvalTimestamp time.Time // Time slot used for most recent evaluation.

shouldRestore bool

Expand Down Expand Up @@ -119,6 +120,7 @@ func NewGroup(o GroupOptions) *Group {
metrics.EvalFailures.WithLabelValues(key)
metrics.GroupLastEvalTime.WithLabelValues(key)
metrics.GroupLastDuration.WithLabelValues(key)
metrics.GroupLastRuleDurationSum.WithLabelValues(key)
metrics.GroupRules.WithLabelValues(key).Set(float64(len(o.Rules)))
metrics.GroupSamples.WithLabelValues(key)
metrics.GroupInterval.WithLabelValues(key).Set(o.Interval.Seconds())
Expand Down Expand Up @@ -380,6 +382,28 @@ func (g *Group) setEvaluationTime(dur time.Duration) {
g.evaluationTime = dur
}

// GetRuleEvaluationTimeSum returns the sum of the time it took to evaluate each rule in the group irrespective of concurrency.
func (g *Group) GetRuleEvaluationTimeSum() time.Duration {
g.mtx.Lock()
defer g.mtx.Unlock()
return g.evaluationRuleTimeSum
}

// updateRuleEvaluationTimeSum updates evaluationRuleTimeSum which is the sum of the time it took to evaluate each rule in the group irrespective of concurrency.
// It collects the times from the rules themselves.
func (g *Group) updateRuleEvaluationTimeSum() {
var sum time.Duration
for _, rule := range g.rules {
sum += rule.GetEvaluationDuration()
}

g.metrics.GroupLastRuleDurationSum.WithLabelValues(GroupKey(g.file, g.name)).Set(sum.Seconds())

g.mtx.Lock()
defer g.mtx.Unlock()
g.evaluationRuleTimeSum = sum
}

// GetLastEvaluation returns the time the last evaluation of the rule group took place.
func (g *Group) GetLastEvaluation() time.Time {
g.mtx.Lock()
Expand Down Expand Up @@ -916,6 +940,7 @@ type Metrics struct {
GroupInterval *prometheus.GaugeVec
GroupLastEvalTime *prometheus.GaugeVec
GroupLastDuration *prometheus.GaugeVec
GroupLastRuleDurationSum *prometheus.GaugeVec
GroupLastRestoreDuration *prometheus.GaugeVec
GroupRules *prometheus.GaugeVec
GroupSamples *prometheus.GaugeVec
Expand Down Expand Up @@ -994,6 +1019,14 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
},
[]string{"rule_group"},
),
GroupLastRuleDurationSum: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "rule_group_last_rule_duration_sum_seconds",
Help: "The sum of time in seconds it took to evaluate each rule in the group regardless of concurrency. This should be higher than the group duration if rules are evaluated concurrently.",
},
[]string{"rule_group"},
),
GroupLastRestoreDuration: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Expand Down Expand Up @@ -1031,6 +1064,7 @@ func NewGroupMetrics(reg prometheus.Registerer) *Metrics {
m.GroupInterval,
m.GroupLastEvalTime,
m.GroupLastDuration,
m.GroupLastRuleDurationSum,
m.GroupLastRestoreDuration,
m.GroupRules,
m.GroupSamples,
Expand Down
1 change: 1 addition & 0 deletions rules/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ func DefaultEvalIterationFunc(ctx context.Context, g *Group, evalTimestamp time.
timeSinceStart := time.Since(start)

g.metrics.IterationDuration.Observe(timeSinceStart.Seconds())
g.updateRuleEvaluationTimeSum()
g.setEvaluationTime(timeSinceStart)
g.setLastEvaluation(start)
g.setLastEvalTimestamp(evalTimestamp)
Expand Down
12 changes: 8 additions & 4 deletions rules/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2222,14 +2222,16 @@ func TestAsyncRuleEvaluation(t *testing.T) {
require.Len(t, group.rules, ruleCount)

start := time.Now()
group.Eval(ctx, start)
DefaultEvalIterationFunc(ctx, group, start)

// Never expect more than 1 inflight query at a time.
require.EqualValues(t, 1, maxInflight.Load())
// Each rule should take at least 1 second to execute sequentially.
require.GreaterOrEqual(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds())
// Each rule produces one vector.
require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples))
// Group duration is higher than the sum of rule durations (group overhead).
require.GreaterOrEqual(t, group.GetEvaluationTime(), group.GetRuleEvaluationTimeSum())
}
})

Expand Down Expand Up @@ -2260,7 +2262,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
require.Len(t, group.rules, ruleCount)

start := time.Now()
group.Eval(ctx, start)
DefaultEvalIterationFunc(ctx, group, start)

// Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals.
require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load())
Expand Down Expand Up @@ -2298,7 +2300,7 @@ func TestAsyncRuleEvaluation(t *testing.T) {
require.Len(t, group.rules, ruleCount)

start := time.Now()
group.Eval(ctx, start)
DefaultEvalIterationFunc(ctx, group, start)

// Max inflight can be 1 synchronous eval and up to MaxConcurrentEvals concurrent evals.
require.EqualValues(t, opts.MaxConcurrentEvals+1, maxInflight.Load())
Expand Down Expand Up @@ -2337,14 +2339,16 @@ func TestAsyncRuleEvaluation(t *testing.T) {

start := time.Now()

group.Eval(ctx, start)
DefaultEvalIterationFunc(ctx, group, start)

// Max inflight can be up to MaxConcurrentEvals concurrent evals, since there is sufficient concurrency to run all rules at once.
require.LessOrEqual(t, int64(maxInflight.Load()), opts.MaxConcurrentEvals)
// Some rules should execute concurrently so should complete quicker.
require.Less(t, time.Since(start).Seconds(), (time.Duration(ruleCount) * artificialDelay).Seconds())
// Each rule produces one vector.
require.EqualValues(t, ruleCount, testutil.ToFloat64(group.metrics.GroupSamples))
// Group duration is less than the sum of rule durations
require.Less(t, group.GetEvaluationTime(), group.GetRuleEvaluationTimeSum())
}
})
}
Expand Down
Loading