Skip to content

Commit d8a3f92

Browse files
committed
metrics: migrate Prometheus metrics to pkg/metrics helpers
The pkg/metrics helpers provide critical functionality that raw prometheus.New*Vec functions lack: automatic metric cleanup for deleted pods, cardinality constraints, configurable high-cardinality labels, and automatic documentation generation. Migrating to these helpers ensures consistent metric management across the codebase and prevents unbounded cardinality growth.

Fixes #2798

Signed-off-by: Aritra Dey <[email protected]>
1 parent 27c9abe commit d8a3f92

File tree

13 files changed

+275
-155
lines changed

13 files changed

+275
-155
lines changed

docs/content/en/docs/reference/metrics.md

Lines changed: 9 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/bench/summary.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,8 @@ func (s *Summary) PrettyPrint() {
8484
getCounterValue(observer.RingbufLost),
8585
getCounterValue(observer.RingbufErrors))
8686

87-
mergePushed := getCounterValue(kprobemetrics.MergePushed)
88-
mergeOkTotal := getCounterValue(kprobemetrics.MergeOkTotal)
87+
mergePushed := getCounterValue(kprobemetrics.MergePushed.WithLabelValues())
88+
mergeOkTotal := getCounterValue(kprobemetrics.MergeOkTotal.WithLabelValues())
8989
fmt.Printf("Merged events: pushed=%d, ok=%d, errors=%d\n",
9090
mergePushed, mergeOkTotal, mergePushed-mergeOkTotal)
9191
}

pkg/grpc/tracing/stats.go

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,22 @@
44
package tracing
55

66
import (
7-
"github.com/prometheus/client_golang/prometheus"
7+
"maps"
8+
"slices"
89

910
"github.com/cilium/tetragon/pkg/metrics"
1011
"github.com/cilium/tetragon/pkg/metrics/consts"
1112
)
1213

1314
var (
14-
LoaderStats = prometheus.NewCounterVec(prometheus.CounterOpts{
15-
Namespace: consts.MetricsNamespace,
16-
Name: "process_loader_stats",
17-
Help: "Process Loader event statistics. For internal use only.",
18-
ConstLabels: nil,
19-
}, []string{"count"})
15+
LoaderStats = metrics.MustNewCounter(
16+
metrics.NewOpts(
17+
consts.MetricsNamespace, "", "process_loader_stats",
18+
"Process Loader event statistics. For internal use only.",
19+
nil, []metrics.ConstrainedLabel{{Name: "count", Values: slices.Collect(maps.Values(LoaderTypeStrings))}}, nil,
20+
),
21+
nil,
22+
)
2023
)
2124

2225
func RegisterMetrics(group metrics.Group) {

pkg/metrics/cgroupratemetrics/cgroupratemetrics.go

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
package cgroupratemetrics
55

66
import (
7+
"maps"
8+
"slices"
9+
710
"github.com/prometheus/client_golang/prometheus"
811

912
"github.com/cilium/tetragon/pkg/metrics"
@@ -39,12 +42,14 @@ func (e CgroupRateType) String() string {
3942
}
4043

4144
var (
42-
CgroupRateTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
43-
Namespace: consts.MetricsNamespace,
44-
Name: "cgroup_rate_total",
45-
Help: "The total number of Tetragon cgroup rate counters. For internal use only.",
46-
ConstLabels: nil,
47-
}, []string{"type"})
45+
CgroupRateTotal = metrics.MustNewCounter(
46+
metrics.NewOpts(
47+
consts.MetricsNamespace, "", "cgroup_rate_total",
48+
"The total number of Tetragon cgroup rate counters. For internal use only.",
49+
nil, []metrics.ConstrainedLabel{{Name: "type", Values: slices.Collect(maps.Values(totalLabelValues))}}, nil,
50+
),
51+
nil,
52+
)
4853
)
4954

5055
func RegisterMetrics(group metrics.Group) {

pkg/metrics/errormetrics/errormetrics.go

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
package errormetrics
55

66
import (
7+
"maps"
8+
"slices"
79
"strconv"
810

911
"github.com/prometheus/client_golang/prometheus"
@@ -57,19 +59,48 @@ func (e EventHandlerError) String() string {
5759
}
5860

5961
var (
60-
ErrorTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
61-
Namespace: consts.MetricsNamespace,
62-
Name: "errors_total",
63-
Help: "The total number of Tetragon errors. For internal use only.",
64-
ConstLabels: nil,
65-
}, []string{"type"})
66-
67-
HandlerErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
68-
Namespace: consts.MetricsNamespace,
69-
Name: "handler_errors_total",
70-
Help: "The total number of event handler errors. For internal use only.",
71-
ConstLabels: nil,
72-
}, []string{"opcode", "error_type"})
62+
// Constrained label for error type
63+
errorTypeLabel = metrics.ConstrainedLabel{
64+
Name: "type",
65+
Values: slices.Collect(maps.Values(errorTypeLabelValues)),
66+
}
67+
// Constrained label for opcode (numeric strings)
68+
opcodeLabel = metrics.ConstrainedLabel{
69+
Name: "opcode",
70+
Values: func() []string {
71+
res := make([]string, 0, len(ops.OpCodeStrings))
72+
for opcode := range ops.OpCodeStrings {
73+
if opcode != ops.MSG_OP_TEST {
74+
// include UNDEF (0) to represent unknown opcodes in docs/metrics
75+
res = append(res, strconv.Itoa(int(int32(opcode))))
76+
}
77+
}
78+
return res
79+
}(),
80+
}
81+
// Constrained label for handler error type
82+
handlerErrTypeLabel = metrics.ConstrainedLabel{
83+
Name: "error_type",
84+
Values: slices.Collect(maps.Values(eventHandlerErrorLabelValues)),
85+
}
86+
87+
ErrorTotal = metrics.MustNewCounter(
88+
metrics.NewOpts(
89+
consts.MetricsNamespace, "", "errors_total",
90+
"The total number of Tetragon errors. For internal use only.",
91+
nil, []metrics.ConstrainedLabel{errorTypeLabel}, nil,
92+
),
93+
nil,
94+
)
95+
96+
HandlerErrors = metrics.MustNewCounter(
97+
metrics.NewOpts(
98+
consts.MetricsNamespace, "", "handler_errors_total",
99+
"The total number of event handler errors. For internal use only.",
100+
nil, []metrics.ConstrainedLabel{opcodeLabel, handlerErrTypeLabel}, nil,
101+
),
102+
nil,
103+
)
73104
)
74105

75106
func RegisterMetrics(group metrics.Group) {

pkg/metrics/eventmetrics/eventmetrics.go

Lines changed: 43 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -41,42 +41,53 @@ var (
4141
)
4242

4343
var (
44-
EventsProcessed = metrics.MustNewGranularCounter[metrics.ProcessLabels](prometheus.CounterOpts{
45-
Namespace: consts.MetricsNamespace,
46-
Name: "events_total",
47-
Help: "The total number of Tetragon events",
48-
ConstLabels: nil,
49-
}, []string{"type"})
44+
// Preserve label name "type" while using constrained values from EventTypeLabel.
45+
eventTypeLabel = metrics.ConstrainedLabel{
46+
Name: "type",
47+
Values: append(slices.Clone(metrics.EventTypeLabel.Values), "unknown"),
48+
}
49+
50+
EventsProcessed = metrics.MustNewGranularCounterWithInit[metrics.ProcessLabels](
51+
metrics.NewOpts(
52+
consts.MetricsNamespace, "", "events_total",
53+
"The total number of Tetragon events",
54+
nil, []metrics.ConstrainedLabel{eventTypeLabel}, nil,
55+
),
56+
nil,
57+
)
5058
MissedEvents = metrics.MustNewCustomCounter(metrics.NewOpts(
5159
consts.MetricsNamespace, "bpf", "missed_events_total",
5260
"Number of Tetragon perf events that are failed to be sent from the kernel.",
5361
nil, []metrics.ConstrainedLabel{metrics.OpCodeLabel, perfEventErrorLabel}, nil,
5462
))
55-
FlagCount = prometheus.NewCounterVec(prometheus.CounterOpts{
56-
Namespace: consts.MetricsNamespace,
57-
Name: "flags_total",
58-
Help: "The total number of Tetragon flags. For internal use only.",
59-
ConstLabels: nil,
60-
}, []string{"type"})
61-
NotifyOverflowedEvents = prometheus.NewCounter(prometheus.CounterOpts{
62-
Namespace: consts.MetricsNamespace,
63-
Name: "notify_overflowed_events_total",
64-
Help: "The total number of events dropped because listener buffer was full",
65-
ConstLabels: nil,
66-
})
67-
68-
policyStats = metrics.MustNewGranularCounter[metrics.ProcessLabels](prometheus.CounterOpts{
69-
Namespace: consts.MetricsNamespace,
70-
Name: "policy_events_total",
71-
Help: "Policy events calls observed.",
72-
ConstLabels: nil,
73-
}, []string{"policy", "hook"})
74-
75-
missingProcessInfo = prometheus.NewCounter(prometheus.CounterOpts{
76-
Namespace: consts.MetricsNamespace,
77-
Name: "events_missing_process_info_total",
78-
Help: "Number of events missing process info.",
79-
})
63+
FlagCount = metrics.MustNewCounter(
64+
metrics.NewOpts(
65+
consts.MetricsNamespace, "", "flags_total",
66+
"The total number of Tetragon flags. For internal use only.",
67+
nil, nil, []metrics.UnconstrainedLabel{{Name: "type", ExampleValue: "unknown"}},
68+
),
69+
nil,
70+
)
71+
NotifyOverflowedEvents = metrics.MustNewCounter(metrics.NewOpts(
72+
consts.MetricsNamespace, "", "notify_overflowed_events_total",
73+
"The total number of events dropped because listener buffer was full",
74+
nil, nil, nil,
75+
), nil)
76+
77+
policyStats = metrics.MustNewGranularCounterWithInit[metrics.ProcessLabels](
78+
metrics.NewOpts(
79+
consts.MetricsNamespace, "", "policy_events_total",
80+
"Policy events calls observed.",
81+
nil, nil, []metrics.UnconstrainedLabel{{Name: "policy", ExampleValue: consts.ExamplePolicyLabel}, {Name: "hook", ExampleValue: consts.ExampleKprobeLabel}},
82+
),
83+
nil,
84+
)
85+
86+
missingProcessInfo = metrics.MustNewCounter(metrics.NewOpts(
87+
consts.MetricsNamespace, "", "events_missing_process_info_total",
88+
"Number of events missing process info.",
89+
nil, nil, nil,
90+
), nil)
8091
)
8192

8293
func RegisterHealthMetrics(group metrics.Group) {
@@ -122,7 +133,7 @@ func GetProcessInfo(process *tetragon.Process) (binary, pod, workload, namespace
122133
pod = process.Pod.Name
123134
}
124135
} else {
125-
missingProcessInfo.Inc()
136+
missingProcessInfo.WithLabelValues().Inc()
126137
}
127138
return binary, pod, workload, namespace
128139
}

pkg/metrics/kprobemetrics/kprobemetrics.go

Lines changed: 36 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
package kprobemetrics
55

66
import (
7+
"maps"
8+
"slices"
9+
710
"github.com/prometheus/client_golang/prometheus"
811

912
"github.com/cilium/tetragon/pkg/metrics"
@@ -27,24 +30,35 @@ func (t MergeErrorType) String() string {
2730
}
2831

2932
var (
30-
MergeErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
31-
Namespace: consts.MetricsNamespace,
32-
Name: "generic_kprobe_merge_errors_total",
33-
Help: "The total number of failed attempts to merge a kprobe and kretprobe event.",
34-
ConstLabels: nil,
35-
}, []string{"curr_fn", "curr_type", "prev_fn", "prev_type"})
36-
MergeOkTotal = prometheus.NewCounter(prometheus.CounterOpts{
37-
Namespace: consts.MetricsNamespace,
38-
Name: "generic_kprobe_merge_ok_total",
39-
Help: "The total number of successful attempts to merge a kprobe and kretprobe event.",
40-
ConstLabels: nil,
41-
})
42-
MergePushed = prometheus.NewCounter(prometheus.CounterOpts{
43-
Namespace: consts.MetricsNamespace,
44-
Name: "generic_kprobe_merge_pushed_total",
45-
Help: "The total number of pushed events for later merge.",
46-
ConstLabels: nil,
47-
})
33+
currTypeLabel = metrics.ConstrainedLabel{
34+
Name: "curr_type",
35+
Values: slices.Collect(maps.Values(mergeErrorTypeLabelValues)),
36+
}
37+
prevTypeLabel = metrics.ConstrainedLabel{
38+
Name: "prev_type",
39+
Values: slices.Collect(maps.Values(mergeErrorTypeLabelValues)),
40+
}
41+
42+
MergeErrors = metrics.MustNewCounter(
43+
metrics.NewOpts(
44+
consts.MetricsNamespace, "", "generic_kprobe_merge_errors_total",
45+
"The total number of failed attempts to merge a kprobe and kretprobe event.",
46+
nil,
47+
[]metrics.ConstrainedLabel{currTypeLabel, prevTypeLabel},
48+
[]metrics.UnconstrainedLabel{{Name: "curr_fn", ExampleValue: consts.ExampleKprobeLabel}, {Name: "prev_fn", ExampleValue: consts.ExampleKprobeLabel}},
49+
),
50+
nil,
51+
)
52+
MergeOkTotal = metrics.MustNewCounter(metrics.NewOpts(
53+
consts.MetricsNamespace, "", "generic_kprobe_merge_ok_total",
54+
"The total number of successful attempts to merge a kprobe and kretprobe event.",
55+
nil, nil, nil,
56+
), nil)
57+
MergePushed = metrics.MustNewCounter(metrics.NewOpts(
58+
consts.MetricsNamespace, "", "generic_kprobe_merge_pushed_total",
59+
"The total number of pushed events for later merge.",
60+
nil, nil, nil,
61+
), nil)
4862
)
4963

5064
func RegisterMetrics(group metrics.Group) {
@@ -57,15 +71,15 @@ func InitMetricsForDocs() {
5771
// Initialize metrics with example labels
5872
for _, curr := range mergeErrorTypeLabelValues {
5973
for _, prev := range mergeErrorTypeLabelValues {
60-
MergeErrors.WithLabelValues(consts.ExampleKprobeLabel, curr, consts.ExampleKprobeLabel, prev).Add(0)
74+
MergeErrors.WithLabelValues(curr, prev, consts.ExampleKprobeLabel, consts.ExampleKprobeLabel).Add(0)
6175
}
6276
}
6377
}
6478

6579
// Get a new handle on the mergeErrors metric for a current and previous function
6680
// name and probe type
6781
func GetMergeErrors(currFn, prevFn string, currType, prevType MergeErrorType) prometheus.Counter {
68-
return MergeErrors.WithLabelValues(currFn, prevFn, currType.String(), prevType.String())
82+
return MergeErrors.WithLabelValues(currType.String(), prevType.String(), currFn, prevFn)
6983
}
7084

7185
// Increment the mergeErrors metric for a current and previous function
@@ -75,9 +89,9 @@ func MergeErrorsInc(currFn, prevFn string, currType, prevType MergeErrorType) {
7589
}
7690

7791
func MergeOkTotalInc() {
78-
MergeOkTotal.Inc()
92+
MergeOkTotal.WithLabelValues().Inc()
7993
}
8094

8195
func MergePushedInc() {
82-
MergePushed.Inc()
96+
MergePushed.WithLabelValues().Inc()
8397
}

0 commit comments

Comments
 (0)