Skip to content

Commit e0ade0d

Browse files
NETOBSERV-2411: Fix alerts with empty labels (like namespace) (#2109)
* remove empty labels * address feedback
1 parent 90bb12f commit e0ade0d

File tree

5 files changed

+136
-17
lines changed

5 files changed

+136
-17
lines changed

internal/pkg/metrics/alerts/alerts.go

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,9 @@ func (rb *ruleBuilder) kernelDrops() (*monitoringv1.Rule, error) {
5151
)
5252

5353
metric, totalMetric := rb.getMetricsForAlert()
54-
metricsRate := promQLRateFromMetric(metric, "", "", "2m", "")
55-
totalRate := promQLRateFromMetric(totalMetric, "", "", "2m", "")
54+
filter := rb.buildLabelFilter("")
55+
metricsRate := promQLRateFromMetric(metric, "", filter, "2m", "")
56+
totalRate := promQLRateFromMetric(totalMetric, "", filter, "2m", "")
5657
metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "")
5758
totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "")
5859
promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold)
@@ -118,8 +119,9 @@ func (rb *ruleBuilder) ipsecErrors() (*monitoringv1.Rule, error) {
118119
)
119120

120121
metric, totalMetric := rb.getMetricsForAlert()
121-
metricsRate := promQLRateFromMetric(metric, "", "", "2m", "")
122-
totalRate := promQLRateFromMetric(totalMetric, "", "", "2m", "")
122+
filter := rb.buildLabelFilter("")
123+
metricsRate := promQLRateFromMetric(metric, "", filter, "2m", "")
124+
totalRate := promQLRateFromMetric(totalMetric, "", filter, "2m", "")
123125
metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "")
124126
totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "")
125127
promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold)
@@ -140,8 +142,10 @@ func (rb *ruleBuilder) dnsErrors() (*monitoringv1.Rule, error) {
140142
)
141143

142144
metric, totalMetric := rb.getMetricsForAlert()
143-
metricsRate := promQLRateFromMetric(metric, "_count", `{DnsFlagsResponseCode!="NoError"}`, "2m", "")
144-
totalRate := promQLRateFromMetric(totalMetric, "_count", "", "2m", "")
145+
metricsFilter := rb.buildLabelFilter(`DnsFlagsResponseCode!="NoError"`)
146+
totalFilter := rb.buildLabelFilter("")
147+
metricsRate := promQLRateFromMetric(metric, "_count", metricsFilter, "2m", "")
148+
totalRate := promQLRateFromMetric(totalMetric, "_count", totalFilter, "2m", "")
145149
metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "")
146150
totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "")
147151
promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold)
@@ -160,8 +164,10 @@ func (rb *ruleBuilder) netpolDenied() (*monitoringv1.Rule, error) {
160164
)
161165

162166
metric, totalMetric := rb.getMetricsForAlert()
163-
metricsRate := promQLRateFromMetric(metric, "", `{action="drop"}`, "2m", "")
164-
totalRate := promQLRateFromMetric(totalMetric, "", "", "2m", "")
167+
metricsFilter := rb.buildLabelFilter(`action="drop"`)
168+
totalFilter := rb.buildLabelFilter("")
169+
metricsRate := promQLRateFromMetric(metric, "", metricsFilter, "2m", "")
170+
totalRate := promQLRateFromMetric(totalMetric, "", totalFilter, "2m", "")
165171
metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "")
166172
totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "")
167173
promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold)
@@ -180,8 +186,9 @@ func (rb *ruleBuilder) latencyTrend() (*monitoringv1.Rule, error) {
180186
)
181187

182188
metric, baseline := rb.getMetricsForAlert()
183-
metricsRate := promQLRateFromMetric(metric, "_bucket", "", "2m", "")
184-
baselineRate := promQLRateFromMetric(baseline, "_bucket", "", duration, " offset "+offset)
189+
filter := rb.buildLabelFilter("")
190+
metricsRate := promQLRateFromMetric(metric, "_bucket", filter, "2m", "")
191+
baselineRate := promQLRateFromMetric(baseline, "_bucket", filter, duration, " offset "+offset)
185192
metricQuantile := histogramQuantile(metricsRate, rb.alert.GroupBy, rb.side, "0.9")
186193
baselineQuantile := histogramQuantile(baselineRate, rb.alert.GroupBy, rb.side, "0.9")
187194
promql := baselineIncreasePromQL(metricQuantile, baselineQuantile, rb.threshold, rb.upperThreshold)

internal/pkg/metrics/alerts/alerts_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -280,11 +280,11 @@ func TestLatencyPromql(t *testing.T) {
280280
assert.Equal(t,
281281
`100 * `+
282282
`((histogram_quantile(0.9, `+
283-
`sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket[2m]), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le)))`+
283+
`sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!=""}[2m]), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le)))`+
284284
` - (histogram_quantile(0.9, `+
285-
`sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket[2h] offset 24h), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le))))`+
285+
`sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!=""}[2h] offset 24h), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le))))`+
286286
` / (histogram_quantile(0.9, `+
287-
`sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket[2h] offset 24h), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le)))`+
287+
`sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!=""}[2h] offset 24h), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le)))`+
288288
` > 100`,
289289
rules[0].Expr.StrVal,
290290
)

internal/pkg/metrics/alerts/builder.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,33 @@ func buildLabels(severity string, forHealth bool) map[string]string {
214214
return m
215215
}
216216

217+
func (rb *ruleBuilder) buildLabelFilter(additionalFilter string) string {
218+
var filters []string
219+
220+
// Build label matchers to filter out metrics where K8s labels don't exist or are empty
221+
// This prevents alerts from firing with empty namespace/workload/node labels
222+
switch rb.alert.GroupBy {
223+
case flowslatest.GroupByNode:
224+
filters = append(filters, string(rb.side)+`K8S_HostName!=""`)
225+
case flowslatest.GroupByNamespace:
226+
filters = append(filters, string(rb.side)+`K8S_Namespace!=""`)
227+
case flowslatest.GroupByWorkload:
228+
filters = append(filters, string(rb.side)+`K8S_Namespace!=""`)
229+
filters = append(filters, string(rb.side)+`K8S_OwnerName!=""`)
230+
filters = append(filters, string(rb.side)+`K8S_OwnerType!=""`)
231+
}
232+
233+
// Add additional business logic filters
234+
if additionalFilter != "" {
235+
filters = append(filters, additionalFilter)
236+
}
237+
238+
if len(filters) == 0 {
239+
return ""
240+
}
241+
return "{" + strings.Join(filters, ",") + "}"
242+
}
243+
217244
func (rb *ruleBuilder) getMetricsForAlert() (string, string) {
218245
var reqMetric1, reqMetric2 string
219246
reqMetrics1, reqMetrics2 := flowslatest.GetElligibleMetricsForAlert(rb.template, rb.alert)
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
package alerts
2+
3+
import (
4+
"testing"
5+
6+
flowslatest "github.com/netobserv/network-observability-operator/api/flowcollector/v1beta2"
7+
"github.com/stretchr/testify/assert"
8+
)
9+
10+
func TestBuildLabelFilter(t *testing.T) {
11+
// Test GroupByNode with source side
12+
rb := &ruleBuilder{
13+
alert: &flowslatest.AlertVariant{
14+
GroupBy: flowslatest.GroupByNode,
15+
},
16+
side: asSource,
17+
}
18+
filter := rb.buildLabelFilter("")
19+
assert.Equal(t, `{SrcK8S_HostName!=""}`, filter)
20+
21+
// Test GroupByNode with destination side
22+
rb.side = asDest
23+
filter = rb.buildLabelFilter("")
24+
assert.Equal(t, `{DstK8S_HostName!=""}`, filter)
25+
26+
// Test GroupByNamespace
27+
rb.alert.GroupBy = flowslatest.GroupByNamespace
28+
rb.side = asSource
29+
filter = rb.buildLabelFilter("")
30+
assert.Equal(t, `{SrcK8S_Namespace!=""}`, filter)
31+
32+
// Test GroupByWorkload
33+
rb.alert.GroupBy = flowslatest.GroupByWorkload
34+
rb.side = asDest
35+
filter = rb.buildLabelFilter("")
36+
assert.Equal(t, `{DstK8S_Namespace!="",DstK8S_OwnerName!="",DstK8S_OwnerType!=""}`, filter)
37+
38+
// Test with additional filter
39+
rb.alert.GroupBy = flowslatest.GroupByNamespace
40+
rb.side = asSource
41+
filter = rb.buildLabelFilter(`DnsFlagsResponseCode!="NoError"`)
42+
assert.Equal(t, `{SrcK8S_Namespace!="",DnsFlagsResponseCode!="NoError"}`, filter)
43+
44+
// Test with action filter (netpol)
45+
rb.alert.GroupBy = flowslatest.GroupByWorkload
46+
rb.side = asDest
47+
filter = rb.buildLabelFilter(`action="drop"`)
48+
assert.Equal(t, `{DstK8S_Namespace!="",DstK8S_OwnerName!="",DstK8S_OwnerType!="",action="drop"}`, filter)
49+
50+
// Test no grouping (global)
51+
rb.alert.GroupBy = ""
52+
rb.side = ""
53+
filter = rb.buildLabelFilter("")
54+
assert.Equal(t, "", filter)
55+
56+
// Test no grouping with additional filter
57+
filter = rb.buildLabelFilter(`DnsFlagsResponseCode!="NoError"`)
58+
assert.Equal(t, `{DnsFlagsResponseCode!="NoError"}`, filter)
59+
}

internal/pkg/metrics/alerts/promql_test.go

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,46 @@ import (
88
)
99

1010
func TestSumBy(t *testing.T) {
11-
pql := sumBy("rate(my_metric[1m])", flowslatest.GroupByNode, asSource, "")
11+
// Test GroupByNode with source side - filters are now added via buildLabelFilter before promQLRateFromMetric
12+
// sumBy should only handle label replacement and aggregation
13+
pql := sumBy(`rate(my_metric{SrcK8S_HostName!=""}[1m])`, flowslatest.GroupByNode, asSource, "")
1214
assert.Equal(t,
13-
`sum(label_replace(rate(my_metric[1m]), "node", "$1", "SrcK8S_HostName", "(.*)")) by (node)`,
15+
`sum(label_replace(rate(my_metric{SrcK8S_HostName!=""}[1m]), "node", "$1", "SrcK8S_HostName", "(.*)")) by (node)`,
1416
pql,
1517
)
18+
assert.Contains(t, pql, `SrcK8S_HostName!=""`, "should preserve filters from input")
1619

17-
pql = sumBy("rate(my_metric[1m])", flowslatest.GroupByWorkload, asDest, "")
20+
// Test GroupByWorkload with destination side - filters come from buildLabelFilter
21+
pql = sumBy(`rate(my_metric{DstK8S_Namespace!="",DstK8S_OwnerName!="",DstK8S_OwnerType!=""}[1m])`, flowslatest.GroupByWorkload, asDest, "")
1822
assert.Equal(t,
19-
`sum(label_replace(label_replace(label_replace(rate(my_metric[1m]), "namespace", "$1", "DstK8S_Namespace", "(.*)"), "workload", "$1", "DstK8S_OwnerName", "(.*)"), "kind", "$1", "DstK8S_OwnerType", "(.*)")) by (namespace,workload,kind)`,
23+
`sum(label_replace(label_replace(label_replace(rate(my_metric{DstK8S_Namespace!="",DstK8S_OwnerName!="",DstK8S_OwnerType!=""}[1m]), "namespace", "$1", "DstK8S_Namespace", "(.*)"), "workload", "$1", "DstK8S_OwnerName", "(.*)"), "kind", "$1", "DstK8S_OwnerType", "(.*)")) by (namespace,workload,kind)`,
2024
pql,
2125
)
26+
assert.Contains(t, pql, `DstK8S_Namespace!=""`, "should preserve DstK8S_Namespace filter from input")
27+
assert.Contains(t, pql, `DstK8S_OwnerName!=""`, "should preserve DstK8S_OwnerName filter from input")
28+
assert.Contains(t, pql, `DstK8S_OwnerType!=""`, "should preserve DstK8S_OwnerType filter from input")
2229

30+
// Test GroupByNamespace - filters from buildLabelFilter
31+
pql = sumBy(`rate(my_metric{DstK8S_Namespace!=""}[1m])`, flowslatest.GroupByNamespace, asDest, "")
32+
assert.Equal(t,
33+
`sum(label_replace(rate(my_metric{DstK8S_Namespace!=""}[1m]), "namespace", "$1", "DstK8S_Namespace", "(.*)")) by (namespace)`,
34+
pql,
35+
)
36+
assert.Contains(t, pql, `DstK8S_Namespace!=""`, "should preserve namespace filter from input")
37+
38+
// Test no grouping - should NOT add any filters
2339
pql = sumBy("rate(my_metric[1m])", "", "", "")
2440
assert.Equal(t, `sum(rate(my_metric[1m]))`, pql)
41+
assert.NotContains(t, pql, "K8S_", "should not add K8s label filters when not grouping")
42+
43+
// Test with existing label selector (like DNS errors) - filters are merged in buildLabelFilter
44+
pql = sumBy(`rate(my_metric{DstK8S_Namespace!="",DnsFlagsResponseCode!="NoError"}[1m])`, flowslatest.GroupByNamespace, asDest, "")
45+
assert.Equal(t,
46+
`sum(label_replace(rate(my_metric{DstK8S_Namespace!="",DnsFlagsResponseCode!="NoError"}[1m]), "namespace", "$1", "DstK8S_Namespace", "(.*)")) by (namespace)`,
47+
pql,
48+
)
49+
assert.Contains(t, pql, `DstK8S_Namespace!=""`, "should preserve namespace filter")
50+
assert.Contains(t, pql, `DnsFlagsResponseCode!="NoError"`, "should preserve business logic filter")
2551
}
2652

2753
func TestPercentagePromQL(t *testing.T) {

0 commit comments

Comments
 (0)