@@ -225,71 +225,11 @@ func (eh ConfigureMonitoringEventHandler) updatePrometheusConfigMap(eventData ke
225
225
// <service>.<project>-<stage>
226
226
createScrapeJobConfig (scrapeConfig , config , eventData .Project , stage .Name , eventData .Service , false , false )
227
227
228
- // fetch SLOs for the given service and stage
229
- slos , err := retrieveSLOs (eventData , stage .Name , eh .logger )
230
- if err != nil || slos == nil {
231
- eh .logger .Info ("No SLO file found for stage " + stage .Name + ". No alerting rules created for this stage" )
232
- continue
233
- }
234
-
235
- // Create or update alerting group
236
- var alertingGroupConfig * alertingGroup
237
- alertingGroupName := eventData .Service + " " + eventData .Project + "-" + stage .Name + " alerts"
238
- alertingGroupConfig = getAlertingGroup (& alertingRulesConfig , alertingGroupName )
239
- if alertingGroupConfig == nil {
240
- alertingGroupConfig = & alertingGroup {
241
- Name : alertingGroupName ,
242
- }
243
- alertingRulesConfig .Groups = append (alertingRulesConfig .Groups , alertingGroupConfig )
244
- }
245
-
246
- for _ , objective := range slos .Objectives {
228
+ alertingRulesConfig , err = eh .createPrometheusAlertsIfSLOsAndRemediationDefined (eventData , stage ,
229
+ alertingRulesConfig )
247
230
248
- expr , err := eh .getSLIQuery (eventData .Project , stage .Name , eventData .Service , objective .SLI , slos .Filter )
249
- if err != nil || expr == "" {
250
- eh .logger .Error ("No query defined for SLI " + objective .SLI + " in project " + eventData .Project )
251
- continue
252
- }
253
-
254
- if objective .Pass != nil {
255
- for _ , criteriaGroup := range objective .Pass {
256
- for _ , criteria := range criteriaGroup .Criteria {
257
- if strings .Contains (criteria , "+" ) || strings .Contains (criteria , "-" ) || strings .Contains (criteria , "%" ) || (! strings .Contains (criteria , "<" ) && ! strings .Contains (criteria , ">" )) {
258
- continue
259
- }
260
- criteriaString := strings .Replace (criteria , "=" , "" , - 1 )
261
- if strings .Contains (criteriaString , "<" ) {
262
- criteriaString = strings .Replace (criteriaString , "<" , ">" , - 1 )
263
- } else {
264
- criteriaString = strings .Replace (criteriaString , ">" , "<" , - 1 )
265
- }
266
-
267
- var newAlertingRule * alertingRule
268
- ruleName := objective .SLI
269
- newAlertingRule = getAlertingRuleOfGroup (alertingGroupConfig , ruleName )
270
- if newAlertingRule == nil {
271
- newAlertingRule = & alertingRule {
272
- Alert : ruleName ,
273
- }
274
- alertingGroupConfig .Rules = append (alertingGroupConfig .Rules , newAlertingRule )
275
- }
276
- newAlertingRule .Alert = ruleName
277
- newAlertingRule .Expr = expr + criteriaString
278
- newAlertingRule .For = "10m" // TODO: introduce alert duration concept in SLO?
279
- newAlertingRule .Labels = & alertingLabel {
280
- Severity : "webhook" ,
281
- PodName : eventData .Service + "-primary" ,
282
- Service : eventData .Service ,
283
- Project : eventData .Project ,
284
- Stage : stage .Name ,
285
- }
286
- newAlertingRule .Annotations = & alertingAnnotations {
287
- Summary : ruleName ,
288
- Description : "Pod name {{ $labels.pod_name }}" ,
289
- }
290
- }
291
- }
292
- }
231
+ if err != nil {
232
+ return fmt .Errorf ("error configuring prometheus alerts: %w" , err )
293
233
}
294
234
}
295
235
alertingRulesYAMLString , err := yaml .Marshal (alertingRulesConfig )
@@ -306,6 +246,97 @@ func (eh ConfigureMonitoringEventHandler) updatePrometheusConfigMap(eventData ke
306
246
return nil
307
247
}
308
248
249
+ func (eh ConfigureMonitoringEventHandler ) createPrometheusAlertsIfSLOsAndRemediationDefined (
250
+ eventData keptnevents.ConfigureMonitoringEventData , stage keptnv2.Stage , alertingRulesConfig alertingRules ,
251
+ ) (alertingRules , error ) {
252
+ // fetch SLOs for the given service and stage
253
+ slos , err := retrieveSLOs (eventData , stage .Name , eh .logger )
254
+ if err != nil || slos == nil {
255
+ eh .logger .Info ("No SLO file found for stage " + stage .Name + ". No alerting rules created for this stage" )
256
+ return alertingRulesConfig , nil
257
+ }
258
+
259
+ const remediationFileDefaultName = "remediation.yaml"
260
+ _ , err = eh .keptnHandler .ResourceHandler .GetServiceResource (eventData .Project , stage .Name , eventData .Service ,
261
+ remediationFileDefaultName )
262
+
263
+ if errors .Is (err , configutils .ResourceNotFoundError ) {
264
+ eh .logger .Infof ("No remediation defined for project %s stage %s, skipping setup of prometheus alerts" ,
265
+ eventData .Project , stage .Name )
266
+ return alertingRulesConfig , nil
267
+ }
268
+
269
+ if err != nil {
270
+ return alertingRulesConfig ,
271
+ fmt .Errorf ("error retrieving remediation definition %s for project %s and stage %s: %w" ,
272
+ remediationFileDefaultName , eventData .Project , stage .Name , err )
273
+ }
274
+
275
+ // Create or update alerting group
276
+ var alertingGroupConfig * alertingGroup
277
+ alertingGroupName := eventData .Service + " " + eventData .Project + "-" + stage .Name + " alerts"
278
+ alertingGroupConfig = getAlertingGroup (& alertingRulesConfig , alertingGroupName )
279
+ if alertingGroupConfig == nil {
280
+ alertingGroupConfig = & alertingGroup {
281
+ Name : alertingGroupName ,
282
+ }
283
+ alertingRulesConfig .Groups = append (alertingRulesConfig .Groups , alertingGroupConfig )
284
+ }
285
+
286
+ for _ , objective := range slos .Objectives {
287
+
288
+ expr , err := eh .getSLIQuery (eventData .Project , stage .Name , eventData .Service , objective .SLI , slos .Filter )
289
+ if err != nil || expr == "" {
290
+ eh .logger .Error ("No query defined for SLI " + objective .SLI + " in project " + eventData .Project )
291
+ continue
292
+ }
293
+
294
+ if objective .Pass != nil {
295
+ for _ , criteriaGroup := range objective .Pass {
296
+ for _ , criteria := range criteriaGroup .Criteria {
297
+ if strings .Contains (criteria , "+" ) || strings .Contains (criteria , "-" ) || strings .Contains (
298
+ criteria , "%" ,
299
+ ) || (! strings .Contains (criteria , "<" ) && ! strings .Contains (criteria , ">" )) {
300
+ continue
301
+ }
302
+ criteriaString := strings .Replace (criteria , "=" , "" , - 1 )
303
+ if strings .Contains (criteriaString , "<" ) {
304
+ criteriaString = strings .Replace (criteriaString , "<" , ">" , - 1 )
305
+ } else {
306
+ criteriaString = strings .Replace (criteriaString , ">" , "<" , - 1 )
307
+ }
308
+
309
+ var newAlertingRule * alertingRule
310
+ ruleName := objective .SLI
311
+ newAlertingRule = getAlertingRuleOfGroup (alertingGroupConfig , ruleName )
312
+ if newAlertingRule == nil {
313
+ newAlertingRule = & alertingRule {
314
+ Alert : ruleName ,
315
+ }
316
+ alertingGroupConfig .Rules = append (alertingGroupConfig .Rules , newAlertingRule )
317
+ }
318
+ newAlertingRule .Alert = ruleName
319
+ newAlertingRule .Expr = expr + criteriaString
320
+ newAlertingRule .For = "10m" // TODO: introduce alert duration concept in SLO?
321
+ newAlertingRule .Labels = & alertingLabel {
322
+ Severity : "webhook" ,
323
+ PodName : eventData .Service + "-primary" ,
324
+ Service : eventData .Service ,
325
+ Project : eventData .Project ,
326
+ Stage : stage .Name ,
327
+ }
328
+ newAlertingRule .Annotations = & alertingAnnotations {
329
+ Summary : ruleName ,
330
+ Description : "Pod name {{ $labels.pod_name }}" ,
331
+ }
332
+ }
333
+ }
334
+ }
335
+ }
336
+
337
+ return alertingRulesConfig , nil
338
+ }
339
+
309
340
func getDefaultFilterExpression (project string , stage string , service string , filters map [string ]string ) string {
310
341
filterExpression := "job='" + service + "-" + project + "-" + stage + "-primary'"
311
342
if filters != nil && len (filters ) > 0 {
0 commit comments