Skip to content
This repository was archived by the owner on Jan 19, 2024. It is now read-only.

Commit 05dae73

Browse files
authored
feat: only add prometheus alerts if remediation.yaml is defined for the stage (#253) (#255)
Alerts are useful only to trigger remediations when SLOs are violated. If no remediation is defined in a stage, there is no need to clutter the Prometheus configuration with alerts that will not trigger any recovery action. Signed-off-by: Paolo Chila <[email protected]>
1 parent 1202556 commit 05dae73

File tree

1 file changed

+95
-64
lines changed

1 file changed

+95
-64
lines changed

eventhandling/configureEvent.go

Lines changed: 95 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -225,71 +225,11 @@ func (eh ConfigureMonitoringEventHandler) updatePrometheusConfigMap(eventData ke
225225
// <service>.<project>-<stage>
226226
createScrapeJobConfig(scrapeConfig, config, eventData.Project, stage.Name, eventData.Service, false, false)
227227

228-
// fetch SLOs for the given service and stage
229-
slos, err := retrieveSLOs(eventData, stage.Name, eh.logger)
230-
if err != nil || slos == nil {
231-
eh.logger.Info("No SLO file found for stage " + stage.Name + ". No alerting rules created for this stage")
232-
continue
233-
}
234-
235-
// Create or update alerting group
236-
var alertingGroupConfig *alertingGroup
237-
alertingGroupName := eventData.Service + " " + eventData.Project + "-" + stage.Name + " alerts"
238-
alertingGroupConfig = getAlertingGroup(&alertingRulesConfig, alertingGroupName)
239-
if alertingGroupConfig == nil {
240-
alertingGroupConfig = &alertingGroup{
241-
Name: alertingGroupName,
242-
}
243-
alertingRulesConfig.Groups = append(alertingRulesConfig.Groups, alertingGroupConfig)
244-
}
245-
246-
for _, objective := range slos.Objectives {
228+
alertingRulesConfig, err = eh.createPrometheusAlertsIfSLOsAndRemediationDefined(eventData, stage,
229+
alertingRulesConfig)
247230

248-
expr, err := eh.getSLIQuery(eventData.Project, stage.Name, eventData.Service, objective.SLI, slos.Filter)
249-
if err != nil || expr == "" {
250-
eh.logger.Error("No query defined for SLI " + objective.SLI + " in project " + eventData.Project)
251-
continue
252-
}
253-
254-
if objective.Pass != nil {
255-
for _, criteriaGroup := range objective.Pass {
256-
for _, criteria := range criteriaGroup.Criteria {
257-
if strings.Contains(criteria, "+") || strings.Contains(criteria, "-") || strings.Contains(criteria, "%") || (!strings.Contains(criteria, "<") && !strings.Contains(criteria, ">")) {
258-
continue
259-
}
260-
criteriaString := strings.Replace(criteria, "=", "", -1)
261-
if strings.Contains(criteriaString, "<") {
262-
criteriaString = strings.Replace(criteriaString, "<", ">", -1)
263-
} else {
264-
criteriaString = strings.Replace(criteriaString, ">", "<", -1)
265-
}
266-
267-
var newAlertingRule *alertingRule
268-
ruleName := objective.SLI
269-
newAlertingRule = getAlertingRuleOfGroup(alertingGroupConfig, ruleName)
270-
if newAlertingRule == nil {
271-
newAlertingRule = &alertingRule{
272-
Alert: ruleName,
273-
}
274-
alertingGroupConfig.Rules = append(alertingGroupConfig.Rules, newAlertingRule)
275-
}
276-
newAlertingRule.Alert = ruleName
277-
newAlertingRule.Expr = expr + criteriaString
278-
newAlertingRule.For = "10m" // TODO: introduce alert duration concept in SLO?
279-
newAlertingRule.Labels = &alertingLabel{
280-
Severity: "webhook",
281-
PodName: eventData.Service + "-primary",
282-
Service: eventData.Service,
283-
Project: eventData.Project,
284-
Stage: stage.Name,
285-
}
286-
newAlertingRule.Annotations = &alertingAnnotations{
287-
Summary: ruleName,
288-
Description: "Pod name {{ $labels.pod_name }}",
289-
}
290-
}
291-
}
292-
}
231+
if err != nil {
232+
return fmt.Errorf("error configuring prometheus alerts: %w", err)
293233
}
294234
}
295235
alertingRulesYAMLString, err := yaml.Marshal(alertingRulesConfig)
@@ -306,6 +246,97 @@ func (eh ConfigureMonitoringEventHandler) updatePrometheusConfigMap(eventData ke
306246
return nil
307247
}
308248

249+
func (eh ConfigureMonitoringEventHandler) createPrometheusAlertsIfSLOsAndRemediationDefined(
250+
eventData keptnevents.ConfigureMonitoringEventData, stage keptnv2.Stage, alertingRulesConfig alertingRules,
251+
) (alertingRules, error) {
252+
// fetch SLOs for the given service and stage
253+
slos, err := retrieveSLOs(eventData, stage.Name, eh.logger)
254+
if err != nil || slos == nil {
255+
eh.logger.Info("No SLO file found for stage " + stage.Name + ". No alerting rules created for this stage")
256+
return alertingRulesConfig, nil
257+
}
258+
259+
const remediationFileDefaultName = "remediation.yaml"
260+
_, err = eh.keptnHandler.ResourceHandler.GetServiceResource(eventData.Project, stage.Name, eventData.Service,
261+
remediationFileDefaultName)
262+
263+
if errors.Is(err, configutils.ResourceNotFoundError) {
264+
eh.logger.Infof("No remediation defined for project %s stage %s, skipping setup of prometheus alerts",
265+
eventData.Project, stage.Name)
266+
return alertingRulesConfig, nil
267+
}
268+
269+
if err != nil {
270+
return alertingRulesConfig,
271+
fmt.Errorf("error retrieving remediation definition %s for project %s and stage %s: %w",
272+
remediationFileDefaultName, eventData.Project, stage.Name, err)
273+
}
274+
275+
// Create or update alerting group
276+
var alertingGroupConfig *alertingGroup
277+
alertingGroupName := eventData.Service + " " + eventData.Project + "-" + stage.Name + " alerts"
278+
alertingGroupConfig = getAlertingGroup(&alertingRulesConfig, alertingGroupName)
279+
if alertingGroupConfig == nil {
280+
alertingGroupConfig = &alertingGroup{
281+
Name: alertingGroupName,
282+
}
283+
alertingRulesConfig.Groups = append(alertingRulesConfig.Groups, alertingGroupConfig)
284+
}
285+
286+
for _, objective := range slos.Objectives {
287+
288+
expr, err := eh.getSLIQuery(eventData.Project, stage.Name, eventData.Service, objective.SLI, slos.Filter)
289+
if err != nil || expr == "" {
290+
eh.logger.Error("No query defined for SLI " + objective.SLI + " in project " + eventData.Project)
291+
continue
292+
}
293+
294+
if objective.Pass != nil {
295+
for _, criteriaGroup := range objective.Pass {
296+
for _, criteria := range criteriaGroup.Criteria {
297+
if strings.Contains(criteria, "+") || strings.Contains(criteria, "-") || strings.Contains(
298+
criteria, "%",
299+
) || (!strings.Contains(criteria, "<") && !strings.Contains(criteria, ">")) {
300+
continue
301+
}
302+
criteriaString := strings.Replace(criteria, "=", "", -1)
303+
if strings.Contains(criteriaString, "<") {
304+
criteriaString = strings.Replace(criteriaString, "<", ">", -1)
305+
} else {
306+
criteriaString = strings.Replace(criteriaString, ">", "<", -1)
307+
}
308+
309+
var newAlertingRule *alertingRule
310+
ruleName := objective.SLI
311+
newAlertingRule = getAlertingRuleOfGroup(alertingGroupConfig, ruleName)
312+
if newAlertingRule == nil {
313+
newAlertingRule = &alertingRule{
314+
Alert: ruleName,
315+
}
316+
alertingGroupConfig.Rules = append(alertingGroupConfig.Rules, newAlertingRule)
317+
}
318+
newAlertingRule.Alert = ruleName
319+
newAlertingRule.Expr = expr + criteriaString
320+
newAlertingRule.For = "10m" // TODO: introduce alert duration concept in SLO?
321+
newAlertingRule.Labels = &alertingLabel{
322+
Severity: "webhook",
323+
PodName: eventData.Service + "-primary",
324+
Service: eventData.Service,
325+
Project: eventData.Project,
326+
Stage: stage.Name,
327+
}
328+
newAlertingRule.Annotations = &alertingAnnotations{
329+
Summary: ruleName,
330+
Description: "Pod name {{ $labels.pod_name }}",
331+
}
332+
}
333+
}
334+
}
335+
}
336+
337+
return alertingRulesConfig, nil
338+
}
339+
309340
func getDefaultFilterExpression(project string, stage string, service string, filters map[string]string) string {
310341
filterExpression := "job='" + service + "-" + project + "-" + stage + "-primary'"
311342
if filters != nil && len(filters) > 0 {

0 commit comments

Comments
 (0)