diff --git a/controllers/libs/monitoring/monitoring.go b/controllers/libs/monitoring/monitoring.go
index 50d58184..eac74fc0 100644
--- a/controllers/libs/monitoring/monitoring.go
+++ b/controllers/libs/monitoring/monitoring.go
@@ -140,6 +140,14 @@ func MkPrometheusRuleGroup(name string, rules []monitoringv1.Rule) monitoringv1.
 	}
 }
 
+var CriticalSeverityLabel = map[string]string{
+	"severity": "critical",
+}
+
+var WarningSeverityLabel = map[string]string{
+	"severity": "warning",
+}
+
 func MkPrometheusAlertRule(name string, expr intstr.IntOrString, forDuration string, labels map[string]string, annotations map[string]string) monitoringv1.Rule {
 	f := monitoringv1.Duration(forDuration)
 	return monitoringv1.Rule{
diff --git a/controllers/logserver_controller.go b/controllers/logserver_controller.go
index 47b3ea5b..4550e00e 100644
--- a/controllers/logserver_controller.go
+++ b/controllers/logserver_controller.go
@@ -125,10 +125,6 @@ func (r *LogServerController) ensureLogserverPodMonitor() bool {
 
 // Create some default, interesting alerts
 func (r *LogServerController) ensureLogserverPromRule() bool {
-	diskFullLabels := map[string]string{
-		"lasttime": "{{ $value | humanizeTimestamp }}",
-		"severity": "critical",
-	}
 	diskFullAnnotations := map[string]string{
 		"description": "Log server only has {{ $value | humanize1024 }} free disk available.",
 		"summary":     "Log server out of disk",
@@ -144,7 +140,7 @@ func (r *LogServerController) ensureLogserverPromRule() bool {
 			" node_filesystem_size_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"} < 10) and "+
 			"(node_filesystem_avail_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"} < 20 * 1024 ^ 3)"),
 		"30m",
-		diskFullLabels,
+		sfmonitoring.CriticalSeverityLabel,
 		diskFullAnnotations,
 	)
 	diskFullIn3days := sfmonitoring.MkPrometheusAlertRule(
@@ -155,19 +151,25 @@ func (r *LogServerController) ensureLogserverPromRule() bool {
 			"(predict_linear(node_filesystem_avail_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"}[1d], 3 * 24 * 3600) < 0) and "+
 			"(node_filesystem_size_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"} <= 1e+11)"),
 		"12h",
-		map[string]string{},
+		sfmonitoring.WarningSeverityLabel,
 		diskFull3daysAnnotations,
 	)
 	lsDiskRuleGroup := sfmonitoring.MkPrometheusRuleGroup(
-		"disk.rules",
+		"disk_default.rules",
 		[]monitoringv1.Rule{diskFull, diskFullIn3days})
-	desiredLsPromRule := sfmonitoring.MkPrometheusRuleCR(logserverIdent+".rules", r.ns)
+	desiredLsPromRule := sfmonitoring.MkPrometheusRuleCR(logserverIdent+"-default.rules", r.ns)
 	desiredLsPromRule.Spec.Groups = append(desiredLsPromRule.Spec.Groups, lsDiskRuleGroup)
 
 	// add annotations so we can handle lifecycle
 	annotations := map[string]string{
-		"version": "1",
+		"version": "2",
 	}
+	// delete the badly named previous rule - TODO: remove this after the next release
+	badPromRule := monitoringv1.PrometheusRule{}
+	if r.GetM(logserverIdent+".rules", &badPromRule) {
+		r.DeleteR(&badPromRule)
+	}
+
 	desiredLsPromRule.ObjectMeta.Annotations = annotations
 	currentPromRule := monitoringv1.PrometheusRule{}
 	if !r.GetM(desiredLsPromRule.Name, &currentPromRule) {
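The monitoring.go hunk above truncates MkPrometheusAlertRule. Judging from the f := monitoringv1.Duration(forDuration) line and the prometheus-operator v1 Rule type, the full constructor presumably looks like the sketch below; the exact field set is an inference, not part of this change:

    // Sketch only - inferred from the hunk above and the prometheus-operator
    // v1 API (Rule.For is a *Duration in recent releases); not part of this diff.
    func MkPrometheusAlertRule(name string, expr intstr.IntOrString, forDuration string,
    	labels map[string]string, annotations map[string]string) monitoringv1.Rule {
    	f := monitoringv1.Duration(forDuration)
    	return monitoringv1.Rule{
    		Alert:       name,        // alert rule name, e.g. "DiskFull"
    		Expr:        expr,        // the PromQL expression
    		For:         &f,          // pending duration before the alert fires
    		Labels:      labels,      // e.g. CriticalSeverityLabel
    		Annotations: annotations, // description/summary templates
    	}
    }
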
diff --git a/controllers/nodepool.go b/controllers/nodepool.go
index 36c39c4f..a9e51cb5 100644
--- a/controllers/nodepool.go
+++ b/controllers/nodepool.go
@@ -18,6 +18,7 @@ import (
 	appsv1 "k8s.io/api/apps/v1"
 	apiv1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 )
 
 //go:embed static/nodepool/generate-config.sh
@@ -147,6 +148,106 @@ func (r *SFController) EnsureNodepoolPodMonitor() bool {
 	return true
 }
 
+// Create default alerts
+func (r *SFController) ensureNodepoolPromRule() bool {
+	/* Alert when more than 5% of node launches resulted in failure over the last hour on any provider */
+	highLaunchErrorRateAnnotations := map[string]string{
+		"description": "More than 5% ({{ $value }}%) of node launch events for provider {{ $labels.provider }} were failures in the last hour",
+		"summary":     "Too many nodes failing to launch on provider {{ $labels.provider }}",
+	}
+
+	highLaunchErrorRate := monitoring.MkPrometheusAlertRule(
+		"HighNodeLaunchErrorRate",
+		intstr.FromString(
+			"sum by (provider) (rate(nodepool_launch_provider_error{error=~'.*'}[1h]))"+
+				" / (sum by (provider) (rate(nodepool_launch_provider_ready[1h])) + "+
+				"sum by (provider) (rate(nodepool_launch_provider_error{error=~'.*'}[1h]))) * 100 > 5"),
+		"1h",
+		monitoring.CriticalSeverityLabel,
+		highLaunchErrorRateAnnotations,
+	)
+
+	/* Alert when a DIB image build fails */
+	dibImageBuildFailureAnnotations := map[string]string{
+		"summary":     "DIB failure: {{ $labels.diskimage }}",
+		"description": "DIB could not build image {{ $labels.diskimage }}, check the build logs",
+	}
+	dibImageBuildFailure := monitoring.MkPrometheusAlertRule(
+		"DIBImageBuildFailure",
+		intstr.FromString(
+			"nodepool_dib_image_build_status_rc != 0"),
+		"0m",
+		monitoring.WarningSeverityLabel,
+		dibImageBuildFailureAnnotations,
+	)
+
+	/* Alert when more than 5% of nodes have been in the "failed" state for more than 1h on any provider */
+	highFailedStateRateAnnotations := map[string]string{
+		"description": "More than 5% ({{ $value }}%) of nodes were in failed state in the last hour on provider {{ $labels.provider }}",
+		"summary":     "Too many failed nodes on provider {{ $labels.provider }}",
+	}
+
+	highFailedStateRate := monitoring.MkPrometheusAlertRule(
+		"HighFailedStateRate",
+		intstr.FromString(
+			"sum by (provider) (rate(nodepool_provider_nodes{state='failed'}[1h]))"+
+				" / sum by (provider) (rate(nodepool_provider_nodes{state=~'.*'}[1h])) * 100 > 5"),
+		"1h",
+		monitoring.CriticalSeverityLabel,
+		highFailedStateRateAnnotations,
+	)
+
+	/* Alert when more than 5% of OpenStack API calls return a 5xx status */
+	highOpenStackAPIError5xxRateAnnotations := map[string]string{
+		"description": "More than 5% ({{ $value }}%) of API calls to service {{ $labels.service }} / {{ $labels.method }} / {{ $labels.operation }} resulted in HTTP error code 5xx on provider {{ $labels.provider }}",
+		"summary":     "Too many OpenStack API errors on provider {{ $labels.provider }}",
+	}
+
+	highOpenStackAPIError5xxRate := monitoring.MkPrometheusAlertRule(
+		"HighOpenStackAPIError5xxRate",
+		intstr.FromString(
+			"sum by (provider, service, method, operation) (rate(nodepool_task_openstack{status=~'5..'}[15m]))"+
+				" / sum by (provider, service, method, operation) (rate(nodepool_task_openstack{status=~'.*'}[15m])) * 100 > 5"),
+		"15m",
+		monitoring.CriticalSeverityLabel,
+		highOpenStackAPIError5xxRateAnnotations,
+	)
+
+	launcherRuleGroup := monitoring.MkPrometheusRuleGroup(
+		"launcher_default.rules",
+		[]monitoringv1.Rule{
+			highLaunchErrorRate,
+			highFailedStateRate,
+			highOpenStackAPIError5xxRate,
+		})
+	builderRuleGroup := monitoring.MkPrometheusRuleGroup(
+		"builder_default.rules",
+		[]monitoringv1.Rule{
+			dibImageBuildFailure,
+		})
+	desiredNodepoolPromRule := monitoring.MkPrometheusRuleCR(nodepoolIdent+"-default.rules", r.ns)
+	desiredNodepoolPromRule.Spec.Groups = append(desiredNodepoolPromRule.Spec.Groups, launcherRuleGroup, builderRuleGroup)
+
+	annotations := map[string]string{
+		"version": "1",
+	}
+	desiredNodepoolPromRule.ObjectMeta.Annotations = annotations
+	currentPromRule := monitoringv1.PrometheusRule{}
+	if !r.GetM(desiredNodepoolPromRule.Name, &currentPromRule) {
+		r.CreateR(&desiredNodepoolPromRule)
+		return false
+	} else {
+		if !utils.MapEquals(&currentPromRule.ObjectMeta.Annotations, &annotations) {
+			r.log.V(1).Info("Nodepool default Prometheus rules changed, updating...")
+			currentPromRule.Spec = desiredNodepoolPromRule.Spec
+			currentPromRule.ObjectMeta.Annotations = annotations
+			r.UpdateR(&currentPromRule)
+			return false
+		}
+	}
+	return true
+}
+
 func (r *SFController) DeployNodepoolBuilder(statsdExporterVolume apiv1.Volume) bool {
 
 	r.EnsureSSHKeySecret("nodepool-builder-ssh-key")
@@ -462,8 +563,9 @@ func (r *SFController) DeployNodepool() map[string]bool {
 	})
 	statsdVolume := base.MkVolumeCM("statsd-config", "np-statsd-config-map")
 
-	// Ensure monitoring
+	// Ensure monitoring - TODO: add to the readiness condition
 	r.EnsureNodepoolPodMonitor()
+	r.ensureNodepoolPromRule()
 
 	deployments := make(map[string]bool)
 	deployments[LauncherIdent] = r.DeployNodepoolLauncher(statsdVolume)
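All four nodepool alerts route through MkPrometheusAlertRule, so the label plumbing is easy to pin down with a unit test. A minimal sketch, in a hypothetical controllers/libs/monitoring/monitoring_test.go, assuming the constructor fills Alert and Labels as sketched earlier:

    // Hypothetical test, not part of this diff.
    package monitoring

    import (
    	"testing"

    	"k8s.io/apimachinery/pkg/util/intstr"
    )

    func TestMkPrometheusAlertRuleSetsSeverity(t *testing.T) {
    	rule := MkPrometheusAlertRule(
    		"HighNodeLaunchErrorRate",
    		intstr.FromString("vector(1)"), // trivial expression, only for the test
    		"1h",
    		CriticalSeverityLabel,
    		map[string]string{"summary": "test"},
    	)
    	if rule.Labels["severity"] != "critical" {
    		t.Errorf("expected severity=critical, got %q", rule.Labels["severity"])
    	}
    	if rule.Alert != "HighNodeLaunchErrorRate" {
    		t.Errorf("unexpected alert name %q", rule.Alert)
    	}
    }
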
diff --git a/controllers/zuul.go b/controllers/zuul.go
index ce57e6a0..012af0b2 100644
--- a/controllers/zuul.go
+++ b/controllers/zuul.go
@@ -14,6 +14,7 @@ import (
 	appsv1 "k8s.io/api/apps/v1"
 	apiv1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/utils/pointer"
 
 	monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
@@ -396,6 +397,49 @@ func (r *SFController) EnsureZuulPodMonitor() bool {
 	return true
 }
 
+// Create default alerts
+func (r *SFController) ensureZuulPromRule() bool {
+	configUpdateFailureInPostAnnotations := map[string]string{
+		"description": "A config-update job failed in the post pipeline. The latest changes might not have been applied. Please check the services' configuration",
+		"summary":     "config-update failure post merge",
+	}
+
+	configUpdateFailureInPost := monitoring.MkPrometheusAlertRule(
+		"ConfigUpdateFailureInPostPipeline",
+		intstr.FromString(
+			"increase(zuul_tenant_pipeline_project_job_count"+
+				"{jobname=\"config-update\",tenant=\"internal\",pipeline=\"post\",result!~\"SUCCESS|wait_time\"}[1m]) > 0"),
+		"0m",
+		monitoring.CriticalSeverityLabel,
+		configUpdateFailureInPostAnnotations,
+	)
+
+	configRepoRuleGroup := monitoring.MkPrometheusRuleGroup(
+		"config-repository_default.rules",
+		[]monitoringv1.Rule{configUpdateFailureInPost})
+	desiredZuulPromRule := monitoring.MkPrometheusRuleCR("zuul-default.rules", r.ns)
+	desiredZuulPromRule.Spec.Groups = append(desiredZuulPromRule.Spec.Groups, configRepoRuleGroup)
+
+	annotations := map[string]string{
+		"version": "1",
+	}
+	desiredZuulPromRule.ObjectMeta.Annotations = annotations
+	currentPromRule := monitoringv1.PrometheusRule{}
+	if !r.GetM(desiredZuulPromRule.Name, &currentPromRule) {
+		r.CreateR(&desiredZuulPromRule)
+		return false
+	} else {
+		if !utils.MapEquals(&currentPromRule.ObjectMeta.Annotations, &annotations) {
+			r.log.V(1).Info("Zuul default Prometheus rules changed, updating...")
+			currentPromRule.Spec = desiredZuulPromRule.Spec
+			currentPromRule.ObjectMeta.Annotations = annotations
+			r.UpdateR(&currentPromRule)
+			return false
+		}
+	}
+	return true
+}
+
 func (r *SFController) EnsureZuulConfigSecret(cfg *ini.File) {
 	r.EnsureSecret(&apiv1.Secret{
 		Data: map[string][]byte{
@@ -587,6 +631,8 @@ func (r *SFController) DeployZuul() bool {
 	r.EnsureZuulComponentsFrontServices()
 	// We could condition readiness to the state of the PodMonitor, but we don't.
 	r.EnsureZuulPodMonitor()
+	r.ensureZuulPromRule()
+
 	return r.EnsureZuulComponents(initContainers, cfgINI) && r.setupZuulIngress()
 }
 
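ensureLogserverPromRule, ensureNodepoolPromRule, and ensureZuulPromRule now repeat the same create-or-update sequence keyed on the "version" annotation. A sketch of how it could be factored out; ensureVersionedPromRule is a hypothetical name, GetM/CreateR/UpdateR and utils.MapEquals are assumed to behave exactly as in the hunks above, and the logserver controller has a different receiver type, so it would need its own copy or a shared interface:

    // Hypothetical shared helper capturing the pattern repeated above: create
    // the PrometheusRule when absent, refresh Spec and annotations when the
    // "version" annotation changed, and return true once it is up to date.
    func (r *SFController) ensureVersionedPromRule(desired *monitoringv1.PrometheusRule, annotations map[string]string) bool {
    	desired.ObjectMeta.Annotations = annotations
    	current := monitoringv1.PrometheusRule{}
    	if !r.GetM(desired.Name, &current) {
    		r.CreateR(desired)
    		return false
    	}
    	if !utils.MapEquals(&current.ObjectMeta.Annotations, &annotations) {
    		r.log.V(1).Info("Default Prometheus rules changed, updating...", "name", desired.Name)
    		current.Spec = desired.Spec
    		current.ObjectMeta.Annotations = annotations
    		r.UpdateR(&current)
    		return false
    	}
    	return true
    }
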
diff --git a/roles/health-check/test-monitoring/tasks/main.yaml b/roles/health-check/test-monitoring/tasks/main.yaml
index 1f088d07..401c2056 100644
--- a/roles/health-check/test-monitoring/tasks/main.yaml
+++ b/roles/health-check/test-monitoring/tasks/main.yaml
@@ -23,7 +23,7 @@
     - "logserver-nodeexporter"
 
 - name: Fetch defined alerts for logserver
-  ansible.builtin.shell: curl -k https://{{ prometheus_host }}/api/v1/rules | jq '.data.groups[] | select(.name == "disk.rules") | .rules[] | select(.name == "{{ item }}") | .health'
+  ansible.builtin.shell: curl -k https://{{ prometheus_host }}/api/v1/rules | jq '.data.groups[] | select(.name == "disk_default.rules") | .rules[] | select(.name == "{{ item }}") | .health'
  register: logserver_alert
  failed_when: "\"ok\" not in logserver_alert.stdout"
  loop:
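For local debugging, the same check the Ansible task performs can be run outside the test suite. A standalone sketch: prometheus.example.com stands in for {{ prometheus_host }}, the struct mirrors the .data.groups[].rules[] path the jq filter walks, and InsecureSkipVerify matches curl -k:

    // Hypothetical debugging tool, not part of this diff.
    package main

    import (
    	"crypto/tls"
    	"encoding/json"
    	"fmt"
    	"net/http"
    )

    type rulesResponse struct {
    	Data struct {
    		Groups []struct {
    			Name  string `json:"name"`
    			Rules []struct {
    				Name   string `json:"name"`
    				Health string `json:"health"`
    			} `json:"rules"`
    		} `json:"groups"`
    	} `json:"data"`
    }

    func main() {
    	prometheusHost := "prometheus.example.com" // placeholder for {{ prometheus_host }}
    	client := &http.Client{Transport: &http.Transport{
    		// equivalent of curl -k for the self-signed test certificate
    		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
    	}}
    	resp, err := client.Get("https://" + prometheusHost + "/api/v1/rules")
    	if err != nil {
    		panic(err)
    	}
    	defer resp.Body.Close()
    	var rules rulesResponse
    	if err := json.NewDecoder(resp.Body).Decode(&rules); err != nil {
    		panic(err)
    	}
    	// Print the health of every rule in the renamed logserver group.
    	for _, group := range rules.Data.Groups {
    		if group.Name != "disk_default.rules" {
    			continue
    		}
    		for _, rule := range group.Rules {
    			fmt.Printf("%s: %s\n", rule.Name, rule.Health)
    		}
    	}
    }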