Prometheus: Add basic alerts for nodepool, config-repo
* HighNodeLaunchErrorRate: fires when the rate of node launch errors,
  relative to node launch events and per provider, exceeds 5% for one
  hour.
* DIBImageBuildFailure: fires when a DIB build fails for any image.
* HighFailedStateRate: fires when the rate of nodes in failed state
  on any provider exceeds 5% for one hour.
* HighOpenStackAPIError5xxRate: fires when more than 5% of OpenStack
  API calls returned a 5xx error in the last 15 minutes.
* ConfigUpdateFailureInPostPipeline: fires when a "config-update"
  job fails in the post pipeline of the internal tenant, meaning the
  latest config changes might not have been applied.

Also refactor the severity labels into shared definitions, and rename
the rule groups and PrometheusRule resources to follow the same
pattern ("..._default").

Change-Id: I608357e587ed4bdac69fe6f515c4c1d55549712f
mhuin committed Oct 10, 2023
1 parent 1b21b04 commit 7b55a7d
Showing 5 changed files with 169 additions and 11 deletions.
8 changes: 8 additions & 0 deletions controllers/libs/monitoring/monitoring.go
@@ -140,6 +140,14 @@ func MkPrometheusRuleGroup(name string, rules []monitoringv1.Rule) monitoringv1.
}
}

var CriticalSeverityLabel = map[string]string{
"severity": "critical",
}

var WarningSeverityLabel = map[string]string{
"severity": "warning",
}

func MkPrometheusAlertRule(name string, expr intstr.IntOrString, forDuration string, labels map[string]string, annotations map[string]string) monitoringv1.Rule {
f := monitoringv1.Duration(forDuration)
return monitoringv1.Rule{
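The rest of the helper's body is collapsed in this hunk. Based on the signature and the shared severity labels above, a minimal sketch of what such a helper presumably returns (field names as in the prometheus-operator monitoringv1 API, and assuming the usual monitoringv1/intstr imports) could look like this:

    func MkPrometheusAlertRule(name string, expr intstr.IntOrString, forDuration string, labels map[string]string, annotations map[string]string) monitoringv1.Rule {
        f := monitoringv1.Duration(forDuration)
        return monitoringv1.Rule{
            Alert:       name,
            Expr:        expr,
            For:         &f,
            Labels:      labels,      // e.g. CriticalSeverityLabel or WarningSeverityLabel
            Annotations: annotations, // "summary" and "description" shown with the alert
        }
    }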
20 changes: 11 additions & 9 deletions controllers/logserver_controller.go
@@ -125,10 +125,6 @@ func (r *LogServerController) ensureLogserverPodMonitor() bool {

// Create some default, interesting alerts
func (r *LogServerController) ensureLogserverPromRule() bool {
- diskFullLabels := map[string]string{
- "lasttime": "{{ $value | humanizeTimestamp }}",
- "severity": "critical",
- }
diskFullAnnotations := map[string]string{
"description": "Log server only has {{ $value | humanize1024 }} free disk available.",
"summary": "Log server out of disk",
@@ -144,7 +140,7 @@ func (r *LogServerController) ensureLogserverPromRule() bool {
" node_filesystem_size_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"} < 10) and "+
"(node_filesystem_avail_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"} < 20 * 1024 ^ 3)"),
"30m",
- diskFullLabels,
+ sfmonitoring.CriticalSeverityLabel,
diskFullAnnotations,
)
diskFullIn3days := sfmonitoring.MkPrometheusAlertRule(
@@ -155,19 +151,25 @@ func (r *LogServerController) ensureLogserverPromRule() bool {
"(predict_linear(node_filesystem_avail_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"}[1d], 3 * 24 * 3600) < 0) and "+
"(node_filesystem_size_bytes{job=\""+r.ns+"/"+logserverIdent+"-monitor\"} <= 1e+11)"),
"12h",
- map[string]string{},
+ sfmonitoring.WarningSeverityLabel,
diskFull3daysAnnotations,
)
lsDiskRuleGroup := sfmonitoring.MkPrometheusRuleGroup(
"disk.rules",
"disk_default.rules",
[]monitoringv1.Rule{diskFull, diskFullIn3days})
desiredLsPromRule := sfmonitoring.MkPrometheusRuleCR(logserverIdent+".rules", r.ns)
desiredLsPromRule := sfmonitoring.MkPrometheusRuleCR(logserverIdent+"-default.rules", r.ns)
desiredLsPromRule.Spec.Groups = append(desiredLsPromRule.Spec.Groups, lsDiskRuleGroup)

// add annotations so we can handle lifecycle
annotations := map[string]string{
"version": "1",
"version": "2",
}
// delete badly named, previous rule - TODO remove this after next release
badPromRule := monitoringv1.PrometheusRule{}
if r.GetM(logserverIdent+".rules", &badPromRule) {
r.DeleteR(&badPromRule)
}

desiredLsPromRule.ObjectMeta.Annotations = annotations
currentPromRule := monitoringv1.PrometheusRule{}
if !r.GetM(desiredLsPromRule.Name, &currentPromRule) {
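The "version" annotation drives the update lifecycle of these PrometheusRule CRs: it is bumped (here from "1" to "2") whenever the desired rules change, and the reconcile loop then creates or updates the resource. A condensed sketch of the pattern, using only helpers visible in this diff:

    desired := sfmonitoring.MkPrometheusRuleCR(logserverIdent+"-default.rules", r.ns)
    desired.Spec.Groups = append(desired.Spec.Groups, lsDiskRuleGroup)
    desired.ObjectMeta.Annotations = map[string]string{"version": "2"}

    current := monitoringv1.PrometheusRule{}
    if !r.GetM(desired.Name, &current) {
        r.CreateR(&desired) // rule CR does not exist yet
    } else if !utils.MapEquals(&current.ObjectMeta.Annotations, &desired.ObjectMeta.Annotations) {
        // version annotation changed: push the new spec
        current.Spec = desired.Spec
        current.ObjectMeta.Annotations = desired.ObjectMeta.Annotations
        r.UpdateR(&current)
    }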
104 changes: 103 additions & 1 deletion controllers/nodepool.go
@@ -18,6 +18,7 @@ import (
appsv1 "k8s.io/api/apps/v1"
apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

//go:embed static/nodepool/generate-config.sh
@@ -147,6 +148,106 @@ func (r *SFController) EnsureNodepoolPodMonitor() bool {
return true
}

// create default alerts
func (r *SFController) ensureNodepoolPromRule() bool {
/* Alert when more than 5% of node launches resulted in failure in the last hour with any provider */
highLaunchErrorRateAnnotations := map[string]string{
"description": "More than 5% ({{ $value }}%) of node launch events for provider {{ $labels.provider }} were failures in the last hour",
"summary": "Too many nodes failing to launch on provider {{ $labels.provider }}",
}

highLaunchErrorRate := monitoring.MkPrometheusAlertRule(
"HighNodeLaunchErrorRate",
intstr.FromString(
"sum(rate(nodepool_launch_provider_error{error=~'.*'}[1h]))"+
" / (sum(rate(nodepool_launch_provider_ready[1h])) + "+
"sum(rate(nodepool_launch_provider_error{error=~'.*'}[1h]))) * 100 > 5"),
"1h",
monitoring.CriticalSeverityLabel,
highLaunchErrorRateAnnotations,
)

/* Alert when a DIB build failed */
dibImageBuildFailureAnnotations := map[string]string{
"summary": "DIB failure: {{ $labels.diskimage }}",
"description": "DIB could not build image {{ $labels.diskimage }}, check build logs",
}
dibImageBuildFailure := monitoring.MkPrometheusAlertRule(
"DIBImageBuildFailure",
intstr.FromString(
"nodepool_dib_image_build_status_rc != 0"),
"0m",
monitoring.WarningSeverityLabel,
dibImageBuildFailureAnnotations,
)

/* Alert when more than 5% of nodes are in "failed" state for more than 1h with any provider */
highFailedStateRateAnnotations := map[string]string{
"description": "More than 5% ({{ $value }}%) of nodes were in failed state in the last hour on provider {{ $labels.provider }}",
"summary": "Too many failed nodes on provider {{ $labels.provider }}",
}

highFailedStateRate := monitoring.MkPrometheusAlertRule(
"HighFailedStateRate",
intstr.FromString(
"sum(rate(nodepool_provider_nodes{state='failed'}[1h]))"+
" / sum(rate(nodepool_launch_provider_error{state=~'.*'}[1h])) * 100 > 5"),
"1h",
monitoring.CriticalSeverityLabel,
highFailedStateRateAnnotations,
)

/* Alert when more than 5% of OpenStack API calls return with status 5xx */
highOpenStackAPIError5xxRateAnnotations := map[string]string{
"description": "More than 5% ({{ $value }}%) of API calls to service {{ $labels.service }} / {{ $labels.method }} / {{ $labels.operation }} resulted in HTTP error code 5xx on provider {{ $labels.provider }}",
"summary": "Too many OpenStack API errors on provider {{ $labels.provider }}",
}

highOpenStackAPIError5xxRate := monitoring.MkPrometheusAlertRule(
"HighOpenStackAPIError5xxRate",
intstr.FromString(
"sum(rate(nodepool_task_openstack{status=~'5..'}[15m]))"+
" / sum(rate(nodepool_task_openstack{status=~'.*'}[15m])) * 100 > 5"),
"15m",
monitoring.CriticalSeverityLabel,
highOpenStackAPIError5xxRateAnnotations,
)

launcherRuleGroup := monitoring.MkPrometheusRuleGroup(
"launcher_default.rules",
[]monitoringv1.Rule{
highLaunchErrorRate,
highFailedStateRate,
highOpenStackAPIError5xxRate,
})
builderRuleGroup := monitoring.MkPrometheusRuleGroup(
"builder_default.rules",
[]monitoringv1.Rule{
dibImageBuildFailure,
})
desiredNodepoolPromRule := monitoring.MkPrometheusRuleCR(nodepoolIdent+"-default.rules", r.ns)
desiredNodepoolPromRule.Spec.Groups = append(desiredNodepoolPromRule.Spec.Groups, launcherRuleGroup, builderRuleGroup)

annotations := map[string]string{
"version": "1",
}
desiredNodepoolPromRule.ObjectMeta.Annotations = annotations
currentPromRule := monitoringv1.PrometheusRule{}
if !r.GetM(desiredNodepoolPromRule.Name, &currentPromRule) {
r.CreateR(&desiredNodepoolPromRule)
return false
} else {
if !utils.MapEquals(&currentPromRule.ObjectMeta.Annotations, &annotations) {
r.log.V(1).Info("Nodepool default Prometheus rules changed, updating...")
currentPromRule.Spec = desiredNodepoolPromRule.Spec
currentPromRule.ObjectMeta.Annotations = annotations
r.UpdateR(&currentPromRule)
return false
}
}
return true
}

func (r *SFController) DeployNodepoolBuilder(statsdExporterVolume apiv1.Volume) bool {

r.EnsureSSHKeySecret("nodepool-builder-ssh-key")
@@ -462,8 +563,9 @@ func (r *SFController) DeployNodepool() map[string]bool {
})
statsdVolume := base.MkVolumeCM("statsd-config", "np-statsd-config-map")

- // Ensure monitoring
+ // Ensure monitoring - TODO add to condition
r.EnsureNodepoolPodMonitor()
r.ensureNodepoolPromRule()

deployments := make(map[string]bool)
deployments[LauncherIdent] = r.DeployNodepoolLauncher(statsdVolume)
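As a worked example of the HighNodeLaunchErrorRate expression above, with made-up numbers: if a provider averaged 0.08 failed launches per second and 1.2 ready launches per second over the last hour, the ratio evaluates to 6.25%, above the 5% threshold, and the alert fires once the condition has held for the 1h "for" duration.

    package main

    import "fmt"

    func main() {
        // Hypothetical figures, for illustration only.
        errors, ready := 0.08, 1.2
        fmt.Printf("%.2f%%\n", errors/(ready+errors)*100) // prints 6.25%, above the 5% threshold
    }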
46 changes: 46 additions & 0 deletions controllers/zuul.go
@@ -14,6 +14,7 @@ import (
appsv1 "k8s.io/api/apps/v1"
apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/utils/pointer"

monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
@@ -396,6 +397,49 @@ func (r *SFController) EnsureZuulPodMonitor() bool {
return true
}

// create default alerts
func (r *SFController) ensureZuulPromRule() bool {
configUpdateFailureInPostAnnotations := map[string]string{
"description": "A config-update job failed in the post pipeline. Latest changes might not have been applied. Please check services configurations",
"summary": "config-update failure post merge",
}

configUpdateFailureInPost := monitoring.MkPrometheusAlertRule(
"ConfigUpdateFailureInPostPipeline",
intstr.FromString(
"increase(zuul_tenant_pipeline_project_job_count"+
"{jobname=\"config-update\",tenant=\"internal\",pipeline=\"post\",result!~\"SUCCESS|wait_time\"}[1m]) > 0"),
"0m",
monitoring.CriticalSeverityLabel,
configUpdateFailureInPostAnnotations,
)

configRepoRuleGroup := monitoring.MkPrometheusRuleGroup(
"config-repository_default.rules",
[]monitoringv1.Rule{configUpdateFailureInPost})
desiredZuulPromRule := monitoring.MkPrometheusRuleCR("zuul-default.rules", r.ns)
desiredZuulPromRule.Spec.Groups = append(desiredZuulPromRule.Spec.Groups, configRepoRuleGroup)

annotations := map[string]string{
"version": "1",
}
desiredZuulPromRule.ObjectMeta.Annotations = annotations
currentPromRule := monitoringv1.PrometheusRule{}
if !r.GetM(desiredZuulPromRule.Name, &currentPromRule) {
r.CreateR(&desiredZuulPromRule)
return false
} else {
if !utils.MapEquals(&currentPromRule.ObjectMeta.Annotations, &annotations) {
r.log.V(1).Info("Zuul default Prometheus rules changed, updating...")
currentPromRule.Spec = desiredZuulPromRule.Spec
currentPromRule.ObjectMeta.Annotations = annotations
r.UpdateR(&currentPromRule)
return false
}
}
return true
}

func (r *SFController) EnsureZuulConfigSecret(cfg *ini.File) {
r.EnsureSecret(&apiv1.Secret{
Data: map[string][]byte{
@@ -587,6 +631,8 @@ func (r *SFController) DeployZuul() bool {
r.EnsureZuulComponentsFrontServices()
// We could condition readiness to the state of the PodMonitor, but we don't.
r.EnsureZuulPodMonitor()
r.ensureZuulPromRule()

return r.EnsureZuulComponents(initContainers, cfgINI) && r.setupZuulIngress()
}

2 changes: 1 addition & 1 deletion roles/health-check/test-monitoring/tasks/main.yaml
@@ -23,7 +23,7 @@
- "logserver-nodeexporter"

- name: Fetch defined alerts for logserver
- ansible.builtin.shell: curl -k https://{{ prometheus_host }}/api/v1/rules | jq '.data.groups[] | select(.name == "disk.rules") | .rules[] | select(.name == "{{ item }}") | .health'
+ ansible.builtin.shell: curl -k https://{{ prometheus_host }}/api/v1/rules | jq '.data.groups[] | select(.name == "disk_default.rules") | .rules[] | select(.name == "{{ item }}") | .health'
register: logserver_alert
failed_when: "\"ok\" not in logserver_alert.stdout"
loop:
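The health-check above only covers the renamed logserver group; an analogous check for the new nodepool and zuul groups could query the same Prometheus rules API. A minimal, hypothetical sketch in Go (the Prometheus host is an assumption for illustration; the group names come from this commit):

    package main

    import (
        "crypto/tls"
        "encoding/json"
        "fmt"
        "net/http"
    )

    func main() {
        // Same endpoint the health-check task queries with curl -k | jq;
        // skipping TLS verification mirrors the -k flag.
        client := &http.Client{Transport: &http.Transport{
            TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
        }}
        resp, err := client.Get("https://prometheus.example.com/api/v1/rules")
        if err != nil {
            panic(err)
        }
        defer resp.Body.Close()

        var payload struct {
            Data struct {
                Groups []struct {
                    Name  string `json:"name"`
                    Rules []struct {
                        Name   string `json:"name"`
                        Health string `json:"health"`
                    } `json:"rules"`
                } `json:"groups"`
            } `json:"data"`
        }
        if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil {
            panic(err)
        }

        wanted := map[string]bool{
            "launcher_default.rules":          true,
            "builder_default.rules":           true,
            "config-repository_default.rules": true,
        }
        for _, g := range payload.Data.Groups {
            if wanted[g.Name] {
                for _, r := range g.Rules {
                    // Expect "ok", as in the existing jq-based assertion.
                    fmt.Printf("%s / %s: %s\n", g.Name, r.Name, r.Health)
                }
            }
        }
    }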
