From b0f98f23a2d2f098d33c87c514ae2105e50b1e64 Mon Sep 17 00:00:00 2001 From: Julien Duchesne Date: Wed, 6 Nov 2024 22:37:24 -0500 Subject: [PATCH] `MimirRolloutStuck`: critical if over 24h The warning can go unnoticed --- .../metamonitoring/mixin-alerts.yaml | 48 ++++++ .../alerts.yaml | 48 ++++++ operations/mimir-mixin-compiled/alerts.yaml | 48 ++++++ .../mimir-mixin/alerts/alerts.libsonnet | 142 +++++++++--------- 4 files changed, 219 insertions(+), 67 deletions(-) diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index b45db52d7ef..44d259a2be9 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -373,6 +373,34 @@ spec: labels: severity: warning workload_type: statefulset + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + max without (revision) ( + sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + unless + sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + * + ( + sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + != + sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + ) and ( + changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 24h + labels: + severity: critical + workload_type: statefulset - alert: MimirRolloutStuck annotations: message: | @@ -393,6 +421,26 @@ spec: labels: severity: warning workload_type: deployment + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + != + sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + ) and ( + changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 24h + labels: + severity: critical + workload_type: deployment - alert: RolloutOperatorNotReconciling annotations: message: | diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index 4f87df5ba22..2a32b48f19b 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -361,6 +361,34 @@ groups: labels: severity: warning workload_type: statefulset + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + max without (revision) ( + sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + unless + sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + * + ( + sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + != + sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + ) and ( + changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 24h + labels: + severity: critical + workload_type: statefulset - alert: MimirRolloutStuck annotations: message: | @@ -381,6 +409,26 @@ groups: labels: severity: warning workload_type: deployment + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + != + sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + ) and ( + changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 24h + labels: + severity: critical + workload_type: deployment - alert: RolloutOperatorNotReconciling annotations: message: | diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index dc7cdd4e8eb..d46e0511018 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -361,6 +361,34 @@ groups: labels: severity: warning workload_type: statefulset + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + max without (revision) ( + sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + unless + sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + * + ( + sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + != + sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")) + ) + ) and ( + changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 24h + labels: + severity: critical + workload_type: statefulset - alert: MimirRolloutStuck annotations: message: | @@ -381,6 +409,26 @@ groups: labels: severity: warning workload_type: deployment + - alert: MimirRolloutStuck + annotations: + message: | + The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}. + runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck + expr: | + ( + sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + != + sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?")) + ) and ( + changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m]) + == + 0 + ) + * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) + for: 24h + labels: + severity: critical + workload_type: deployment - alert: RolloutOperatorNotReconciling annotations: message: | diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet index f66af7829a6..3206093dcee 100644 --- a/operations/mimir-mixin/alerts/alerts.libsonnet +++ b/operations/mimir-mixin/alerts/alerts.libsonnet @@ -543,78 +543,86 @@ local utils = import 'mixin-utils/utils.libsonnet'; ], }, { - name: 'mimir-rollout-alerts', - rules: [ - { - alert: $.alertName('RolloutStuck'), - expr: ||| - ( - max without (revision) ( - %(kube_statefulset_status_current_revision)s - unless - %(kube_statefulset_status_update_revision)s - ) - * - ( - %(kube_statefulset_replicas)s - != - %(kube_statefulset_status_replicas_updated)s - ) - ) and ( - changes(%(kube_statefulset_status_replicas_updated)s[%(range_interval)s]) - == - 0 + local statefulset_rollout_stuck(for_duration, severity) = { + alert: $.alertName('RolloutStuck'), + expr: ||| + ( + max without (revision) ( + %(kube_statefulset_status_current_revision)s + unless + %(kube_statefulset_status_update_revision)s ) - * on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info) - ||| % { - aggregation_labels: $._config.alert_aggregation_labels, - kube_statefulset_status_current_revision: groupStatefulSetByRolloutGroup('kube_statefulset_status_current_revision'), - kube_statefulset_status_update_revision: groupStatefulSetByRolloutGroup('kube_statefulset_status_update_revision'), - kube_statefulset_replicas: groupStatefulSetByRolloutGroup('kube_statefulset_replicas'), - kube_statefulset_status_replicas_updated: groupStatefulSetByRolloutGroup('kube_statefulset_status_replicas_updated'), - range_interval: '15m:' + $.alertRangeInterval(1), - }, - 'for': '30m', - labels: { - severity: 'warning', - workload_type: 'statefulset', - }, - annotations: { - message: ||| - The {{ $labels.rollout_group }} rollout is stuck in %(alert_aggregation_variables)s. - ||| % $._config, - }, - }, - { - alert: $.alertName('RolloutStuck'), - expr: ||| + * ( - %(kube_deployment_spec_replicas)s + %(kube_statefulset_replicas)s != - %(kube_deployment_status_replicas_updated)s - ) and ( - changes(%(kube_deployment_status_replicas_updated)s[%(range_interval)s]) - == - 0 + %(kube_statefulset_status_replicas_updated)s ) - * on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info) - ||| % { - aggregation_labels: $._config.alert_aggregation_labels, - kube_deployment_spec_replicas: groupDeploymentByRolloutGroup('kube_deployment_spec_replicas'), - kube_deployment_status_replicas_updated: groupDeploymentByRolloutGroup('kube_deployment_status_replicas_updated'), - range_interval: '15m:' + $.alertRangeInterval(1), - }, - 'for': '30m', - labels: { - severity: 'warning', - workload_type: 'deployment', - }, - annotations: { - message: ||| - The {{ $labels.rollout_group }} rollout is stuck in %(alert_aggregation_variables)s. - ||| % $._config, - }, + ) and ( + changes(%(kube_statefulset_status_replicas_updated)s[%(range_interval)s]) + == + 0 + ) + * on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info) + ||| % { + aggregation_labels: $._config.alert_aggregation_labels, + kube_statefulset_status_current_revision: groupStatefulSetByRolloutGroup('kube_statefulset_status_current_revision'), + kube_statefulset_status_update_revision: groupStatefulSetByRolloutGroup('kube_statefulset_status_update_revision'), + kube_statefulset_replicas: groupStatefulSetByRolloutGroup('kube_statefulset_replicas'), + kube_statefulset_status_replicas_updated: groupStatefulSetByRolloutGroup('kube_statefulset_status_replicas_updated'), + range_interval: '15m:' + $.alertRangeInterval(1), + }, + 'for': for_duration, + labels: { + severity: severity, + workload_type: 'statefulset', + }, + annotations: { + message: ||| + The {{ $labels.rollout_group }} rollout is stuck in %(alert_aggregation_variables)s. + ||| % $._config, }, + }, + + local deployment_rollout_stuck(for_duration, severity) = { + alert: $.alertName('RolloutStuck'), + expr: ||| + ( + %(kube_deployment_spec_replicas)s + != + %(kube_deployment_status_replicas_updated)s + ) and ( + changes(%(kube_deployment_status_replicas_updated)s[%(range_interval)s]) + == + 0 + ) + * on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info) + ||| % { + aggregation_labels: $._config.alert_aggregation_labels, + kube_deployment_spec_replicas: groupDeploymentByRolloutGroup('kube_deployment_spec_replicas'), + kube_deployment_status_replicas_updated: groupDeploymentByRolloutGroup('kube_deployment_status_replicas_updated'), + range_interval: '15m:' + $.alertRangeInterval(1), + }, + 'for': for_duration, + labels: { + severity: severity, + workload_type: 'deployment', + }, + annotations: { + message: ||| + The {{ $labels.rollout_group }} rollout is stuck in %(alert_aggregation_variables)s. + ||| % $._config, + }, + }, + + + name: 'mimir-rollout-alerts', + rules: [ + statefulset_rollout_stuck('30m', 'warning'), + statefulset_rollout_stuck('24h', 'critical'), + deployment_rollout_stuck('30m', 'warning'), + deployment_rollout_stuck('24h', 'critical'), + { alert: 'RolloutOperatorNotReconciling', expr: |||