Skip to content

Commit

Permalink
MimirRolloutStuck: critical if over 24h
Browse files Browse the repository at this point in the history
The warning can go unnoticed
  • Loading branch information
julienduchesne committed Nov 7, 2024
1 parent 6bf0b93 commit b0f98f2
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 67 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,34 @@ spec:
labels:
severity: warning
workload_type: statefulset
# Critical-severity MimirRolloutStuck rule for StatefulSets (workload_type: statefulset).
# Series are grouped per rollout_group: the StatefulSet name with an optional
# "-zone-<lowercase letter>" suffix stripped by label_replace.
# The expression matches a rollout group when all of the following hold:
#   - a current revision has no identically-labelled update revision,
#   - the summed desired replicas differ from the summed updated replicas,
#   - the summed updated replicas did not change across the 15m subquery (1m resolution).
# The result is joined on (cluster, namespace) with cortex_build_info, so only
# cluster/namespace pairs that expose cortex_build_info can match.
# The condition must hold for 24h before the alert fires.
- alert: MimirRolloutStuck
annotations:
message: |
The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck
expr: |
(
max without (revision) (
sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))
unless
sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))
)
*
(
sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))
!=
sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))
)
) and (
changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m])
==
0
)
* on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info)
for: 24h
labels:
severity: critical
workload_type: statefulset
- alert: MimirRolloutStuck
annotations:
message: |
Expand All @@ -393,6 +421,26 @@ spec:
labels:
severity: warning
workload_type: deployment
# Critical-severity MimirRolloutStuck rule for Deployments (workload_type: deployment).
# Series are grouped per rollout_group: the Deployment name with an optional
# "-zone-<lowercase letter>" suffix stripped by label_replace.
# The expression matches a rollout group when both of the following hold:
#   - the summed spec replicas differ from the summed updated replicas,
#   - the summed updated replicas did not change across the 15m subquery (1m resolution).
# The result is joined on (cluster, namespace) with cortex_build_info, so only
# cluster/namespace pairs that expose cortex_build_info can match.
# The condition must hold for 24h before the alert fires.
- alert: MimirRolloutStuck
annotations:
message: |
The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck
expr: |
(
sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))
!=
sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))
) and (
changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m])
==
0
)
* on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info)
for: 24h
labels:
severity: critical
workload_type: deployment
- alert: RolloutOperatorNotReconciling
annotations:
message: |
Expand Down
48 changes: 48 additions & 0 deletions operations/mimir-mixin-compiled-baremetal/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,34 @@ groups:
labels:
severity: warning
workload_type: statefulset
# Critical-severity MimirRolloutStuck rule for StatefulSets (workload_type: statefulset).
# Series are grouped per rollout_group: the StatefulSet name with an optional
# "-zone-<lowercase letter>" suffix stripped by label_replace.
# The expression matches a rollout group when all of the following hold:
#   - a current revision has no identically-labelled update revision,
#   - the summed desired replicas differ from the summed updated replicas,
#   - the summed updated replicas did not change across the 15m subquery (1m resolution).
# The result is joined on (cluster, namespace) with cortex_build_info, so only
# cluster/namespace pairs that expose cortex_build_info can match.
# The condition must hold for 24h before the alert fires.
- alert: MimirRolloutStuck
annotations:
message: |
The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck
expr: |
(
max without (revision) (
sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))
unless
sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))
)
*
(
sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))
!=
sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))
)
) and (
changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m])
==
0
)
* on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info)
for: 24h
labels:
severity: critical
workload_type: statefulset
- alert: MimirRolloutStuck
annotations:
message: |
Expand All @@ -381,6 +409,26 @@ groups:
labels:
severity: warning
workload_type: deployment
# Critical-severity MimirRolloutStuck rule for Deployments (workload_type: deployment).
# Series are grouped per rollout_group: the Deployment name with an optional
# "-zone-<lowercase letter>" suffix stripped by label_replace.
# The expression matches a rollout group when both of the following hold:
#   - the summed spec replicas differ from the summed updated replicas,
#   - the summed updated replicas did not change across the 15m subquery (1m resolution).
# The result is joined on (cluster, namespace) with cortex_build_info, so only
# cluster/namespace pairs that expose cortex_build_info can match.
# The condition must hold for 24h before the alert fires.
- alert: MimirRolloutStuck
annotations:
message: |
The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck
expr: |
(
sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))
!=
sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))
) and (
changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m])
==
0
)
* on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info)
for: 24h
labels:
severity: critical
workload_type: deployment
- alert: RolloutOperatorNotReconciling
annotations:
message: |
Expand Down
48 changes: 48 additions & 0 deletions operations/mimir-mixin-compiled/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,34 @@ groups:
labels:
severity: warning
workload_type: statefulset
# Critical-severity MimirRolloutStuck rule for StatefulSets (workload_type: statefulset).
# Series are grouped per rollout_group: the StatefulSet name with an optional
# "-zone-<lowercase letter>" suffix stripped by label_replace.
# The expression matches a rollout group when all of the following hold:
#   - a current revision has no identically-labelled update revision,
#   - the summed desired replicas differ from the summed updated replicas,
#   - the summed updated replicas did not change across the 15m subquery (1m resolution).
# The result is joined on (cluster, namespace) with cortex_build_info, so only
# cluster/namespace pairs that expose cortex_build_info can match.
# The condition must hold for 24h before the alert fires.
- alert: MimirRolloutStuck
annotations:
message: |
The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck
expr: |
(
max without (revision) (
sum without(statefulset) (label_replace(kube_statefulset_status_current_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))
unless
sum without(statefulset) (label_replace(kube_statefulset_status_update_revision, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))
)
*
(
sum without(statefulset) (label_replace(kube_statefulset_replicas, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))
!=
sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))
)
) and (
changes(sum without(statefulset) (label_replace(kube_statefulset_status_replicas_updated, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))[15m:1m])
==
0
)
* on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info)
for: 24h
labels:
severity: critical
workload_type: statefulset
- alert: MimirRolloutStuck
annotations:
message: |
Expand All @@ -381,6 +409,26 @@ groups:
labels:
severity: warning
workload_type: deployment
# Critical-severity MimirRolloutStuck rule for Deployments (workload_type: deployment).
# Series are grouped per rollout_group: the Deployment name with an optional
# "-zone-<lowercase letter>" suffix stripped by label_replace.
# The expression matches a rollout group when both of the following hold:
#   - the summed spec replicas differ from the summed updated replicas,
#   - the summed updated replicas did not change across the 15m subquery (1m resolution).
# The result is joined on (cluster, namespace) with cortex_build_info, so only
# cluster/namespace pairs that expose cortex_build_info can match.
# The condition must hold for 24h before the alert fires.
- alert: MimirRolloutStuck
annotations:
message: |
The {{ $labels.rollout_group }} rollout is stuck in {{ $labels.cluster }}/{{ $labels.namespace }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrolloutstuck
expr: |
(
sum without(deployment) (label_replace(kube_deployment_spec_replicas, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))
!=
sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))
) and (
changes(sum without(deployment) (label_replace(kube_deployment_status_replicas_updated, "rollout_group", "$1", "deployment", "(.*?)(?:-zone-[a-z])?"))[15m:1m])
==
0
)
* on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info)
for: 24h
labels:
severity: critical
workload_type: deployment
- alert: RolloutOperatorNotReconciling
annotations:
message: |
Expand Down
142 changes: 75 additions & 67 deletions operations/mimir-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -543,78 +543,86 @@ local utils = import 'mixin-utils/utils.libsonnet';
],
},
{
name: 'mimir-rollout-alerts',
rules: [
{
alert: $.alertName('RolloutStuck'),
expr: |||
(
max without (revision) (
%(kube_statefulset_status_current_revision)s
unless
%(kube_statefulset_status_update_revision)s
)
*
(
%(kube_statefulset_replicas)s
!=
%(kube_statefulset_status_replicas_updated)s
)
) and (
changes(%(kube_statefulset_status_replicas_updated)s[%(range_interval)s])
==
0
local statefulset_rollout_stuck(for_duration, severity) = {
alert: $.alertName('RolloutStuck'),
expr: |||
(
max without (revision) (
%(kube_statefulset_status_current_revision)s
unless
%(kube_statefulset_status_update_revision)s
)
* on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info)
||| % {
aggregation_labels: $._config.alert_aggregation_labels,
kube_statefulset_status_current_revision: groupStatefulSetByRolloutGroup('kube_statefulset_status_current_revision'),
kube_statefulset_status_update_revision: groupStatefulSetByRolloutGroup('kube_statefulset_status_update_revision'),
kube_statefulset_replicas: groupStatefulSetByRolloutGroup('kube_statefulset_replicas'),
kube_statefulset_status_replicas_updated: groupStatefulSetByRolloutGroup('kube_statefulset_status_replicas_updated'),
range_interval: '15m:' + $.alertRangeInterval(1),
},
'for': '30m',
labels: {
severity: 'warning',
workload_type: 'statefulset',
},
annotations: {
message: |||
The {{ $labels.rollout_group }} rollout is stuck in %(alert_aggregation_variables)s.
||| % $._config,
},
},
{
alert: $.alertName('RolloutStuck'),
expr: |||
*
(
%(kube_deployment_spec_replicas)s
%(kube_statefulset_replicas)s
!=
%(kube_deployment_status_replicas_updated)s
) and (
changes(%(kube_deployment_status_replicas_updated)s[%(range_interval)s])
==
0
%(kube_statefulset_status_replicas_updated)s
)
* on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info)
||| % {
aggregation_labels: $._config.alert_aggregation_labels,
kube_deployment_spec_replicas: groupDeploymentByRolloutGroup('kube_deployment_spec_replicas'),
kube_deployment_status_replicas_updated: groupDeploymentByRolloutGroup('kube_deployment_status_replicas_updated'),
range_interval: '15m:' + $.alertRangeInterval(1),
},
'for': '30m',
labels: {
severity: 'warning',
workload_type: 'deployment',
},
annotations: {
message: |||
The {{ $labels.rollout_group }} rollout is stuck in %(alert_aggregation_variables)s.
||| % $._config,
},
) and (
changes(%(kube_statefulset_status_replicas_updated)s[%(range_interval)s])
==
0
)
* on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info)
||| % {
aggregation_labels: $._config.alert_aggregation_labels,
kube_statefulset_status_current_revision: groupStatefulSetByRolloutGroup('kube_statefulset_status_current_revision'),
kube_statefulset_status_update_revision: groupStatefulSetByRolloutGroup('kube_statefulset_status_update_revision'),
kube_statefulset_replicas: groupStatefulSetByRolloutGroup('kube_statefulset_replicas'),
kube_statefulset_status_replicas_updated: groupStatefulSetByRolloutGroup('kube_statefulset_status_replicas_updated'),
range_interval: '15m:' + $.alertRangeInterval(1),
},
'for': for_duration,
labels: {
severity: severity,
workload_type: 'statefulset',
},
annotations: {
message: |||
The {{ $labels.rollout_group }} rollout is stuck in %(alert_aggregation_variables)s.
||| % $._config,
},
},

// Builds one RolloutStuck alert rule for Deployments (workload_type: 'deployment').
//
// Parameters:
//   for_duration - value used verbatim as the rule's 'for' field (e.g. '30m', '24h').
//   severity     - value used verbatim as the rule's 'severity' label (e.g. 'warning', 'critical').
//
// The expression matches when the spec replica count differs from the updated
// replica count and the updated replica count shows no changes over the
// subquery range '15m:' + $.alertRangeInterval(1). The result is joined with
// cortex_build_info on $._config.alert_aggregation_labels.
// NOTE(review): the join presumably limits the alert to places where Mimir is
// actually running -- confirm against the runbook.
// NOTE(review): groupDeploymentByRolloutGroup is defined outside this block; it
// presumably wraps the metric in the per-rollout_group aggregation -- verify.
local deployment_rollout_stuck(for_duration, severity) = {
alert: $.alertName('RolloutStuck'),
expr: |||
(
%(kube_deployment_spec_replicas)s
!=
%(kube_deployment_status_replicas_updated)s
) and (
changes(%(kube_deployment_status_replicas_updated)s[%(range_interval)s])
==
0
)
* on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info)
||| % {
aggregation_labels: $._config.alert_aggregation_labels,
kube_deployment_spec_replicas: groupDeploymentByRolloutGroup('kube_deployment_spec_replicas'),
kube_deployment_status_replicas_updated: groupDeploymentByRolloutGroup('kube_deployment_status_replicas_updated'),
range_interval: '15m:' + $.alertRangeInterval(1),
},
// How long the expression must stay true before the alert fires; supplied by the caller.
'for': for_duration,
labels: {
severity: severity,
workload_type: 'deployment',
},
annotations: {
message: |||
The {{ $labels.rollout_group }} rollout is stuck in %(alert_aggregation_variables)s.
||| % $._config,
},
},


name: 'mimir-rollout-alerts',
rules: [
statefulset_rollout_stuck('30m', 'warning'),
statefulset_rollout_stuck('24h', 'critical'),
deployment_rollout_stuck('30m', 'warning'),
deployment_rollout_stuck('24h', 'critical'),

{
alert: 'RolloutOperatorNotReconciling',
expr: |||
Expand Down

0 comments on commit b0f98f2

Please sign in to comment.