Skip to content

Commit d7e492c

Browse files
committed
chore: update mimir rules
1 parent 6a72d2f commit d7e492c

File tree

1 file changed

+34
-30
lines changed

1 file changed

+34
-30
lines changed

rules/mimir_alerts.yml

Lines changed: 34 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,9 @@ groups:
2020
The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors.
2121
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors
2222
expr: |
23-
100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready"}[1m]))
23+
100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready|debug_pprof"}[1m]))
2424
/
25-
sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready"}[1m]))
25+
sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m]))
2626
> 1
2727
for: 15m
2828
labels:
@@ -33,7 +33,7 @@ groups:
3333
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
3434
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency
3535
expr: |
36-
cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"}
36+
cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"}
3737
>
3838
2.5
3939
for: 15m
@@ -117,11 +117,20 @@ groups:
117117
- alert: MimirIngesterRestarts
118118
annotations:
119119
message:
120-
'{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f"
121-
$value }} times in the last 30 mins.'
120+
Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
121+
}} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.
122122
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterrestarts
123123
expr: |
124-
changes(process_start_time_seconds{job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"}[30m]) >= 2
124+
(
125+
sum by(cluster, namespace, instance) (
126+
increase(kube_pod_container_status_restarts_total{container=~"(ingester|mimir-write)"}[30m])
127+
)
128+
>= 2
129+
)
130+
and
131+
(
132+
count by(cluster, namespace, instance) (cortex_build_info) > 0
133+
)
125134
labels:
126135
severity: warning
127136
- alert: MimirKVStoreFailure
@@ -187,6 +196,21 @@ groups:
187196
for: 1h
188197
labels:
189198
severity: warning
199+
- alert: MimirIngestedDataTooFarInTheFuture
200+
annotations:
201+
message:
202+
Mimir ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
203+
}} has ingested samples with timestamps more than 1h in the future.
204+
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesteddatatoofarinthefuture
205+
expr: |
206+
max by(cluster, namespace, instance) (
207+
cortex_ingester_tsdb_head_max_timestamp_seconds - time()
208+
and
209+
cortex_ingester_tsdb_head_max_timestamp_seconds > 0
210+
) > 60*60
211+
for: 5m
212+
labels:
213+
severity: warning
190214
- alert: MimirRingMembersMismatch
191215
annotations:
192216
message: |
@@ -350,26 +374,6 @@ groups:
350374
severity: critical
351375
- name: mimir-provisioning
352376
rules:
353-
- alert: MimirProvisioningTooManyActiveSeries
354-
annotations:
355-
message: |
356-
The number of in-memory series per ingester in {{ $labels.cluster }}/{{ $labels.namespace }} is too high.
357-
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanyactiveseries
358-
expr: |
359-
avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.6e6
360-
for: 2h
361-
labels:
362-
severity: warning
363-
- alert: MimirProvisioningTooManyWrites
364-
annotations:
365-
message: |
366-
Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} ingest too many samples per second.
367-
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanywrites
368-
expr: |
369-
avg by (cluster, namespace) (cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m) > 80e3
370-
for: 15m
371-
labels:
372-
severity: warning
373377
- alert: MimirAllocatingTooMuchMemory
374378
annotations:
375379
message: |
@@ -476,7 +480,7 @@ groups:
476480
}} sees incorrect number of gossip members.
477481
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmembersmismatch
478482
expr: |
479-
avg by (cluster, namespace) (memberlist_client_cluster_members_count) != sum by (cluster, namespace) (up{job=~".+/(alertmanager|compactor|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"})
483+
avg by (cluster, namespace) (memberlist_client_cluster_members_count) != sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"})
480484
for: 15m
481485
labels:
482486
severity: warning
@@ -615,9 +619,9 @@ groups:
615619
}} has not shipped any block in the last 4 hours.
616620
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks
617621
expr: |
618-
(min by(cluster, namespace, instance) (time() - thanos_shipper_last_successful_upload_time) > 60 * 60 * 4)
622+
(min by(cluster, namespace, instance) (time() - cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 60 * 60 * 4)
619623
and
620-
(max by(cluster, namespace, instance) (thanos_shipper_last_successful_upload_time) > 0)
624+
(max by(cluster, namespace, instance) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) > 0)
621625
and
622626
# Only if the ingester has ingested samples over the last 4h.
623627
(max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
@@ -637,7 +641,7 @@ groups:
637641
}} has not shipped any block in the last 4 hours.
638642
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart
639643
expr: |
640-
(max by(cluster, namespace, instance) (thanos_shipper_last_successful_upload_time) == 0)
644+
(max by(cluster, namespace, instance) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds) == 0)
641645
and
642646
(max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
643647
for: 4h

0 commit comments

Comments
 (0)