@@ -20,9 +20,9 @@ groups:
20
20
The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors.
21
21
runbook_url : https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors
22
22
expr : |
23
- 100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready"}[1m]))
23
+ 100 * sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",route!~"ready|debug_pprof"}[1m]))
24
24
/
25
- sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready"}[1m]))
25
+ sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m]))
26
26
> 1
27
27
for : 15m
28
28
labels :
@@ -33,7 +33,7 @@ groups:
33
33
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
34
34
runbook_url : https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequestlatency
35
35
expr : |
36
- cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop"}
36
+ cluster_namespace_job_route:cortex_request_duration_seconds:99quantile{route!~"metrics|/frontend.Frontend/Process|ready|/schedulerpb.SchedulerForFrontend/FrontendLoop|/schedulerpb.SchedulerForQuerier/QuerierLoop|debug_pprof"}
37
37
>
38
38
2.5
39
39
for : 15m
@@ -117,11 +117,20 @@ groups:
117
117
- alert : MimirIngesterRestarts
118
118
annotations :
119
119
message :
120
- ' {{ $labels.job }}/ {{ $labels.instance }} has restarted {{ printf "%.2f"
121
- $value }} times in the last 30 mins.'
120
+ Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
121
+ }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.
122
122
runbook_url : https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterrestarts
123
123
expr : |
124
- changes(process_start_time_seconds{job=~".*/(ingester.*|cortex|mimir|mimir-write.*)"}[30m]) >= 2
124
+ (
125
+ sum by(cluster, namespace, instance) (
126
+ increase(kube_pod_container_status_restarts_total{container=~"(ingester|mimir-write)"}[30m])
127
+ )
128
+ >= 2
129
+ )
130
+ and
131
+ (
132
+ count by(cluster, namespace, instance) (cortex_build_info) > 0
133
+ )
125
134
labels :
126
135
severity : warning
127
136
- alert : MimirKVStoreFailure
@@ -187,6 +196,21 @@ groups:
187
196
for : 1h
188
197
labels :
189
198
severity : warning
199
+ - alert : MimirIngestedDataTooFarInTheFuture
200
+ annotations :
201
+ message :
202
+ Mimir ingester {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace
203
+ }} has ingested samples with timestamps more than 1h in the future.
204
+ runbook_url : https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesteddatatoofarinthefuture
205
+ expr : |
206
+ max by(cluster, namespace, instance) (
207
+ cortex_ingester_tsdb_head_max_timestamp_seconds - time()
208
+ and
209
+ cortex_ingester_tsdb_head_max_timestamp_seconds > 0
210
+ ) > 60*60
211
+ for : 5m
212
+ labels :
213
+ severity : warning
190
214
- alert : MimirRingMembersMismatch
191
215
annotations :
192
216
message : |
@@ -350,26 +374,6 @@ groups:
350
374
severity : critical
351
375
- name : mimir-provisioning
352
376
rules :
353
- - alert : MimirProvisioningTooManyActiveSeries
354
- annotations :
355
- message : |
356
- The number of in-memory series per ingester in {{ $labels.cluster }}/{{ $labels.namespace }} is too high.
357
- runbook_url : https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanyactiveseries
358
- expr : |
359
- avg by (cluster, namespace) (cortex_ingester_memory_series) > 1.6e6
360
- for : 2h
361
- labels :
362
- severity : warning
363
- - alert : MimirProvisioningTooManyWrites
364
- annotations :
365
- message : |
366
- Ingesters in {{ $labels.cluster }}/{{ $labels.namespace }} ingest too many samples per second.
367
- runbook_url : https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirprovisioningtoomanywrites
368
- expr : |
369
- avg by (cluster, namespace) (cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m) > 80e3
370
- for : 15m
371
- labels :
372
- severity : warning
373
377
- alert : MimirAllocatingTooMuchMemory
374
378
annotations :
375
379
message : |
@@ -476,7 +480,7 @@ groups:
476
480
}} sees incorrect number of gossip members.
477
481
runbook_url : https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirgossipmembersmismatch
478
482
expr : |
479
- avg by (cluster, namespace) (memberlist_client_cluster_members_count) != sum by (cluster, namespace) (up{job=~".+/(alertmanager|compactor|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"})
483
+ avg by (cluster, namespace) (memberlist_client_cluster_members_count) != sum by (cluster, namespace) (up{job=~".+/(admin-api|alertmanager|compactor.*|distributor|ingester.*|querier.*|ruler|ruler-querier.*|store-gateway.*|cortex|mimir|mimir-write.*|mimir-read.*|mimir-backend.*)"})
480
484
for : 15m
481
485
labels :
482
486
severity : warning
@@ -615,9 +619,9 @@ groups:
615
619
}} has not shipped any block in the last 4 hours.
616
620
runbook_url : https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblocks
617
621
expr : |
618
- (min by(cluster, namespace, instance) (time() - thanos_shipper_last_successful_upload_time ) > 60 * 60 * 4)
622
+ (min by(cluster, namespace, instance) (time() - cortex_ingester_shipper_last_successful_upload_timestamp_seconds ) > 60 * 60 * 4)
619
623
and
620
- (max by(cluster, namespace, instance) (thanos_shipper_last_successful_upload_time ) > 0)
624
+ (max by(cluster, namespace, instance) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds ) > 0)
621
625
and
622
626
# Only if the ingester has ingested samples over the last 4h.
623
627
(max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
@@ -637,7 +641,7 @@ groups:
637
641
}} has not shipped any block in the last 4 hours.
638
642
runbook_url : https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimiringesterhasnotshippedblockssincestart
639
643
expr : |
640
- (max by(cluster, namespace, instance) (thanos_shipper_last_successful_upload_time ) == 0)
644
+ (max by(cluster, namespace, instance) (cortex_ingester_shipper_last_successful_upload_timestamp_seconds ) == 0)
641
645
and
642
646
(max by(cluster, namespace, instance) (max_over_time(cluster_namespace_instance:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
643
647
for : 4h
0 commit comments