From 950b2717bf4afde808ddc8ea85a346373ebcb640 Mon Sep 17 00:00:00 2001
From: Theo Brigitte
Date: Mon, 26 May 2025 12:42:40 +0200
Subject: [PATCH 1/2] Update JobScrapingFailure to page owner team

---
 CHANGELOG.md                                            | 1 +
 .../atlas/alerting-rules/monitoring-pipeline.rules.yml  | 9 ++++-----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 83de91a2b..7da40c1c1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Rules unit tests: support for `$provider` template so we can move provider-specific tests to global tests.
 - Rules unit tests: simplify files organization by removing the `capi` folder. Also fixes a bug in cloud-director tests.
 - Rules linting: run against all configured providers.
+- Change `JobScrapingFailure` to page owner team and default to atlas if not set.
 
 ## [4.62.0] - 2025-05-15
 
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
index e63282f57..48b5a7952 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
@@ -42,16 +42,15 @@ spec:
         summary: Monitoring agent failed to scrape all targets in a job.
         runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/monitoring-job-scraping-failure/
       expr: |-
-        (
-          count(up == 0) by (job, installation, cluster_id, provider, pipeline)
+        label_replace((
+          count(up == 0) by (job, installation, cluster_id, provider, pipeline, team)
           /
-          count(up) by (job, installation, cluster_id, provider, pipeline)
-        ) >= 1
+          count(up) by (job, installation, cluster_id, provider, pipeline, team)
+        ), "team", "atlas", "team", "^$") >= 1
       for: 1d
       labels:
         area: platform
         severity: notify
-        team: atlas
         topic: observability
         cancel_if_outside_working_hours: "true"
     - alert: CriticalJobScrapingFailure

From 3a06dde86008e87cd57adba41b9397a32b6a5681 Mon Sep 17 00:00:00 2001
From: Theo Brigitte
Date: Sun, 23 Nov 2025 17:10:11 +0100
Subject: [PATCH 2/2] Use the app label from the up metrics

---
 .../alerting-rules/monitoring-pipeline.rules.yml | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
index 9816c4eaa..c50a71c2b 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
@@ -44,10 +44,14 @@ spec:
       expr: |-
         # This alert uses the same logic as the `aggregation:giantswarm:jobscrapingfailures` recording rule
         (
-          count(up == 0) by (job, installation, cluster_id, provider, pipeline, team)
-          /
-          count(up) by (job, installation, cluster_id, provider, pipeline, team)
-        ), "team", "atlas", "team", "^$") >= 1
+          (
+            count(up == 0) by (app, job, installation, cluster_id, provider, pipeline)
+            /
+            count(up) by (app, job, installation, cluster_id, provider, pipeline)
+          )
+          * on(app, cluster_id) group_left(team)
+          label_replace(app_operator_app_info, "app", "$1", "name", "[^-]+-(.+)")
+        ) >= 1
       for: 1d
       labels:
         area: platform
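
A minimal promtool unit-test sketch for the reworked JobScrapingFailure alert follows, in the spirit of the rules unit tests mentioned in the CHANGELOG. It assumes the rendered alert is available in a plain rules file named monitoring-pipeline.rules.yml; the job, app, cluster and team values (hello-world, org-hello-world, inst1, c1, capa, testing, honeybadger) are illustrative assumptions, only the up and app_operator_app_info series and the alert labels come from the patches above.

rule_files:
  - monitoring-pipeline.rules.yml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # The only target of the job is down for the whole 25h window.
      - series: 'up{job="hello-world", app="hello-world", installation="inst1", cluster_id="c1", provider="capa", pipeline="testing"}'
        values: '0x1500'
      # app_operator_app_info carries the owning team; its name label
      # ("<prefix>-<app>") is rewritten into app by the label_replace join.
      - series: 'app_operator_app_info{name="org-hello-world", cluster_id="c1", team="honeybadger"}'
        values: '1x1500'
    alert_rule_test:
      - eval_time: 25h
        alertname: JobScrapingFailure
        exp_alerts:
          - exp_labels:
              app: hello-world
              job: hello-world
              installation: inst1
              cluster_id: c1
              provider: capa
              pipeline: testing
              team: honeybadger
              area: platform
              severity: notify
              topic: observability
              cancel_if_outside_working_hours: "true"

With series like these the scraping-failure ratio is 1, the join on (app, cluster_id) attaches team="honeybadger", and once the 1d for window has elapsed the alert should page that team rather than atlas.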