From 950b2717bf4afde808ddc8ea85a346373ebcb640 Mon Sep 17 00:00:00 2001
From: Theo Brigitte
Date: Mon, 26 May 2025 12:42:40 +0200
Subject: [PATCH 1/2] Update JobScrapingFailure to page owner team

---
 CHANGELOG.md                                            | 1 +
 .../atlas/alerting-rules/monitoring-pipeline.rules.yml  | 9 ++++-----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 83de91a2b..7da40c1c1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Rules unit tests: support for `$provider` template so we can move provider-specific tests to global tests.
 - Rules unit tests: simplify files organization by removing the `capi` folder. Also fixes a bug in cloud-director tests.
 - Rules linting: run against all configured providers.
+- Change `JobScrapingFailure` to page owner team and default to atlas if not set.
 
 ## [4.62.0] - 2025-05-15
 
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
index e63282f57..48b5a7952 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
@@ -42,16 +42,15 @@ spec:
         summary: Monitoring agent failed to scrape all targets in a job.
         runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/monitoring-job-scraping-failure/
       expr: |-
-        (
-          count(up == 0) by (job, installation, cluster_id, provider, pipeline)
+        label_replace((
+          count(up == 0) by (job, installation, cluster_id, provider, pipeline, team)
           /
-          count(up) by (job, installation, cluster_id, provider, pipeline)
-        ) >= 1
+          count(up) by (job, installation, cluster_id, provider, pipeline, team)
+        ), "team", "atlas", "team", "^$") >= 1
       for: 1d
       labels:
         area: platform
         severity: notify
-        team: atlas
         topic: observability
         cancel_if_outside_working_hours: "true"
     - alert: CriticalJobScrapingFailure

From 3a06dde86008e87cd57adba41b9397a32b6a5681 Mon Sep 17 00:00:00 2001
From: Theo Brigitte
Date: Sun, 23 Nov 2025 17:10:11 +0100
Subject: [PATCH 2/2] Use the app label from the up metrics

---
 .../alerting-rules/monitoring-pipeline.rules.yml | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
index 9816c4eaa..c50a71c2b 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
@@ -44,10 +44,14 @@ spec:
       expr: |-
         # This alert uses the same logic as the `aggregation:giantswarm:jobscrapingfailures` recording rule
         (
-          count(up == 0) by (job, installation, cluster_id, provider, pipeline, team)
-          /
-          count(up) by (job, installation, cluster_id, provider, pipeline, team)
-        ), "team", "atlas", "team", "^$") >= 1
+          (
+            count(up == 0) by (app, job, installation, cluster_id, provider, pipeline)
+            /
+            count(up) by (app, job, installation, cluster_id, provider, pipeline)
+          )
+          * on(app, cluster_id) group_left(team)
+          label_replace(app_operator_app_info, "app", "$1", "name", "[^-]+-(.+)")
+        ) >= 1
       for: 1d
       labels:
         area: platform
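
A minimal promtool unit-test sketch for the reworked JobScrapingFailure alert follows, in the spirit of the rules unit tests mentioned in the CHANGELOG. It assumes the rendered alert is available in a plain rules file named monitoring-pipeline.rules.yml; the job, app, cluster and team values (hello-world, org-hello-world, inst1, c1, capa, testing, honeybadger) are illustrative assumptions, only the up and app_operator_app_info series and the alert labels come from the patches above.

rule_files:
  - monitoring-pipeline.rules.yml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # The only target of the job is down for the whole 25h window.
      - series: 'up{job="hello-world", app="hello-world", installation="inst1", cluster_id="c1", provider="capa", pipeline="testing"}'
        values: '0x1500'
      # app_operator_app_info carries the owning team; its name label
      # ("<prefix>-<app>") is rewritten into app by the label_replace join.
      - series: 'app_operator_app_info{name="org-hello-world", cluster_id="c1", team="honeybadger"}'
        values: '1x1500'
    alert_rule_test:
      - eval_time: 25h
        alertname: JobScrapingFailure
        exp_alerts:
          - exp_labels:
              app: hello-world
              job: hello-world
              installation: inst1
              cluster_id: c1
              provider: capa
              pipeline: testing
              team: honeybadger
              area: platform
              severity: notify
              topic: observability
              cancel_if_outside_working_hours: "true"

With series like these the scraping-failure ratio is 1, the join on (app, cluster_id) attaches team="honeybadger", and once the 1d for window has elapsed the alert should page that team rather than atlas.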