Route the `JobScrapingFailure` alert to the team owning the failing app.

The owner is looked up by joining the scrape-failure ratio with
`app_operator_app_info` on (app, cluster_id). A PromQL binary operator drops
left-hand series that have no match, so an `or on(...)` fallback re-adds those
series with team="atlas"; without it, jobs that have no owning app would stop
alerting instead of defaulting to atlas.

NOTE(review): YAML indentation in the rules hunk was reconstructed from the
usual PrometheusRule layout - confirm against the target file before applying.
NOTE(review): the post-image blob id on the rules file's `index` line predates
the fallback and is stale - regenerate the patch with `git diff` if it matters.

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 39b28216e..6a32dafcc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Change `JobScrapingFailure` to page owner team and default to atlas if not set.
+
 ### Removed
 
 - Remove cluster-autoscaler runbook which does not exist anymore
@@ -416,6 +420,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Rules unit tests: simplify files organization by removing the `capi` folder. Also fixes a bug in cloud-director tests.
 - Rules linting: run against all configured providers.
 - Exclude more containers from Rocket's `ManagementClusterContainerIsRestartingTooFrequently` alert.
+
 ## [4.62.0] - 2025-05-15
 
 ### Added
diff --git a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
index 93c065884..c50a71c2b 100644
--- a/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
+++ b/helm/prometheus-rules/templates/platform/atlas/alerting-rules/monitoring-pipeline.rules.yml
@@ -44,15 +44,28 @@ spec:
       expr: |-
         # This alert uses the same logic as the `aggregation:giantswarm:jobscrapingfailures` recording rule
         (
-          count(up == 0) by (job, installation, cluster_id, provider, pipeline)
-          /
-          count(up) by (job, installation, cluster_id, provider, pipeline)
+          (
+            (
+              count(up == 0) by (app, job, installation, cluster_id, provider, pipeline)
+              /
+              count(up) by (app, job, installation, cluster_id, provider, pipeline)
+            )
+            * on(app, cluster_id) group_left(team)
+            label_replace(app_operator_app_info, "app", "$1", "name", "[^-]+-(.+)")
+          )
+          # Jobs with no matching app_operator_app_info series have no owner team: default them to atlas.
+          or on(app, job, installation, cluster_id, provider, pipeline)
+          label_replace(
+            count(up == 0) by (app, job, installation, cluster_id, provider, pipeline)
+            /
+            count(up) by (app, job, installation, cluster_id, provider, pipeline),
+            "team", "atlas", "", ""
+          )
         ) >= 1
       for: 1d
       labels:
         area: platform
         severity: notify
-        team: atlas
         topic: observability
         cancel_if_outside_working_hours: "true"
     - alert: CriticalJobScrapingFailure