python-pulumi/src/ptd/grafana_alerts/azure_loadbalancer.yaml (50 additions, 10 deletions)
@@ -12,10 +12,9 @@
 #
 # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
 #
-# Note: alert annotations reference {{$labels.cluster}}. For Azure Monitor-sourced metrics,
-# this label is injected by the prometheus.relabel.default block in grafana_alloy.py.
-# If Alloy is not running or relabeling is misconfigured, the label will be absent and
-# the annotation will render as "in cluster " (blank).
+# Note: alert annotations reference {{$labels.tenant_name}}. For Azure Monitor-sourced metrics,
+# this label is injected as an external label by grafana_alloy.py.
+# If Alloy is not running or relabeling is misconfigured, the label will be absent.
 apiVersion: 1
 groups:
 - orgId: 1
@@ -73,8 +72,22 @@ groups:
       execErrState: Error
       for: 5m
       annotations:
-        description: Azure Load Balancer backend health probe availability is below 100% for over 5 minutes, indicating unhealthy backend instances. This is a critical issue that requires immediate attention.
-        summary: Health probe down on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}}
+        summary: "🟡 WARNING: Azure Load Balancer Health Probe Down"
+        description: |
+          Backend health probe availability is below 100%, indicating unhealthy backend instances
+
+          ─── WHERE ───────────────────────────
+          Account: {{ $labels.tenant_name }}
+          Resource: {{ $labels.resource }}
+          Location: {{ $labels.location }}
+
+          ─── DETAILS ─────────────────────────
+          Metric: Health Probe Availability (DIP)
+          Current: < 100%
+          Threshold: 100%
+          Duration: 5 minutes
+
+          Note: Traffic is still routed to healthy backends if available.
       labels:
         opsgenie: "1"
       isPaused: false
@@ -128,8 +141,20 @@ groups:
       execErrState: Error
       for: 5m
      annotations:
-        description: Azure Load Balancer data path availability is below 100% for over 5 minutes, indicating the load balancer frontend is not responding to health probes. This is a critical issue that requires immediate attention.
-        summary: Data path down on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}}
+        summary: "🔴 CRITICAL: Azure Load Balancer Data Path Down"
+        description: |
+          Load balancer frontend is not responding to health probes
+
+          ─── WHERE ───────────────────────────
+          Account: {{ $labels.tenant_name }}
+          Resource: {{ $labels.resource }}
+          Location: {{ $labels.location }}
+
+          ─── DETAILS ─────────────────────────
+          Metric: Data Path Availability
+          Current: < 100%
+          Threshold: 100%
+          Duration: 5 minutes
       labels:
         opsgenie: "1"
       isPaused: false
@@ -186,8 +211,23 @@ groups:
       execErrState: Error
       for: 5m
       annotations:
-        description: Azure Load Balancer is using more than 80% of allocated SNAT ports for over 5 minutes. SNAT port exhaustion can cause outbound connection failures and may require increasing the number of backend instances or using a NAT Gateway.
-        summary: SNAT port exhaustion on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}}
+        summary: "🟡 WARNING: Azure Load Balancer SNAT Port Exhaustion"
+        description: |
+          Load balancer is using more than 80% of allocated SNAT ports
+
+          ─── WHERE ───────────────────────────
+          Account: {{ $labels.tenant_name }}
+          Resource: {{ $labels.resource }}
+          Location: {{ $labels.location }}
+
+          ─── DETAILS ─────────────────────────
+          Metric: SNAT Port Utilization
+          Current: > 80%
+          Threshold: 80%
+          Duration: 5 minutes
+
+          SNAT port exhaustion can cause outbound connection failures.
+          Consider increasing backend instances or using a NAT Gateway.
       labels:
         opsgenie: "1"
       isPaused: false
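The header comment says `{{$labels.tenant_name}}` is injected as an external label by `grafana_alloy.py`. As context for reviewers, here is a minimal sketch of how such a label can be attached in the generated Grafana Alloy configuration; the endpoint URL and label value are placeholders, not taken from this PR:

```river
// Sketch only: grafana_alloy.py is assumed to emit a block along these lines.
prometheus.remote_write "default" {
  endpoint {
    url = "https://prometheus.example.com/api/v1/write" // placeholder URL
  }

  // external_labels are attached to every series at write time, so alert
  // annotations in azure_loadbalancer.yaml can reference {{$labels.tenant_name}}.
  external_labels = {
    tenant_name = "example-tenant", // placeholder value
  }
}
```

If Alloy is not running or this block is missing, `tenant_name` is absent and the `Account:` line in the annotations renders blank, as the header comment warns.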