diff --git a/python-pulumi/src/ptd/grafana_alerts/azure_loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/azure_loadbalancer.yaml index e5b9ed9..6383aaa 100644 --- a/python-pulumi/src/ptd/grafana_alerts/azure_loadbalancer.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/azure_loadbalancer.yaml @@ -12,10 +12,9 @@ # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ # -# Note: alert annotations reference {{$labels.cluster}}. For Azure Monitor-sourced metrics, -# this label is injected by the prometheus.relabel.default block in grafana_alloy.py. -# If Alloy is not running or relabeling is misconfigured, the label will be absent and -# the annotation will render as "in cluster " (blank). +# Note: alert annotations reference {{$labels.tenant_name}}. For Azure Monitor-sourced metrics, +# this label is injected as an external label by grafana_alloy.py. +# If Alloy is not running or the external label is misconfigured, the label will be absent. apiVersion: 1 groups: - orgId: 1 @@ -73,8 +72,22 @@ groups: execErrState: Error for: 5m annotations: - description: Azure Load Balancer backend health probe availability is below 100% for over 5 minutes, indicating unhealthy backend instances. This is a critical issue that requires immediate attention. - summary: Health probe down on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: Azure Load Balancer Health Probe Down" + description: | + Backend health probe availability is below 100%, indicating unhealthy backend instances + + ─── WHERE ─────────────────────────── + Account: {{ $labels.tenant_name }} + Resource: {{ $labels.resource }} + Location: {{ $labels.location }} + + ─── DETAILS ───────────────────────── + Metric: Health Probe Availability (DIP) + Current: < 100% + Threshold: 100% + Duration: 5 minutes + + Note: Traffic is still routed to healthy backends if available.
labels: opsgenie: "1" isPaused: false @@ -128,8 +141,20 @@ groups: execErrState: Error for: 5m annotations: - description: Azure Load Balancer data path availability is below 100% for over 5 minutes, indicating the load balancer frontend is not responding to health probes. This is a critical issue that requires immediate attention. - summary: Data path down on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}} + summary: "🔴 CRITICAL: Azure Load Balancer Data Path Down" + description: | + Load balancer frontend is not responding to health probes + + ─── WHERE ─────────────────────────── + Account: {{ $labels.tenant_name }} + Resource: {{ $labels.resource }} + Location: {{ $labels.location }} + + ─── DETAILS ───────────────────────── + Metric: Data Path Availability + Current: < 100% + Threshold: 100% + Duration: 5 minutes labels: opsgenie: "1" isPaused: false @@ -186,8 +211,23 @@ groups: execErrState: Error for: 5m annotations: - description: Azure Load Balancer is using more than 80% of allocated SNAT ports for over 5 minutes. SNAT port exhaustion can cause outbound connection failures and may require increasing the number of backend instances or using a NAT Gateway. - summary: SNAT port exhaustion on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: Azure Load Balancer SNAT Port Exhaustion" + description: | + Load balancer is using more than 80% of allocated SNAT ports + + ─── WHERE ─────────────────────────── + Account: {{ $labels.tenant_name }} + Resource: {{ $labels.resource }} + Location: {{ $labels.location }} + + ─── DETAILS ───────────────────────── + Metric: SNAT Port Utilization + Current: > 80% + Threshold: 80% + Duration: 5 minutes + + SNAT port exhaustion can cause outbound connection failures. + Consider increasing backend instances or using a NAT Gateway. labels: opsgenie: "1" isPaused: false