diff --git a/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-elb-5xx-high.json b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-elb-5xx-high.json new file mode 100644 index 00000000000..d562a4cad72 --- /dev/null +++ b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-elb-5xx-high.json @@ -0,0 +1,48 @@ +{ + "id": "aws_elb_otel-elb-5xx-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when load-balancer-generated 5XX error rate exceeds a tunable threshold. Indicates edge/infrastructure failures such as no healthy targets, connection timeouts, or LB capacity issues.", + "name": "[AWS ELB OTel] High ELB 5XX error rate", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "aws", + "aws-elb" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "investigation_guide": { + "blob": "## High ELB 5XX Error Rate\n\nELB-generated 5XX responses are produced by the load balancer itself — typically when no healthy targets are available, targets time out on connection, or the LB hits capacity limits.\n\n### Triage Steps\n1. Identify the affected load balancer and region from the alert context.\n2. Check `UnHealthyHostCount` and `HealthyHostCount` for all target groups behind this ALB.\n3. Verify target instances are running and passing health checks (security groups, listener rules, target registration).\n4. Look for `RejectedConnectionCount` or high `ConsumedLCUs` indicating LB capacity pressure.\n5. 
If targets are healthy, check for misconfigured listeners, SSL/TLS issues, or idle timeout mismatches.\n\n### Escalation\nELB 5XX with zero healthy targets is a service outage — page immediately.\n\n### Tuning\n- Adjust the `error_rate_pct` threshold (default 0.5%) to match your tolerance for edge errors.\n- Adjust `requests > 100` minimum sample size based on expected traffic volume." + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-awscloudwatchreceiver.otel-*\n| WHERE attributes.Namespace == \"AWS/ApplicationELB\"\n AND attributes.stat == \"Sum\"\n AND attributes.MetricName IN (\"RequestCount\", \"HTTPCode_ELB_5XX_Count\")\n| STATS\n requests = SUM(`metrics.amazonaws.com/AWS/ApplicationELB/RequestCount`),\n elb_5xx = SUM(`metrics.amazonaws.com/AWS/ApplicationELB/HTTPCode_ELB_5XX_Count`)\n BY attributes.LoadBalancer, resource.attributes.cloud.region\n// Minimum request volume — tune to avoid noisy alerts on low-traffic load balancers\n| WHERE requests > 100\n| EVAL error_rate_pct = ROUND(elb_5xx / requests * 100.0, 2)\n// Alert threshold — tune to match your tolerance (default 0.5%)\n| WHERE error_rate_pct > 0.5\n| SORT error_rate_pct DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.LoadBalancer", + "termSize": 50, + "excludeHitsFromPreviousRun": true + } + } +} diff --git a/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-lcu-peak-high.json b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-lcu-peak-high.json new file mode 100644 index 00000000000..3726e9b615d --- /dev/null +++ b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-lcu-peak-high.json @@ -0,0 +1,48 @@ +{ + "id": "aws_elb_otel-lcu-peak-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when peak consumed 
LCUs exceed a tunable threshold. Indicates the ALB is scaling toward capacity limits and may soon reject connections.", + "name": "[AWS ELB OTel] High peak consumed LCUs", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "aws", + "aws-elb" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "investigation_guide": { + "blob": "## High Peak Consumed LCUs\n\n`ConsumedLCUs` is the ALB capacity consumption measure combining new connections, active connections, processed bytes, and rule evaluations. Sustained high LCUs precede connection rejection.\n\n### Triage Steps\n1. Identify the affected load balancer and region.\n2. Correlate with `RequestCount`, `NewConnectionCount`, `ActiveConnectionCount`, and `ProcessedBytes` to find the LCU driver.\n3. Check for `RejectedConnectionCount` — if present, capacity is already breached.\n4. Review traffic patterns for unexpected spikes, bot traffic, or retry storms.\n5. Evaluate whether application changes reduced connection reuse (HTTP/1.1 vs HTTP/2).\n\n### Tuning\n- **Critical:** Set the `peak_lcus > 100` threshold based on your provisioned capacity and historical baseline — the default is a placeholder, not derived from sample data.\n- Monitor trend over time; a single spike may be benign if brief." 
+ } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-awscloudwatchreceiver.otel-*\n| WHERE attributes.Namespace == \"AWS/ApplicationELB\"\n AND attributes.MetricName == \"ConsumedLCUs\"\n AND attributes.stat == \"Maximum\"\n| STATS peak_lcus = MAX(`metrics.amazonaws.com/AWS/ApplicationELB/ConsumedLCUs`)\n BY attributes.LoadBalancer, resource.attributes.cloud.region\n// Tune this threshold to your provisioned LCU capacity (default 100 is a placeholder)\n| WHERE peak_lcus > 100\n| SORT peak_lcus DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.LoadBalancer", + "termSize": 50, + "excludeHitsFromPreviousRun": true + } + } +} diff --git a/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-rejected-connections.json b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-rejected-connections.json new file mode 100644 index 00000000000..56887146eb3 --- /dev/null +++ b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-rejected-connections.json @@ -0,0 +1,48 @@ +{ + "id": "aws_elb_otel-rejected-connections", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when the ALB rejects connections because it reached its connection ceiling. A hard capacity-class failure requiring immediate attention.", + "name": "[AWS ELB OTel] Rejected connections detected", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "aws", + "aws-elb" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "investigation_guide": { + "blob": "## Rejected Connections Detected\n\nNon-zero `RejectedConnectionCount` means the ALB dropped incoming connections because it hit its connection limit. 
Clients experience connection failures.\n\n### Triage Steps\n1. Identify the affected load balancer and region.\n2. Check `ActiveConnectionCount` and `NewConnectionCount` for connection storm patterns.\n3. Review `ConsumedLCUs` — the LB may be approaching capacity limits.\n4. Look for connection leaks on clients (missing keep-alive, retry storms) or DDoS/abuse traffic.\n5. If rejections are sustained, consider requesting ALB pre-warming or a capacity review from AWS.\n\n### Escalation\nAny rejected connections on a production ALB warrant immediate investigation." + } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-awscloudwatchreceiver.otel-*\n| WHERE attributes.Namespace == \"AWS/ApplicationELB\"\n AND attributes.MetricName == \"RejectedConnectionCount\"\n AND attributes.stat == \"Sum\"\n| STATS rejected = SUM(`metrics.amazonaws.com/AWS/ApplicationELB/RejectedConnectionCount`)\n BY attributes.LoadBalancer, resource.attributes.cloud.region\n// Any rejected connection is notable — threshold intentionally 0\n| WHERE rejected > 0\n| SORT rejected DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.LoadBalancer", + "termSize": 50, + "excludeHitsFromPreviousRun": true + } + } +} diff --git a/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-request-count-zero.json b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-request-count-zero.json new file mode 100644 index 00000000000..b517f4bfe06 --- /dev/null +++ b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-request-count-zero.json @@ -0,0 +1,48 @@ +{ + "id": "aws_elb_otel-request-count-zero", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when a load balancer receives zero routed requests over the evaluation window. 
Detects traffic drops that may indicate upstream DNS, routing, or client-side failures.", + "name": "[AWS ELB OTel] Zero request traffic", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "aws", + "aws-elb" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "investigation_guide": { + "blob": "## Zero Request Traffic\n\nA sudden drop to zero `RequestCount` on an ALB that normally serves traffic can indicate DNS failures, upstream routing changes, WAF blocks, or client connectivity issues — even when backends are healthy.\n\n### Triage Steps\n1. Confirm the ALB is expected to receive traffic during this window (exclude maintenance windows).\n2. Check DNS resolution and Route 53 health checks for the ALB endpoint.\n3. Verify listener rules, WAF associations, and security group ingress are unchanged.\n4. Check whether `HTTPCode_ELB_5XX_Count` or `UnHealthyHostCount` explains the drop (backends down vs upstream failure).\n5. Compare with CloudFront, API Gateway, or other upstream layers if applicable.\n\n### Tuning\n- Deploy only on ALBs that should always receive traffic; disable for dev/staging or intentionally idle load balancers.\n- Adjust `timeWindowSize` — a longer window reduces false positives from brief quiet periods." 
+ } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-awscloudwatchreceiver.otel-*\n| WHERE attributes.Namespace == \"AWS/ApplicationELB\"\n AND attributes.MetricName == \"RequestCount\"\n AND attributes.stat == \"Sum\"\n| STATS requests = SUM(`metrics.amazonaws.com/AWS/ApplicationELB/RequestCount`)\n BY attributes.LoadBalancer, resource.attributes.cloud.region\n// Zero requests in the window — deploy only on always-on production ALBs\n| WHERE requests == 0\n| SORT attributes.LoadBalancer ASC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.LoadBalancer", + "termSize": 50, + "excludeHitsFromPreviousRun": true + } + } +} diff --git a/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-target-5xx-high.json b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-target-5xx-high.json new file mode 100644 index 00000000000..7f217090414 --- /dev/null +++ b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-target-5xx-high.json @@ -0,0 +1,48 @@ +{ + "id": "aws_elb_otel-target-5xx-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when target-generated 5XX error rate exceeds a tunable threshold for any load balancer target group. 
Indicates application or backend failures behind the ALB.", + "name": "[AWS ELB OTel] High target 5XX error rate", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "aws", + "aws-elb" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "investigation_guide": { + "blob": "## High Target 5XX Error Rate\n\nTarget-generated 5XX responses mean the application or backend behind the ALB is failing — not the load balancer itself.\n\n### Triage Steps\n1. Identify the affected load balancer, target group, and region from the alert context.\n2. Check application logs and traces for the backends in the target group.\n3. Review recent deployments, dependency outages, or resource saturation on target instances.\n4. Compare with ELB-generated 5XX — if ELB 5XX is also rising, targets may be unreachable (health check failures, connection timeouts).\n5. Inspect target health (`UnHealthyHostCount`) and response time for the same target group.\n\n### Escalation\nIf target 5XX rate persists above the threshold for more than 15 minutes, page the service owner.\n\n### Tuning\n- Adjust the `error_rate_pct` threshold (default 1%) to match your SLO.\n- Adjust `requests > 100` minimum sample size based on expected traffic volume." 
+ } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-awscloudwatchreceiver.otel-*\n| WHERE attributes.Namespace == \"AWS/ApplicationELB\"\n AND attributes.stat == \"Sum\"\n AND attributes.MetricName IN (\"RequestCount\", \"HTTPCode_Target_5XX_Count\")\n| STATS\n requests = SUM(`metrics.amazonaws.com/AWS/ApplicationELB/RequestCount`),\n target_5xx = SUM(`metrics.amazonaws.com/AWS/ApplicationELB/HTTPCode_Target_5XX_Count`)\n BY attributes.LoadBalancer, attributes.TargetGroup, resource.attributes.cloud.region\n// Minimum request volume — tune to avoid noisy alerts on low-traffic target groups\n| WHERE requests > 100\n| EVAL error_rate_pct = ROUND(target_5xx / requests * 100.0, 2)\n// Alert threshold — tune to match your SLO (default 1%)\n| WHERE error_rate_pct > 1.0\n| SORT error_rate_pct DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.TargetGroup", + "termSize": 50, + "excludeHitsFromPreviousRun": true + } + } +} diff --git a/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-target-response-time-high.json b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-target-response-time-high.json new file mode 100644 index 00000000000..585d343aaf6 --- /dev/null +++ b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-target-response-time-high.json @@ -0,0 +1,48 @@ +{ + "id": "aws_elb_otel-target-response-time-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when average target response time exceeds a tunable threshold. 
Indicates degradation in typical backend latency, even when error rates remain low.", + "name": "[AWS ELB OTel] High target response time (average)", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "aws", + "aws-elb" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "investigation_guide": { + "blob": "## High Target Response Time (Average)\n\nRising average `TargetResponseTime` means backends are slow to begin sending response headers — users experience degraded latency even without HTTP errors.\n\n### Triage Steps\n1. Identify the affected load balancer, target group, and region.\n2. Check backend CPU, memory, and thread pool saturation on target instances.\n3. Review downstream dependency latency (databases, APIs) for the affected service.\n4. Compare with the tail latency alert (`Maximum` stat) to see whether a few slow requests or systemic slowness is driving the average up.\n5. Check whether traffic has shifted to fewer healthy targets (see `UnHealthyHostCount`).\n\n### Tuning\n- Adjust the `avg_response_time > 2.0` threshold (seconds) to match your latency SLO.\n- CloudWatch percentiles are not available in this data source; use the tail (`Maximum`) alert for worst-case latency." 
+ } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-awscloudwatchreceiver.otel-*\n| WHERE attributes.Namespace == \"AWS/ApplicationELB\"\n AND attributes.MetricName == \"TargetResponseTime\"\n AND attributes.stat == \"Average\"\n| STATS avg_response_time = AVG(`metrics.amazonaws.com/AWS/ApplicationELB/TargetResponseTime`)\n BY attributes.LoadBalancer, attributes.TargetGroup, resource.attributes.cloud.region\n// Alert threshold in seconds — tune to match your latency SLO (default 2.0s)\n| WHERE avg_response_time > 2.0\n| SORT avg_response_time DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.TargetGroup", + "termSize": 50, + "excludeHitsFromPreviousRun": true + } + } +} diff --git a/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-target-response-time-tail-high.json b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-target-response-time-tail-high.json new file mode 100644 index 00000000000..b9d5bd7df73 --- /dev/null +++ b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-target-response-time-tail-high.json @@ -0,0 +1,48 @@ +{ + "id": "aws_elb_otel-target-response-time-tail-high", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when peak (maximum) target response time exceeds a tunable threshold. 
Serves as a tail-latency proxy where CloudWatch percentiles are unavailable.", + "name": "[AWS ELB OTel] High target response time (tail)", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "aws", + "aws-elb" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 3 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "investigation_guide": { + "blob": "## High Target Response Time (Tail)\n\nPeak `TargetResponseTime` (`Maximum` stat) captures the worst single observation in the window — a proxy for p95/p99 tail latency since percentile statistics are not collected by this source.\n\n### Triage Steps\n1. Identify the affected load balancer, target group, and region.\n2. Look for outlier requests: slow queries, cold starts, GC pauses, or timeout retries.\n3. Compare with the average response time alert — a high tail with normal average suggests a few bad requests; both elevated suggests systemic slowness.\n4. Check target health and whether remaining healthy targets are overloaded.\n\n### Tuning\n- Adjust the `max_response_time > 5.0` threshold (seconds) to approximate your p99 SLO.\n- Pair with the average response time alert for typical vs tail latency visibility." 
+ } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-awscloudwatchreceiver.otel-*\n| WHERE attributes.Namespace == \"AWS/ApplicationELB\"\n AND attributes.MetricName == \"TargetResponseTime\"\n AND attributes.stat == \"Maximum\"\n| STATS max_response_time = MAX(`metrics.amazonaws.com/AWS/ApplicationELB/TargetResponseTime`)\n BY attributes.LoadBalancer, attributes.TargetGroup, resource.attributes.cloud.region\n// Tail latency threshold in seconds — tune to approximate p99 SLO (default 5.0s)\n| WHERE max_response_time > 5.0\n| SORT max_response_time DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 15, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.TargetGroup", + "termSize": 50, + "excludeHitsFromPreviousRun": true + } + } +} diff --git a/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-unhealthy-hosts.json b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-unhealthy-hosts.json new file mode 100644 index 00000000000..007d649b84e --- /dev/null +++ b/packages/aws_elb_otel/kibana/alerting_rule_template/aws_elb_otel-unhealthy-hosts.json @@ -0,0 +1,48 @@ +{ + "id": "aws_elb_otel-unhealthy-hosts", + "type": "alerting_rule_template", + "managed": true, + "attributes": { + "description": "Alerts when any target in a target group is failing health checks (UnHealthyHostCount > 0). 
Early warning before healthy capacity collapses to zero.", + "name": "[AWS ELB OTel] Unhealthy targets detected", + "ruleTypeId": ".es-query", + "tags": [ + "observability", + "aws", + "aws-elb" + ], + "schedule": { + "interval": "5m" + }, + "alertDelay": { + "active": 2 + }, + "flapping": { + "lookBackWindow": 10, + "statusChangeThreshold": 4 + }, + "artifacts": { + "investigation_guide": { + "blob": "## Unhealthy Targets Detected\n\n`UnHealthyHostCount > 0` means at least one target in the group failed ALB health checks during the window. If all targets become unhealthy, the ALB serves ELB 5XX and the service is effectively down.\n\n### Triage Steps\n1. Identify the affected load balancer, target group, and region.\n2. Check which targets are unhealthy in the AWS console (EC2/ECS/EKS target health view).\n3. Verify health check path, port, protocol, and expected response codes match the application.\n4. Inspect target instance/container logs for crashes, OOM, or failed readiness probes.\n5. Check security group and NACL rules between the ALB and targets.\n6. Monitor `HealthyHostCount` — if trending toward zero, treat as an outage.\n\n### Note\nThis source collects `Maximum` for `UnHealthyHostCount` (not `Minimum` as AWS recommends for AZ-wide alarms) — any unhealthy target in the period triggers the alert." 
+ } + }, + "params": { + "searchType": "esqlQuery", + "esqlQuery": { + "esql": "FROM metrics-awscloudwatchreceiver.otel-*\n| WHERE attributes.Namespace == \"AWS/ApplicationELB\"\n AND attributes.MetricName == \"UnHealthyHostCount\"\n AND attributes.stat == \"Maximum\"\n| STATS unhealthy_hosts = MAX(`metrics.amazonaws.com/AWS/ApplicationELB/UnHealthyHostCount`)\n BY attributes.LoadBalancer, attributes.TargetGroup, resource.attributes.cloud.region\n// Any unhealthy target in the window — threshold is intentionally 0\n| WHERE unhealthy_hosts > 0\n| SORT unhealthy_hosts DESC" + }, + "size": 0, + "threshold": [ + 0 + ], + "thresholdComparator": ">", + "timeField": "@timestamp", + "timeWindowSize": 5, + "timeWindowUnit": "m", + "groupBy": "row", + "termField": "attributes.TargetGroup", + "termSize": 50, + "excludeHitsFromPreviousRun": true + } + } +} diff --git a/packages/aws_elb_otel/kibana/slo_template/aws_elb_otel-request-availability-99.5-Rolling30Days.json b/packages/aws_elb_otel/kibana/slo_template/aws_elb_otel-request-availability-99.5-Rolling30Days.json new file mode 100644 index 00000000000..7c0f2e6da20 --- /dev/null +++ b/packages/aws_elb_otel/kibana/slo_template/aws_elb_otel-request-availability-99.5-Rolling30Days.json @@ -0,0 +1,59 @@ +{ + "attributes": { + "name": "[AWS ELB OTel] Request availability 99.5% rolling 30 days", + "description": "Tracks Application Load Balancer request availability by keeping the combined ELB-generated and target-generated 5XX error rate below 0.5% in each 1-minute interval. Scoped per load balancer and region; aggregates all target groups behind the load balancer. 
A rolling 30-day target of 99.5% ensures users receive successful responses at the edge.", + "indicator": { + "type": "sli.metric.timeslice", + "params": { + "index": "metrics-*", + "filter": "data_stream.dataset: \"awscloudwatchreceiver.otel\" AND attributes.Namespace: \"AWS/ApplicationELB\"", + "metric": { + "metrics": [ + { + "name": "A", + "aggregation": "sum", + "field": "metrics.amazonaws.com/AWS/ApplicationELB/RequestCount", + "filter": "attributes.MetricName: \"RequestCount\" AND attributes.stat: \"Sum\"" + }, + { + "name": "B", + "aggregation": "sum", + "field": "metrics.amazonaws.com/AWS/ApplicationELB/HTTPCode_Target_5XX_Count", + "filter": "attributes.MetricName: \"HTTPCode_Target_5XX_Count\" AND attributes.stat: \"Sum\"" + }, + { + "name": "C", + "aggregation": "sum", + "field": "metrics.amazonaws.com/AWS/ApplicationELB/HTTPCode_ELB_5XX_Count", + "filter": "attributes.MetricName: \"HTTPCode_ELB_5XX_Count\" AND attributes.stat: \"Sum\"" + } + ], + "equation": "(B+C)/A", + "comparator": "LT", + "threshold": 0.005 + }, + "timestampField": "@timestamp" + } + }, + "budgetingMethod": "timeslices", + "timeWindow": { + "duration": "30d", + "type": "rolling" + }, + "objective": { + "target": 0.995, + "timesliceTarget": 0.95, + "timesliceWindow": "1m" + }, + "tags": [ + "aws_elb", + "otel" + ], + "groupBy": [ + "resource.attributes.cloud.region", + "attributes.LoadBalancer" + ] + }, + "id": "aws_elb_otel-request-availability-99.5-Rolling30Days", + "type": "slo_template" +} diff --git a/packages/aws_elb_otel/kibana/slo_template/aws_elb_otel-target-response-time-avg-99.5-Rolling30Days.json b/packages/aws_elb_otel/kibana/slo_template/aws_elb_otel-target-response-time-avg-99.5-Rolling30Days.json new file mode 100644 index 00000000000..5b894c7fddd --- /dev/null +++ b/packages/aws_elb_otel/kibana/slo_template/aws_elb_otel-target-response-time-avg-99.5-Rolling30Days.json @@ -0,0 +1,48 @@ +{ + "attributes": { + "name": "[AWS ELB OTel] Target response time average 99.5% 
rolling 30 days", + "description": "Tracks typical backend latency for Application Load Balancer target groups by keeping average TargetResponseTime below 1 second in each 1-minute interval. Scoped per target group, load balancer, and region. A rolling 30-day target of 99.5% ensures users experience responsive service even when errors are absent.", + "indicator": { + "type": "sli.metric.timeslice", + "params": { + "index": "metrics-*", + "filter": "data_stream.dataset: \"awscloudwatchreceiver.otel\" AND attributes.Namespace: \"AWS/ApplicationELB\" AND attributes.MetricName: \"TargetResponseTime\" AND attributes.stat: \"Average\"", + "metric": { + "metrics": [ + { + "name": "A", + "aggregation": "avg", + "field": "metrics.amazonaws.com/AWS/ApplicationELB/TargetResponseTime", + "filter": "attributes.MetricName: \"TargetResponseTime\" AND attributes.stat: \"Average\"" + } + ], + "equation": "A", + "comparator": "LT", + "threshold": 1 + }, + "timestampField": "@timestamp" + } + }, + "budgetingMethod": "timeslices", + "timeWindow": { + "duration": "30d", + "type": "rolling" + }, + "objective": { + "target": 0.995, + "timesliceTarget": 0.95, + "timesliceWindow": "1m" + }, + "tags": [ + "aws_elb", + "otel" + ], + "groupBy": [ + "resource.attributes.cloud.region", + "attributes.LoadBalancer", + "attributes.TargetGroup" + ] + }, + "id": "aws_elb_otel-target-response-time-avg-99.5-Rolling30Days", + "type": "slo_template" +} diff --git a/packages/aws_elb_otel/kibana/slo_template/aws_elb_otel-zero-unhealthy-hosts-99.5-Rolling30Days.json b/packages/aws_elb_otel/kibana/slo_template/aws_elb_otel-zero-unhealthy-hosts-99.5-Rolling30Days.json new file mode 100644 index 00000000000..68b956e0a84 --- /dev/null +++ b/packages/aws_elb_otel/kibana/slo_template/aws_elb_otel-zero-unhealthy-hosts-99.5-Rolling30Days.json @@ -0,0 +1,48 @@ +{ + "attributes": { + "name": "[AWS ELB OTel] Zero unhealthy hosts 99.5% rolling 30 days", + "description": "Tracks target-group capacity health by 
requiring UnHealthyHostCount to remain at zero in each 1-minute interval (Maximum statistic). Scoped per target group, load balancer, and region. A rolling 30-day target of 99.5% ensures backends pass health checks and the load balancer has capacity to serve traffic.", + "indicator": { + "type": "sli.metric.timeslice", + "params": { + "index": "metrics-*", + "filter": "data_stream.dataset: \"awscloudwatchreceiver.otel\" AND attributes.Namespace: \"AWS/ApplicationELB\" AND attributes.MetricName: \"UnHealthyHostCount\" AND attributes.stat: \"Maximum\"", + "metric": { + "metrics": [ + { + "name": "A", + "aggregation": "max", + "field": "metrics.amazonaws.com/AWS/ApplicationELB/UnHealthyHostCount", + "filter": "attributes.MetricName: \"UnHealthyHostCount\" AND attributes.stat: \"Maximum\"" + } + ], + "equation": "A", + "comparator": "LTE", + "threshold": 0 + }, + "timestampField": "@timestamp" + } + }, + "budgetingMethod": "timeslices", + "timeWindow": { + "duration": "30d", + "type": "rolling" + }, + "objective": { + "target": 0.995, + "timesliceTarget": 0.95, + "timesliceWindow": "1m" + }, + "tags": [ + "aws_elb", + "otel" + ], + "groupBy": [ + "resource.attributes.cloud.region", + "attributes.LoadBalancer", + "attributes.TargetGroup" + ] + }, + "id": "aws_elb_otel-zero-unhealthy-hosts-99.5-Rolling30Days", + "type": "slo_template" +}