diff --git a/examples/slo/catalog/time_slice_slo.yaml b/examples/slo/catalog/time_slice_slo.yaml
new file mode 100644
index 0000000..540e6ad
--- /dev/null
+++ b/examples/slo/catalog/time_slice_slo.yaml
@@ -0,0 +1,48 @@
+# Example catalog entry for a time-slice SLO (type: time_slice) with both optional alerts enabled.
+time_slice-slo:
+  name: "(SLO) Test API p95 latency Checks"
+  type: time_slice
+  description: |
+    Test API p95 latency should be less than 1 second.
+  sli_specification:
+    time_slice:
+      query:
+        # NOTE(review): query1 and query2 are identical, so this formula evaluates to twice
+        # the p95 latency, which does not match the description -- confirm this is intended.
+        formula: "query1 + query2"
+        queries:
+          - data_source: "metrics"
+            name: "query1"
+            query: "p95:trace.express.request{env:production,resource_name:get_/api/test,service:my-service}"
+          - data_source: "metrics"
+            name: "query2"
+            query: "p95:trace.express.request{env:production,resource_name:get_/api/test,service:my-service}"
+      comparator: "<="
+      threshold: 1
+      query_interval_seconds: 300
+  thresholds:
+    - target: 99.0
+      timeframe: "30d"
+      warning: 99.5
+
+  # Optional: when enabled, an "slo alert" monitor is created on the consumed error budget.
+  error_budget_alert:
+    enabled: true
+    threshold: 80
+    timeframe: "30d"
+    priority: 2
+    message: "Alert on 80% of error budget consumed"
+
+  # Optional: when enabled, an "slo alert" monitor is created on the burn rate.
+  burn_rate_alert:
+    enabled: true
+    threshold: 3
+    timeframe: "30d"
+    long_window: "24h"
+    short_window: "120m"
+    priority: 2
+    message: "Burn rate is high enough to deplete error budget in one day"
+
+  tags:
+    service: my-service
+    env: production
diff --git a/modules/slo/README.md b/modules/slo/README.md
index c195c83..f056ff5 100644
--- a/modules/slo/README.md
+++ b/modules/slo/README.md
@@ -2,13 +2,13 @@
 
 This module is responsible for creating Datadog [Service Level Objectives](https://docs.datadoghq.com/monitors/service_level_objectives/) and their related monitors and alerts.
 
-The module can create metric-based SLOs (and the corresponding alerts) and monitor-based SLOs (and the corresponding monitors).
+The module can create metric-based SLOs (and the corresponding alerts), monitor-based SLOs (and the corresponding monitors) and time-slice-based SLOs (and the corresponding alerts).
## Alerts Datadog alerts for SLOs are terraformed through the monitor object. -An SLO can have many thresholds set, but a monitor can only have one. In order to get around this, the module creates Datadog monitors for each threshold within an SLO. +An SLO can have many thresholds set, but a monitor can only have one. In order to get around this, the module creates Datadog monitors for each threshold within an SLO. ## Usage @@ -101,8 +101,57 @@ monitor-slo: api_version: null ``` +Example of time-slice-based SLO: + +```yaml +time_slice-slo: + name: "(SLO) Test API p95 latency Checks" + type: time_slice + description: | + Test API p95 latency should be less than 1 second. + sli_specification: + time_slice: + query: + formula: "query1 + query2" + queries: + - data_source: "metrics" + name: "query1" + query: "p95:trace.express.request{env:production,resource_name:get_/api/test,service:my-service}" + - data_source: "metrics" + name: "query2" + query: "p95:trace.express.request{env:production,resource_name:get_/api/test,service:my-service}" + comparator: "<=" + threshold: 1 + query_interval_seconds: 300 + thresholds: + - target: 99.0 + timeframe: "30d" + warning: 99.5 + + error_budget_alert: + enabled: true + threshold: 80 + timeframe: "30d" + priority: 2 + message: "Alert on 80% of error budget consumed" + + burn_rate_alert: + enabled: true + threshold: 3 + timeframe: "30d" + long_window: "24h" + short_window: "120m" + priority: 2 + message: "Burn rate is high enough to deplete error budget in one day" + + tags: + service: my-service + env: production +``` + ## References - [Service Level Objectives](https://docs.datadoghq.com/monitors/service_level_objectives/) - [Monitor-based SLOs](https://docs.datadoghq.com/monitors/service_level_objectives/monitor/) + - [Time-slice-based SLOs](https://docs.datadoghq.com/monitors/service_level_objectives/time_slice/) - [Datadog Error Budget](https://docs.datadoghq.com/monitors/service_level_objectives/error_budget/) - [Monitor-based 
SLO example](https://github.com/DataDog/terraform-provider-datadog/issues/667)
diff --git a/modules/slo/time_slice_slo.tf b/modules/slo/time_slice_slo.tf
new file mode 100644
index 0000000..376ba3c
--- /dev/null
+++ b/modules/slo/time_slice_slo.tf
@@ -0,0 +1,135 @@
+# Time-slice SLOs (type "time_slice") and their optional error-budget / burn-rate alert monitors.
+locals {
+  # Enabled SLO definitions of type "time_slice", keyed by SLO name.
+  datadog_time_slice_slos = { for slo in var.datadog_slos : slo.name => slo if slo.type == "time_slice" && lookup(slo, "enabled", true) && local.enabled }
+
+  # One entry per SLO whose `error_budget_alert.enabled` is true.
+  # try() replaces `lookup(...) != null && lookup(...)`: on Terraform releases without
+  # short-circuit evaluation both operands of `&&` are evaluated, so the "guarded" attribute
+  # access still fails when an SLO omits the alert block (or sets it to null).
+  temp_datadog_time_slice_slo_error_budget_alerts = flatten([
+    for name, slo in local.datadog_time_slice_slos : {
+      slo      = slo
+      slo_name = name
+      alert    = slo.error_budget_alert
+    }
+    if try(slo.error_budget_alert.enabled, false)
+  ])
+
+  # One entry per SLO whose `burn_rate_alert.enabled` is true (try() used for the same reason).
+  temp_datadog_time_slice_slo_burn_rate_alerts = flatten([
+    for name, slo in local.datadog_time_slice_slos : {
+      slo      = slo
+      slo_name = name
+      alert    = slo.burn_rate_alert
+    }
+    if try(slo.burn_rate_alert.enabled, false)
+  ])
+
+  # Alert lists re-keyed by SLO name so each monitor can look up its SLO via `each.key`.
+  datadog_time_slice_slo_error_budget_alerts = { for alert in local.temp_datadog_time_slice_slo_error_budget_alerts : alert.slo_name => alert }
+  datadog_time_slice_slo_burn_rate_alerts    = { for alert in local.temp_datadog_time_slice_slo_burn_rate_alerts : alert.slo_name => alert }
+}
+
+# One Datadog SLO per enabled time-slice definition.
+resource "datadog_service_level_objective" "time_slice_slo" {
+  for_each = local.datadog_time_slice_slos
+
+  name         = each.value.name
+  type         = each.value.type
+  description  = lookup(each.value, "description", null)
+  force_delete = lookup(each.value, "force_delete", true)
+  validate     = lookup(each.value, "validate", false)
+
+  sli_specification {
+    time_slice {
+      query {
+        formula {
+          formula_expression = lookup(each.value.sli_specification.time_slice.query, "formula", null)
+        }
+
+        # One `query` block per entry of `queries`; no blocks are emitted when the key is absent.
+        dynamic "query" {
+          for_each = lookup(each.value.sli_specification.time_slice.query, "queries", [])
+          content {
+            metric_query {
+              data_source = lookup(query.value, "data_source", null)
+              name        = lookup(query.value, "name", null)
+              query       = lookup(query.value, "query", null)
+            }
+          }
+        }
+      }
+      comparator             = lookup(each.value.sli_specification.time_slice, "comparator", null)
+      threshold              = lookup(each.value.sli_specification.time_slice, "threshold", null)
+      query_interval_seconds = lookup(each.value.sli_specification.time_slice, "query_interval_seconds", null)
+    }
+  }
+
+  dynamic "thresholds" {
+    for_each = each.value.thresholds
+    content {
+      target    = lookup(thresholds.value, "target", null)
+      timeframe = lookup(thresholds.value, "timeframe", null)
+      warning   = lookup(thresholds.value, "warning", null)
+    }
+  }
+
+  # Tags render as "key:value", or bare "key" when the value is null; default is the module tags.
+  tags = [
+    for tagk, tagv in lookup(each.value, "tags", module.this.tags) : (tagv != null ? format("%s:%s", tagk, tagv) : tagk)
+  ]
+}
+
+# "slo alert" monitor that fires when the error budget consumed over `timeframe` exceeds `threshold`.
+resource "datadog_monitor" "time_slice_slo_error_budget_alert" {
+  for_each = local.datadog_time_slice_slo_error_budget_alerts
+
+  type    = "slo alert"
+  name    = format("(SLO Error Budget Alert) %s", each.value.slo.name)
+  message = format("%s %s", each.value.alert.message, local.alert_tags)
+
+  query = <<EOF
+    error_budget("${datadog_service_level_objective.time_slice_slo[each.key].id}").over("${each.value.alert.timeframe}") > ${each.value.alert.threshold}
+  EOF
+
+  monitor_thresholds {
+    critical = lookup(each.value.alert, "threshold", null)
+  }
+
+  validate = false
+
+  include_tags = true
+  priority     = lookup(each.value.alert, "priority", 5)
+
+  # NOTE(review): defaults to no tags, whereas the SLO resource defaults to module.this.tags -- confirm.
+  tags = [
+    for tagk, tagv in lookup(each.value.slo, "tags", {}) : (tagv != null ? format("%s:%s", tagk, tagv) : tagk)
+  ]
+}
+
+# "slo alert" monitor that fires when the burn rate over the long/short windows exceeds `threshold`.
+resource "datadog_monitor" "time_slice_slo_burn_rate_alert" {
+  for_each = local.datadog_time_slice_slo_burn_rate_alerts
+
+  type    = "slo alert"
+  name    = format("(SLO Burn Rate Alert) %s", each.value.slo.name)
+  message = format("%s %s", each.value.alert.message, local.alert_tags)
+
+  query = <<EOF
+    burn_rate("${datadog_service_level_objective.time_slice_slo[each.key].id}").over("${each.value.alert.timeframe}").long_window("${each.value.alert.long_window}").short_window("${each.value.alert.short_window}") > ${each.value.alert.threshold}
+  EOF
+
+  monitor_thresholds {
+    critical = lookup(each.value.alert, "threshold", null)
+  }
+
+  validate     = false
+  include_tags = true
+  priority     = lookup(each.value.alert, "priority", 5)
+
+  # NOTE(review): defaults to no tags, whereas the SLO resource defaults to module.this.tags -- confirm.
+  tags = [
+    for tagk, tagv in lookup(each.value.slo, "tags", {}) : (tagv != null ? format("%s:%s", tagk, tagv) : tagk)
+  ]
+}