feat: add time slice slo support #112

Open · wants to merge 1 commit into base: main
43 changes: 43 additions & 0 deletions examples/slo/catalog/time_slice_slo.yaml
@@ -0,0 +1,43 @@
time_slice-slo:
name: "(SLO) Test API p95 latency Checks"
type: time_slice
description: |
Test API p95 latency should be less than 1 second.
sli_specification:
time_slice:
query:
formula: "query1 + query2"
queries:
- data_source: "metrics"
name: "query1"
query: "p95:trace.express.request{env:production,resource_name:get_/api/test,service:my-service}"
- data_source: "metrics"
name: "query2"
query: "p95:trace.express.request{env:production,resource_name:get_/api/test,service:my-service}"
comparator: "<="
threshold: 1
query_interval_seconds: 300
thresholds:
- target: 99.0
timeframe: "30d"
warning: 99.5

error_budget_alert:
enabled: true
threshold: 80
timeframe: "30d"
priority: 2
message: "Alert on 80% of error budget consumed"

burn_rate_alert:
enabled: true
threshold: 3
timeframe: "30d"
long_window: "24h"
short_window: "120m"
priority: 2
message: "Burn rate is high enough to deplete error budget in one day"

tags:
service: my-service
env: production
53 changes: 51 additions & 2 deletions modules/slo/README.md
@@ -2,13 +2,13 @@

This module is responsible for creating Datadog [Service Level Objectives](https://docs.datadoghq.com/monitors/service_level_objectives/) and their related monitors and alerts.

The module can create metric-based SLOs (and the corresponding alerts) and monitor-based SLOs (and the corresponding monitors).
The module can create metric-based SLOs (and the corresponding alerts), monitor-based SLOs (and the corresponding monitors), and time-slice-based SLOs (and the corresponding alerts).

## Alerts

Datadog alerts for SLOs are terraformed through the monitor object.

An SLO can have many thresholds set, but a monitor can only have one. To work around this, the module creates a Datadog monitor for each threshold within an SLO.
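
The pattern looks roughly like the sketch below. The resource, variable, and query values here are illustrative only, not the module's actual names: each SLO's thresholds are flattened into one entry per (SLO, threshold) pair, and one `datadog_monitor` is created per entry.

```hcl
locals {
  # Hypothetical illustration: one monitor per threshold of each SLO.
  slo_threshold_pairs = {
    for pair in flatten([
      for slo in var.datadog_slos : [
        for t in slo.thresholds : {
          slo_name  = slo.name
          threshold = t
        }
      ]
    ]) : "${pair.slo_name}-${pair.threshold.timeframe}" => pair
  }
}

resource "datadog_monitor" "slo_threshold_alert" {
  for_each = local.slo_threshold_pairs

  type    = "slo alert"
  name    = format("(SLO Alert) %s - %s", each.value.slo_name, each.value.threshold.timeframe)
  message = "SLO threshold breached"

  # Hypothetical reference: assumes an SLO resource keyed by SLO name.
  query = "error_budget(\"${datadog_service_level_objective.slo[each.value.slo_name].id}\").over(\"${each.value.threshold.timeframe}\") > 100"

  monitor_thresholds {
    critical = 100
  }
}
```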

## Usage

@@ -101,8 +101,57 @@ monitor-slo:
api_version: null
```

Example of time-slice-based SLO:

```yaml
time_slice-slo:
name: "(SLO) Test API p95 latency Checks"
type: time_slice
description: |
Test API p95 latency should be less than 1 second.
sli_specification:
time_slice:
query:
formula: "query1 + query2"
queries:
- data_source: "metrics"
name: "query1"
query: "p95:trace.express.request{env:production,resource_name:get_/api/test,service:my-service}"
- data_source: "metrics"
name: "query2"
query: "p95:trace.express.request{env:production,resource_name:get_/api/test,service:my-service}"
comparator: "<="
threshold: 1
query_interval_seconds: 300
thresholds:
- target: 99.0
timeframe: "30d"
warning: 99.5

error_budget_alert:
enabled: true
threshold: 80
timeframe: "30d"
priority: 2
message: "Alert on 80% of error budget consumed"

burn_rate_alert:
enabled: true
threshold: 3
timeframe: "30d"
long_window: "24h"
short_window: "120m"
priority: 2
message: "Burn rate is high enough to deplete error budget in one day"

tags:
service: my-service
env: production
```
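
The definitions above are plain YAML, so one way to wire a catalog of them into the module is to decode the files and pass the result as `datadog_slos` (the module keys the SLOs by their `name` field internally). A minimal sketch, assuming a local `catalog/` directory and an illustrative module source path:

```hcl
locals {
  # Merge every SLO definition found in the catalog directory into one map.
  slo_catalog = merge([
    for f in fileset("${path.module}/catalog", "*.yaml") :
    yamldecode(file("${path.module}/catalog/${f}"))
  ]...)
}

module "slo" {
  source = "../../modules/slo" # illustrative path

  # Assumes var.datadog_slos accepts the decoded definitions as-is.
  datadog_slos = local.slo_catalog
}
```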

## References
- [Service Level Objectives](https://docs.datadoghq.com/monitors/service_level_objectives/)
- [Monitor-based SLOs](https://docs.datadoghq.com/monitors/service_level_objectives/monitor/)
- [Time-slice-based SLOs](https://docs.datadoghq.com/monitors/service_level_objectives/time_slice/)
- [Datadog Error Budget](https://docs.datadoghq.com/monitors/service_level_objectives/error_budget/)
- [Monitor-based SLO example](https://github.com/DataDog/terraform-provider-datadog/issues/667)
120 changes: 120 additions & 0 deletions modules/slo/time_slice_slo.tf
@@ -0,0 +1,120 @@
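# Time-slice SLOs: select the enabled definitions from the catalog and collect
# the error budget and burn rate alert definitions that are enabled on them.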
locals {
datadog_time_slice_slos = { for slo in var.datadog_slos : slo.name => slo if slo.type == "time_slice" && lookup(slo, "enabled", true) && local.enabled }

temp_datadog_time_slice_slo_error_budget_alerts = flatten([
for name, slo in local.datadog_time_slice_slos : {
slo = slo
slo_name = name
alert = slo.error_budget_alert
}
if lookup(slo, "error_budget_alert", null) != null && lookup(slo.error_budget_alert, "enabled", false)
])

temp_datadog_time_slice_slo_burn_rate_alerts = flatten([
for name, slo in local.datadog_time_slice_slos : {
slo = slo
slo_name = name
alert = slo.burn_rate_alert
}
if lookup(slo, "burn_rate_alert", null) != null && lookup(slo.burn_rate_alert, "enabled", false)
])

datadog_time_slice_slo_error_budget_alerts = { for alert in local.temp_datadog_time_slice_slo_error_budget_alerts : alert.slo_name => alert }
datadog_time_slice_slo_burn_rate_alerts = { for alert in local.temp_datadog_time_slice_slo_burn_rate_alerts : alert.slo_name => alert }
}

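# One time-slice SLO per definition: a formula over metric queries evaluated
# against the comparator and threshold for every query_interval_seconds window.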
resource "datadog_service_level_objective" "time_slice_slo" {
for_each = local.datadog_time_slice_slos

name = each.value.name
type = each.value.type
description = lookup(each.value, "description", null)
force_delete = lookup(each.value, "force_delete", true)
validate = lookup(each.value, "validate", false)

sli_specification {
time_slice {
query {
formula {
formula_expression = lookup(each.value.sli_specification.time_slice.query, "formula", null)
}

dynamic "query" {
for_each = lookup(each.value.sli_specification.time_slice.query, "queries", [])
content {
metric_query {
data_source = lookup(query.value, "data_source", null)
name = lookup(query.value, "name", null)
query = lookup(query.value, "query", null)
}
}
}
}
comparator = lookup(each.value.sli_specification.time_slice, "comparator", null)
threshold = lookup(each.value.sli_specification.time_slice, "threshold", null)
query_interval_seconds = lookup(each.value.sli_specification.time_slice, "query_interval_seconds", null)
}
}

dynamic "thresholds" {
for_each = each.value.thresholds
content {
target = lookup(thresholds.value, "target", null)
timeframe = lookup(thresholds.value, "timeframe", null)
warning = lookup(thresholds.value, "warning", null)
}
}

tags = [
for tagk, tagv in lookup(each.value, "tags", module.this.tags) : (tagv != null ? format("%s:%s", tagk, tagv) : tagk)
]
}

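# Error budget alert: fires once the configured percentage of the SLO's error
# budget has been consumed within the alert timeframe.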
resource "datadog_monitor" "time_slice_slo_error_budget_alert" {
for_each = local.datadog_time_slice_slo_error_budget_alerts

type = "slo alert"
name = format("(SLO Error Budget Alert) %s", each.value.slo.name)
message = format("%s %s", each.value.alert.message, local.alert_tags)

query = <<EOF
error_budget("${datadog_service_level_objective.time_slice_slo[each.value.slo_name].id}").over("${each.value.alert.timeframe}") > ${each.value.alert.threshold}
EOF

monitor_thresholds {
critical = lookup(each.value.alert, "threshold", null)
}

validate = false

include_tags = true
priority = lookup(each.value.alert, "priority", 5)

tags = [
for tagk, tagv in lookup(each.value.slo, "tags", {}) : (tagv != null ? format("%s:%s", tagk, tagv) : tagk)
]
}

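# Burn rate alert: fires when the error budget is being consumed faster than
# the configured burn rate over the long/short window pair.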
resource "datadog_monitor" "time_slice_slo_burn_rate_alert" {
for_each = local.datadog_time_slice_slo_burn_rate_alerts

type = "slo alert"
name = format("(SLO Burn Rate Alert) %s", each.value.slo.name)
message = format("%s %s", each.value.alert.message, local.alert_tags)

query = <<EOF
burn_rate("${datadog_service_level_objective.time_slice_slo[each.value.slo_name].id}").over("${each.value.alert.timeframe}").long_window("${each.value.alert.long_window}").short_window("${each.value.alert.short_window}") > ${each.value.alert.threshold}
EOF

monitor_thresholds {
critical = lookup(each.value.alert, "threshold", null)
}

validate = false
include_tags = true
priority = lookup(each.value.alert, "priority", 5)

tags = [
for tagk, tagv in lookup(each.value.slo, "tags", {}) : (tagv != null ? format("%s:%s", tagk, tagv) : tagk)
]
}