Skip to content

Commit 67decc1

Browse files
committed
Add FreeStorageSpace CloudWatch alert for RDS
1 parent 1671184 commit 67decc1

File tree

5 files changed

+80
-24
lines changed

infra/terraform/hash/main.tf

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -84,18 +84,19 @@ module "bastion" {
8484
}
8585

8686
module "postgres" {
87-
depends_on = [module.networking]
88-
source = "./postgres"
89-
prefix = local.prefix
90-
subnets = module.networking.snpriv
91-
vpc_id = module.networking.vpc.id
92-
vpc_cidr_block = module.networking.vpc.cidr_block
93-
env = local.env
94-
region = local.region
95-
pg_port = 5432
96-
instance_class = "db.t3.small"
97-
pg_superuser_username = "superuser"
98-
pg_superuser_password = sensitive(data.vault_kv_secret_v2.secrets.data["pg_superuser_password"])
87+
depends_on = [module.networking]
88+
source = "./postgres"
89+
prefix = local.prefix
90+
subnets = module.networking.snpriv
91+
vpc_id = module.networking.vpc.id
92+
vpc_cidr_block = module.networking.vpc.cidr_block
93+
env = local.env
94+
region = local.region
95+
pg_port = 5432
96+
instance_class = "db.t3.small"
97+
pg_superuser_username = "superuser"
98+
pg_superuser_password = sensitive(data.vault_kv_secret_v2.secrets.data["pg_superuser_password"])
99+
pagerduty_main_database_aws_integration_key = sensitive(data.vault_kv_secret_v2.secrets.data["pagerduty_main_database_aws_integration_key"])
99100
}
100101

101102
module "temporal" {
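(file path missing from this capture — a newly added file; it references var.prefix, var.pagerduty_main_database_aws_integration_key and aws_db_instance.postgres, so it presumably lives in infra/terraform/hash/postgres/)

Lines changed: 49 additions & 0 deletions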
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# CloudWatch alerts for RDS database monitoring
2+
# Provides alerting for critical database metrics
3+
4+
# SNS Topic for database alerts
5+
resource "aws_sns_topic" "database_alerts" {
6+
name = "${var.prefix}-database-alerts"
7+
8+
tags = {
9+
Name = "${var.prefix}-database-alerts"
10+
Service = "postgres"
11+
Purpose = "RDS database health and performance alerts"
12+
}
13+
}
14+
15+
# PagerDuty subscription for database alerts
16+
resource "aws_sns_topic_subscription" "pagerduty" {
17+
topic_arn = aws_sns_topic.database_alerts.arn
18+
protocol = "https"
19+
endpoint = "https://events.pagerduty.com/integration/${var.pagerduty_main_database_aws_integration_key}/enqueue"
20+
}
21+
22+
# CloudWatch Alarm for RDS free storage space
23+
resource "aws_cloudwatch_metric_alarm" "rds_free_storage_space" {
24+
alarm_name = "${var.prefix}-rds-free-storage-space-low"
25+
alarm_description = "CRITICAL: RDS instance ${aws_db_instance.postgres.identifier} has low free storage space. Storage: ${aws_db_instance.postgres.allocated_storage}GB total."
26+
27+
# RDS storage metrics
28+
metric_name = "FreeStorageSpace"
29+
namespace = "AWS/RDS"
30+
statistic = "Minimum"
31+
period = 300 # 5 minutes
32+
evaluation_periods = 2 # Must be low for 10 minutes total
33+
threshold = 10 * 1024 * 1024 * 1024 # 10GB in bytes
34+
comparison_operator = "LessThanThreshold"
35+
treat_missing_data = "breaching"
36+
37+
dimensions = {
38+
DBInstanceIdentifier = aws_db_instance.postgres.identifier
39+
}
40+
41+
alarm_actions = [aws_sns_topic.database_alerts.arn]
42+
ok_actions = [aws_sns_topic.database_alerts.arn]
43+
44+
tags = {
45+
Name = "${var.prefix}-rds-free-storage-space-low-alarm"
46+
Severity = "CRITICAL"
47+
Purpose = "Alert when RDS free storage space is critically low"
48+
}
49+
}

infra/terraform/hash/postgres/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,3 +50,9 @@ variable "pg_superuser_password" {
5050
sensitive = true
5151
description = "Password for the 'superuser' user in the Postgres instance"
5252
}
53+
54+
variable "pagerduty_main_database_aws_integration_key" {
55+
type = string
56+
sensitive = true
57+
description = "PagerDuty integration key for main database AWS alerts"
58+
}

infra/terraform/hash/temporal/task_definitions.tf

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ locals {
2525

2626
logConfiguration = {
2727
logDriver = "awslogs"
28-
options = {
28+
options = {
2929
"awslogs-create-group" = "true"
3030
"awslogs-group" = local.log_group_name
3131
"awslogs-stream-prefix" = local.migrate_service_name
@@ -34,13 +34,13 @@ locals {
3434
}
3535
},
3636
{
37-
essential = true
38-
name = "${local.prefix}-${local.temporal_service_name}"
39-
image = "temporalio/server:${var.temporal_version}"
40-
cpu = 0 # let ECS divvy up the available CPU
41-
dependsOn = [{ condition = "SUCCESS", containerName = "${local.prefix}-${local.migrate_service_name}" }]
37+
essential = true
38+
name = "${local.prefix}-${local.temporal_service_name}"
39+
image = "temporalio/server:${var.temporal_version}"
40+
cpu = 0 # let ECS divvy up the available CPU
41+
dependsOn = [{ condition = "SUCCESS", containerName = "${local.prefix}-${local.migrate_service_name}" }]
4242
healthCheck = {
43-
command = [
43+
command = [
4444
"CMD", "/bin/sh", "-c", "temporal operator cluster health --address $(hostname):7233 | grep -q SERVING"
4545
]
4646
startPeriod = 10
@@ -67,7 +67,7 @@ locals {
6767

6868
logConfiguration = {
6969
logDriver = "awslogs"
70-
options = {
70+
options = {
7171
"awslogs-create-group" = "true"
7272
"awslogs-group" = local.log_group_name
7373
"awslogs-stream-prefix" = local.temporal_service_name
@@ -103,7 +103,7 @@ locals {
103103

104104
logConfiguration = {
105105
logDriver = "awslogs"
106-
options = {
106+
options = {
107107
"awslogs-create-group" = "true"
108108
"awslogs-group" = local.log_group_name
109109
"awslogs-stream-prefix" = local.setup_service_name
@@ -116,7 +116,7 @@ locals {
116116

117117
essential = false
118118
name = "${local.prefix}-${local.ui_service_name}"
119-
image = "temporalio/ui:${var.temporal_ui_version}"
119+
image = "temporalio/ui:${var.temporal_ui_version}"
120120
cpu = 0 # let ECS divvy up the available CPU
121121
dependsOn = [
122122
{ condition = "HEALTHY", containerName = "${local.prefix}-${local.temporal_service_name}" },
@@ -145,7 +145,7 @@ locals {
145145

146146
logConfiguration = {
147147
logDriver = "awslogs"
148-
options = {
148+
options = {
149149
"awslogs-create-group" = "true"
150150
"awslogs-group" = local.log_group_name
151151
"awslogs-stream-prefix" = local.ui_service_name

infra/terraform/hash/tls.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ resource "cloudflare_dns_record" "caa_hash_ai" {
4747
value = "amazon.com"
4848
}
4949

50-
ttl = 1
50+
ttl = 1
5151

5252
tags = ["terraform"]
5353
}

0 commit comments

Comments
 (0)