Skip to content

Commit e83ebe8

Browse files
authored
H-3139: Set up CloudWatch alerts terraform (#7719)
1 parent f60444e commit e83ebe8

File tree

10 files changed

+443
-21
lines changed

10 files changed

+443
-21
lines changed

infra/terraform/hash/.terraform.lock.hcl

Lines changed: 19 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

infra/terraform/hash/main.tf

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ module "observability" {
139139
grafana_secret_key = sensitive(data.vault_kv_secret_v2.secrets.data["grafana_secret_key"])
140140
vpc_zone_id = aws_route53_zone.vpc.zone_id
141141
amazon_trust_ca_bundle = local.amazon_trust_ca_bundle
142+
critical_alerts_topic_arn = module.critical_alerts.sns_topic_arn
142143
}
143144

144145

@@ -237,6 +238,14 @@ module "temporal_worker_integration_ecr" {
237238
ecr_name = "temporalworkerintegration"
238239
}
239240

241+
# Critical-severity alert topic, delivered to Slack through the sns_slack_alerts module.
module "critical_alerts" {
  source = "../modules/sns_slack_alerts"

  severity = "critical"
  prefix   = local.prefix

  # Webhook URL comes from the Vault KV secret and is wrapped in sensitive()
  # so it is not shown in plan output.
  slack_webhook_url = sensitive(data.vault_kv_secret_v2.secrets.data["slack_aws_webhook"])
}
248+
240249
module "application" {
241250
depends_on = [module.networking, module.postgres]
242251
providers = { cloudflare = cloudflare }
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# CloudWatch backup alerting for Grafana service health
2+
# This provides independent alerting when Grafana itself is down
3+
# Uses shared critical alerts SNS topic with Lambda transformer
4+
5+
# CloudWatch Alarm for Grafana ALB target health
6+
resource "aws_cloudwatch_metric_alarm" "grafana_service_down" {
  alarm_name        = "${var.prefix}-grafana-service-down"
  alarm_description = "CRITICAL: Grafana has no healthy targets. All Grafana-based alerting is offline."

  # ALB Target Group metrics (much more reliable than ECS)
  namespace   = "AWS/ApplicationELB"
  metric_name = "HealthyHostCount"

  dimensions = {
    TargetGroup  = aws_lb_target_group.grafana.arn_suffix
    LoadBalancer = var.external_load_balancer_arn_suffix
  }

  # Use Maximum rather than Average: the alarm must fire only when *no* data
  # point in the period reports a healthy target. With Average, samples of 0
  # and 1 mixed in one period average below 1 and breach the threshold even
  # though a target is still healthy, which contradicts the alarm description.
  statistic           = "Maximum"
  period              = 60 # 1 minute
  evaluation_periods  = 3  # Must be down for 3 minutes total
  threshold           = 1
  comparison_operator = "LessThanThreshold"

  # No data at all is treated the same as "no healthy hosts".
  treat_missing_data = "breaching"

  # Send to shared critical alerts topic (Lambda → Slack), both when the alarm
  # fires and when it recovers.
  alarm_actions = [var.critical_alerts_topic_arn]
  ok_actions    = [var.critical_alerts_topic_arn]

  tags = {
    Name     = "${var.prefix}-grafana-service-down-alarm"
    Severity = "CRITICAL"
    Purpose  = "Backup alerting for monitoring infrastructure failure"
  }
}

infra/terraform/hash/observability/grafana/variables.tf

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,3 +101,13 @@ variable "mimir_http_port" {
101101
variable "ssl_config" {
102102
description = "Shared SSL configuration for container certificates"
103103
}
104+
105+
# Used as the LoadBalancer dimension of the Grafana CloudWatch alarm.
variable "external_load_balancer_arn_suffix" {
  description = "ARN suffix of the external ALB for CloudWatch metrics"
  type        = string
}
109+
110+
# Target of the Grafana alarm's alarm/ok actions.
variable "critical_alerts_topic_arn" {
  description = "ARN of the critical alerts SNS topic"
  type        = string
}

infra/terraform/hash/observability/main.tf

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -201,27 +201,29 @@ module "mimir" {
201201

202202
# Grafana service for distributed tracing visualization
203203
module "grafana" {
204-
source = "./grafana"
205-
prefix = var.prefix
206-
cluster_arn = aws_ecs_cluster.observability.arn
207-
vpc = var.vpc
208-
subnets = var.subnets
209-
config_bucket = aws_s3_bucket.configs
210-
log_group_name = aws_cloudwatch_log_group.observability.name
211-
region = var.region
212-
root_url = cloudflare_record.cname_grafana_internal.hostname
213-
service_discovery_namespace_arn = aws_service_discovery_private_dns_namespace.observability.arn
214-
service_discovery_namespace_name = aws_service_discovery_private_dns_namespace.observability.name
215-
grafana_database_host = var.grafana_database_host
216-
grafana_database_port = var.grafana_database_port
217-
grafana_database_password = var.grafana_database_password
218-
grafana_secret_key = var.grafana_secret_key
219-
tempo_api_dns = module.tempo.api_dns
220-
tempo_api_port = module.tempo.api_port
221-
loki_http_dns = module.loki.http_dns
222-
loki_http_port = module.loki.http_port
223-
mimir_http_dns = module.mimir.http_dns
224-
mimir_http_port = module.mimir.http_port
204+
source = "./grafana"
205+
prefix = var.prefix
206+
cluster_arn = aws_ecs_cluster.observability.arn
207+
vpc = var.vpc
208+
subnets = var.subnets
209+
config_bucket = aws_s3_bucket.configs
210+
log_group_name = aws_cloudwatch_log_group.observability.name
211+
region = var.region
212+
root_url = cloudflare_record.cname_grafana_internal.hostname
213+
service_discovery_namespace_arn = aws_service_discovery_private_dns_namespace.observability.arn
214+
service_discovery_namespace_name = aws_service_discovery_private_dns_namespace.observability.name
215+
grafana_database_host = var.grafana_database_host
216+
grafana_database_port = var.grafana_database_port
217+
grafana_database_password = var.grafana_database_password
218+
grafana_secret_key = var.grafana_secret_key
219+
tempo_api_dns = module.tempo.api_dns
220+
tempo_api_port = module.tempo.api_port
221+
loki_http_dns = module.loki.http_dns
222+
loki_http_port = module.loki.http_port
223+
mimir_http_dns = module.mimir.http_dns
224+
mimir_http_port = module.mimir.http_port
225+
external_load_balancer_arn_suffix = aws_lb.observability_external.arn_suffix
226+
critical_alerts_topic_arn = var.critical_alerts_topic_arn
225227

226228
ssl_config = local.full_ca_ssl_config
227229
}

infra/terraform/hash/observability/variables.tf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,8 @@ variable "amazon_trust_ca_bundle" {
5858
type = string
5959
description = "Amazon Trust Services CA Bundle for SSL verification"
6060
}
61+
62+
# Passed through to the grafana submodule.
variable "critical_alerts_topic_arn" {
  description = "ARN of the critical alerts SNS topic (from infrastructure level)"
  type        = string
}
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# SNS to Slack Alerts Module
2+
# Provides clean, formatted Slack notifications via Lambda transformer
3+
4+
# SNS Topic for alerts
5+
resource "aws_sns_topic" "alerts" {
  # One topic per (prefix, severity) pair.
  name = "${var.prefix}-${var.severity}-alerts"

  tags = {
    Severity = var.severity
    Purpose  = "${var.severity} level alerts via Slack"
    Name     = "${var.prefix}-${var.severity}-alerts"
  }
}
14+
15+
# Lambda function to transform SNS messages to Slack format
16+
resource "aws_lambda_function" "sns_to_slack" {
  function_name = "${var.prefix}-${var.severity}-slack-alert"
  role          = aws_iam_role.lambda_execution.arn

  # Deployment package produced by the archive_file data source; the hash lets
  # Terraform detect when the packaged code has changed.
  filename         = data.archive_file.lambda_zip.output_path
  source_code_hash = data.archive_file.lambda_zip.output_base64sha256

  # "index.handler" refers to the index.py entry inside the zip.
  runtime = "python3.10"
  handler = "index.handler"
  timeout = 30

  # Configuration handed to the function at runtime.
  environment {
    variables = {
      ALERT_SEVERITY    = var.severity
      SLACK_WEBHOOK_URL = var.slack_webhook_url
    }
  }

  tags = {
    Severity = var.severity
    Purpose  = "Transform SNS alerts to Slack format"
    Name     = "${var.prefix}-${var.severity}-slack-alert"
  }
}
39+
40+
# Lambda source code package
41+
data "archive_file" "lambda_zip" {
  type = "zip"

  # slack_alert.py is stored in the archive under the name index.py.
  source {
    filename = "index.py"
    content  = file("${path.module}/slack_alert.py")
  }

  # NOTE(review): the archive is written to a hard-coded /tmp path — confirm
  # every environment that runs Terraform has a writable /tmp.
  output_path = "/tmp/${var.prefix}-${var.severity}-slack-alert.zip"
}
50+
51+
# IAM role for Lambda execution
52+
resource "aws_iam_role" "lambda_execution" {
  name = "${var.prefix}-${var.severity}-slack-alert-role"

  # Trust policy: only the Lambda service may assume this role.
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect    = "Allow"
        Principal = { Service = "lambda.amazonaws.com" }
        Action    = "sts:AssumeRole"
      },
    ]
  })
}
68+
69+
# Lambda execution policy
70+
resource "aws_iam_role_policy" "lambda_execution" {
  name = "${var.prefix}-${var.severity}-slack-alert-policy"
  role = aws_iam_role.lambda_execution.id

  # Least privilege: the function only needs to write its own logs, so the
  # statement is scoped to the function's log group (/aws/lambda/<function name>)
  # instead of every CloudWatch Logs resource in every account and region.
  # The name is rebuilt from the same variables as the function's
  # function_name so this policy does not depend on the function resource.
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents"
        ]
        Resource = [
          # The log group itself (CreateLogGroup).
          "arn:aws:logs:*:*:log-group:/aws/lambda/${var.prefix}-${var.severity}-slack-alert",
          # Log streams inside the group (CreateLogStream / PutLogEvents).
          "arn:aws:logs:*:*:log-group:/aws/lambda/${var.prefix}-${var.severity}-slack-alert:*"
        ]
      }
    ]
  })
}
89+
90+
# SNS Topic Subscription to Lambda
91+
resource "aws_sns_topic_subscription" "lambda" {
  # Messages published to the alerts topic are delivered to the Slack Lambda.
  protocol  = "lambda"
  topic_arn = aws_sns_topic.alerts.arn
  endpoint  = aws_lambda_function.sns_to_slack.arn
}
96+
97+
# Allow SNS to invoke Lambda
98+
resource "aws_lambda_permission" "allow_sns" {
  statement_id = "AllowExecutionFromSNS"

  # Who may do what: the SNS service may invoke the function...
  principal     = "sns.amazonaws.com"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.sns_to_slack.function_name

  # ...but only on behalf of this module's alerts topic.
  source_arn = aws_sns_topic.alerts.arn
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Exposes the topic ARN so callers can use it as an alarm action target.
output "sns_topic_arn" {
  value       = aws_sns_topic.alerts.arn
  description = "ARN of the SNS topic for alerts"
}

0 commit comments

Comments
 (0)