From cde687389721f9eb966a56b0a7637167b0c3f466 Mon Sep 17 00:00:00 2001
From: SakshiKekre
Date: Thu, 19 Jun 2025 11:44:36 -0700
Subject: [PATCH 1/3] Added alert policy in terraform for workflow Failures

---
 .../modules/fastapi_cloudrun/monitoring.tf    | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
index 6ccd8ae4..4d12377e 100644
--- a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
+++ b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
@@ -117,3 +117,54 @@ resource "google_monitoring_alert_policy" "limit_alert" {
   }
 }
 
+resource "google_monitoring_alert_policy" "simulation_workflow_failure" {
+  display_name = "Simulation Workflow Failures (for tf test)"
+  combiner     = "OR"
+
+  conditions {
+    display_name = "Simulation Workflow Failed in Last 5 Minutes"
+
+    condition_threshold {
+      filter = <<-EOT
+        metric.type="workflows.googleapis.com/workflow/execution_count"
+        AND resource.type="cloud_workflow"
+        AND metric.label."status"="FAILED"
+        AND resource.label."workflow_name"="policyengine-api-simulation"
+      EOT
+
+      duration        = "300s" # 5-minute window
+      comparison      = "COMPARISON_GT"
+      threshold_value = 0
+
+      aggregations {
+        alignment_period     = "300s"
+        per_series_aligner   = "ALIGN_RATE"
+        cross_series_reducer = "REDUCE_SUM"
+      }
+    }
+  }
+
+  notification_channels = local.notification_channels
+
+  documentation {
+    content = <<-EOT
+      🚨 *Simulation Workflow Failure Alert*
+
+      One or more executions of the simulation workflow failed within the last 5 minutes.
+
+      *Steps:*
+      - Check Cloud Workflows logs
+      - Confirm input data
+      - Review recent deploys: [Latest commit](${var.commit_url})
+    EOT
+    mime_type = "text/markdown"
+  }
+
+  user_labels = {
+    service = "policyengine-api"
+    type    = "workflow-alert"
+  }
+
+  enabled = true
+}
+
From 8b2b1c5b7b51ade3b1c12d066958e3bce214615c Mon Sep 17 00:00:00 2001
From: SakshiKekre
Date: Fri, 20 Jun 2025 00:20:31 -0700
Subject: [PATCH 2/3] block needs project id

---
 .../modules/fastapi_cloudrun/monitoring.tf                    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
index 4d12377e..537eba17 100644
--- a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
+++ b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
@@ -117,7 +117,8 @@ resource "google_monitoring_alert_policy" "limit_alert" {
   }
 }
 
-resource "google_monitoring_alert_policy" "simulation_workflow_failure" {
+resource "google_monitoring_alert_policy" "simulation_workflow_failure_alert" {
+  project      = var.project_id
   display_name = "Simulation Workflow Failures (for tf test)"
   combiner     = "OR"
 
From e6a6262c28e5d4593e0e3ebd99d386e93dea0f63 Mon Sep 17 00:00:00 2001
From: SakshiKekre
Date: Mon, 23 Jun 2025 07:00:52 -0700
Subject: [PATCH 3/3] Modified policy to use PromQL, added URL to workflow in policy documentation

---
Reviewer notes (this area is ignored when the patch is applied):
  * The PromQL threshold is "> 0" rather than "> 1" so that a single failed
    execution raises the alert. This matches the "One or more executions"
    wording in the documentation block and the threshold_value = 0 /
    COMPARISON_GT condition that this patch replaces.
  * NOTE(review): the PromQL query has no per-workflow filter, whereas the
    condition_threshold it replaces restricted the alert to
    workflow_name="policyengine-api-simulation". As written it matches FAILED
    executions of every workflow in the project - confirm whether a workflow
    filter should be added, and with which label/value.

 .../modules/fastapi_cloudrun/monitoring.tf    | 32 ++++++-------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
index 537eba17..a813808d 100644
--- a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
+++ b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
@@ -119,28 +119,19 @@ resource "google_monitoring_alert_policy" "limit_alert" {
 
 resource "google_monitoring_alert_policy" "simulation_workflow_failure_alert" {
   project      = var.project_id
-  display_name = "Simulation Workflow Failures (for tf test)"
+  display_name = "Simulation Workflow Failures"
   combiner     = "OR"
 
   conditions {
     display_name = "Simulation Workflow Failed in Last 5 Minutes"
-
-    condition_threshold {
-      filter = <<-EOT
-        metric.type="workflows.googleapis.com/workflow/execution_count"
-        AND resource.type="cloud_workflow"
-        AND metric.label."status"="FAILED"
-        AND resource.label."workflow_name"="policyengine-api-simulation"
+    condition_prometheus_query_language {
+      query = <<-EOT
+        increase(workflows_googleapis_com:finished_execution_count{monitored_resource="workflows.googleapis.com/Workflow",status="FAILED"}[5m]) > 0
       EOT
-
-      duration        = "300s" # 5-minute window
-      comparison      = "COMPARISON_GT"
-      threshold_value = 0
-
-      aggregations {
-        alignment_period     = "300s"
-        per_series_aligner   = "ALIGN_RATE"
-        cross_series_reducer = "REDUCE_SUM"
+      duration            = "300s"
+      evaluation_interval = "60s"
+      labels = {
+        severity = "critical"
       }
     }
   }
@@ -154,18 +145,13 @@ resource "google_monitoring_alert_policy" "simulation_workflow_failure_alert" {
       One or more executions of the simulation workflow failed within the last 5 minutes.
 
       *Steps:*
-      - Check Cloud Workflows logs
+      - Check Cloud Workflows logs: [View Details](https://console.cloud.google.com/workflows/workflow/${var.region}/simulation-workflow/metrics?project=${var.project_id})
       - Confirm input data
       - Review recent deploys: [Latest commit](${var.commit_url})
     EOT
     mime_type = "text/markdown"
   }
 
-  user_labels = {
-    service = "policyengine-api"
-    type    = "workflow-alert"
-  }
-
   enabled = true
 }
 