From f3e80d9f4db8aa65334c16da2fc27a869eeba63e Mon Sep 17 00:00:00 2001
From: Your Name <user@example.com>
Date: Fri, 16 Jan 2026 13:32:41 -0500
Subject: [PATCH] adding monitoring

---
 .github/copilot-instructions.md               | 131 +++++
 .gitignore                                    |   2 +
 .terraform_commits                            |  12 +
 .vscode/settings.json                         |  11 +
 DOCUMENTATION_REVIEW_2026-01-15.md            | 159 ++++++
 MONITORING_DEPLOYMENT_SUMMARY.md              | 310 +++++++++++
 MONITORING_IMPLEMENTATION_PLAN.md             | 363 +++++++++++++
 README.md                                     |  31 +-
 RUNBOOK.md                                    | 500 ++++++++++++++++++
 aws_ecs_cluster_capacity_providers.fargate    |  42 ++
 default.auto.tfvars                           |   3 +
 example.auto.tfvars                           |  90 ----
 example.tfvars.template                       | 111 ++++
 lambda/requirements_pat.txt                   |   4 +
 lambda/token_refresh_pat.py                   | 191 +++++++
 lambda_token_refresh.tf                       |  53 +-
 lambda_token_refresh.tf.tmp                   | 185 +++++++
 monitoring.tf                                 | 227 ++++++++
 providers.tf                                  |  18 +-
 terraform_data_dirs/csvd/environment          |   1 -
 terraform_data_dirs/csvd/modules/ecr-clone    |   1 -
 .../csvd/modules/github-runner                |   1 -
 terraform_data_dirs/csvd/modules/modules.json |   1 -
 .../hashicorp/aws/5.70.0/linux_amd64          |   1 -
 .../hashicorp/dns/3.4.2/linux_amd64           |   1 -
 .../hashicorp/dns/3.4.3/linux_amd64           |   1 -
 .../hashicorp/github/6.3.1/linux_amd64        |   1 -
 .../hashicorp/github/6.6.0/linux_amd64        |   1 -
 .../hashicorp/null/3.2.3/linux_amd64          |   1 -
 .../hashicorp/null/3.2.4/linux_amd64          |   1 -
 .../hashicorp/random/3.6.3/linux_amd64        |   1 -
 .../hashicorp/random/3.7.2/linux_amd64        |   1 -
 .../integrations/github/5.45.0/linux_amd64    |   1 -
 .../integrations/github/6.6.0/linux_amd64     |   1 -
 .../sct-engineering/modules/github-runner     |   1 -
 .../sct-engineering/modules/modules.json      |   1 -
 .../hashicorp/aws/5.70.0/linux_amd64          |   1 -
 .../hashicorp/dns/3.4.2/linux_amd64           |   1 -
 .../hashicorp/github/6.3.1/linux_amd64        |   1 -
 .../hashicorp/local/2.5.2/linux_amd64         |   1 -
 .../hashicorp/null/3.2.3/linux_amd64          |   1 -
 .../hashicorp/random/3.6.3/linux_amd64        |   1 -
 variables.tf                                  |  55 +-
 43 files changed, 2375 insertions(+), 146 deletions(-)
 create mode 100644 .github/copilot-instructions.md
 create mode 100644 .vscode/settings.json
 create mode 100644 DOCUMENTATION_REVIEW_2026-01-15.md
 create mode 100644 MONITORING_DEPLOYMENT_SUMMARY.md
 create mode 100644 MONITORING_IMPLEMENTATION_PLAN.md
 create mode 100644 RUNBOOK.md
 create mode 100644 aws_ecs_cluster_capacity_providers.fargate
 delete mode 100644 example.auto.tfvars
 create mode 100644 example.tfvars.template
 create mode 100644 lambda/requirements_pat.txt
 create mode 100644 lambda/token_refresh_pat.py
 create mode 100644 lambda_token_refresh.tf.tmp
 create mode 100644 monitoring.tf
 delete mode 100644 terraform_data_dirs/csvd/environment
 delete mode 160000 terraform_data_dirs/csvd/modules/ecr-clone
 delete mode 160000 terraform_data_dirs/csvd/modules/github-runner
 delete mode 100644 terraform_data_dirs/csvd/modules/modules.json
 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64
 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64
 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64
 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64
 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64
 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64
 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64
 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64
 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64
 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/5.45.0/linux_amd64
 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/6.6.0/linux_amd64
 delete mode 160000 terraform_data_dirs/sct-engineering/modules/github-runner
 delete mode 100644 terraform_data_dirs/sct-engineering/modules/modules.json
 delete mode 120000 terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64
 delete mode 120000 terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64
 delete mode 120000 terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64
 delete mode 120000 terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/local/2.5.2/linux_amd64
 delete mode 120000 terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64
 delete mode 120000 terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
new file mode 100644
index 0000000..b176fdf
--- /dev/null
+++ b/.github/copilot-instructions.md
@@ -0,0 +1,131 @@
+# GitHub Copilot Instructions for ghe-runner Repository
+
+## General Guidelines
+
+### Terraform Commands
+- **ALWAYS use the `tf` alias instead of `terraform` command**
+- The `tf` alias performs important behind-the-scenes operations required for this environment
+- Examples:
+  - ✅ `tf plan` (correct)
+  - ✅ `tf apply` (correct)
+  - ❌ `terraform plan` (incorrect)
+  - ❌ `terraform apply` (incorrect)
+
+### Terminal Commands
+- When running terminal commands, always use the `run_in_terminal` tool
+- Set `isBackground=false` for commands that need output
+- Set `isBackground=true` for long-running processes (servers, watches)
+
+### AWS Authentication
+- AWS credentials may expire during sessions
+- User will refresh credentials manually using `awscreds` command
+- Do not attempt to source aws credentials automatically
+
+### GitHub Authentication
+- This project uses **token-only authentication** (GITHUB_TOKEN environment variable)
+- GitHub App authentication is optional (variables have default = null)
+- Never require GitHub App variables unless explicitly requested
+
+## Project-Specific Context
+
+### Infrastructure
+- **Region**: us-gov-west-1 (AWS GovCloud)
+- **ECS Cluster**: ecs-ghe-runners-us-gov-west-1
+- **GitHub Enterprise**: github.e.it.census.gov
+- **Organization**: SCT-Engineering
+- **Proxy**: proxy.tco.census.gov:3128 (required for outbound traffic)
+
+### Critical Understanding: Persistent Runners & Token Lifecycle
+⚠️ **IMPORTANT**: Runners are **persistent, long-running containers** (not ephemeral):
+- Runners run continuously 24/7, handling multiple jobs over their lifetime
+- Registration token is used **only during container startup** (one-time registration)
+- Lambda refreshes token every 30 min to ensure valid token for ECS task restarts
+- **Deadlock risk**: If all runners die AND token expires, ECS cannot auto-recover
+  - Running tasks don't need token refresh (already registered)
+  - Failed tasks being restarted by ECS need valid token from Secrets Manager
+  - This is why monitoring and quick response are essential
+
+### File Conventions
+- Main configuration: `default.auto.tfvars`
+- Example template: `example.tfvars.template` (do NOT rename to `.auto.tfvars`)
+- Monitoring: `monitoring.tf`
+- Emergency procedures: `RUNBOOK.md`
+
+### Terraform Modules
+- Primary module: `HappyPathway/github-runner/ecs`
+- Optional ECR clone: `HappyPathway/ecr-clone/aws`
+- Module outputs: Check `outputs.tf` before referencing module attributes
+
+## Code Editing Guidelines
+
+### When Making File Changes
+1. Always read sufficient context before editing (5+ lines before/after)
+2. Use `replace_string_in_file` with exact matches including whitespace
+3. Never use placeholder comments like `...existing code...` in edits
+4. Verify changes with `tf plan` after modifications
+
+### When Implementing Features
+1. Create a todo list for multi-step work
+2. Mark items in-progress before starting
+3. Mark items completed immediately after finishing
+4. Update the list as new tasks are discovered
+
+## Monitoring & Alerting
+
+### Alert Configuration
+- Alert email: david.j.arnold.jr@census.gov
+- SNS topic: github-runner-critical-alerts
+- Critical alarms: runners < 50% capacity, all runners down
+- Dashboard: CloudWatch dashboard for visibility
+
+### Emergency Response
+- Refer to `RUNBOOK.md` for incident procedures
+- Three critical scenarios documented:
+  1. Lambda token refresh failing
+  2. Runners at 50% capacity
+  3. All runners down (EMERGENCY)
+
+## Testing & Validation
+
+### Before Committing
+1. Run `tf plan` to validate configuration
+2. Check for errors with `get_errors` tool if available
+3. Verify outputs are as expected
+4. Review changes in context of overall system
+
+### After Deployment
+1. Verify SNS email subscription confirmation
+2. Check CloudWatch alarms are configured
+3. Test dashboard accessibility
+4. Document any lessons learned
+
+## Common Issues & Solutions
+
+### "Invalid AWS Region" Error
+- Ensure `providers.tf` has `region = "us-gov-west-1"`
+
+### "Unsupported attribute" on Module Outputs
+- Check `outputs.tf` for available module outputs
+- Use `var.repo_org` for service name, not `module.github-runner.service_name`
+
+### Image Pull Failures
+- Enable ECR clone: `enable_ecr_clone = true`
+- Verify image version exists in source registry
+
+### Token Expiration Risk
+- Monitor Lambda execution via CloudWatch Logs
+- Check token age in Secrets Manager
+- Manual refresh available via Lambda invoke
+
+## Resources
+
+- [Monitoring Plan](../MONITORING_IMPLEMENTATION_PLAN.md)
+- [Emergency Runbook](../RUNBOOK.md)
+- [GitHub App Setup](./GITHUB_APP_SETUP.md)
+- [AWS Permissions](./AWS_PERMISSIONS.md)
+- [Security Review](./SECURITY_REVIEW.md)
+
+---
+
+**Last Updated**: January 15, 2026  
+**Maintainer**: CSVD Team
diff --git a/.gitignore b/.gitignore
index 2516318..b28d3f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,3 +54,5 @@ aws-image-pipeline/aws-image-pipeline
 automation-repos/automation-repos
 ghe-runners/ghe-runners
 docker-image-pipeline/docker-image-pipeline
+
+terraform_data_dirs
\ No newline at end of file
diff --git a/.terraform_commits b/.terraform_commits
index a239210..7f3ad12 100644
--- a/.terraform_commits
+++ b/.terraform_commits
@@ -88,5 +88,17 @@
     "commit_message": "Add GitHub Actions Runner Setup Guide to README.md",
     "author": "Your Name",
     "timestamp": "2025-10-31T13:13:21.490997"
+  },
+  {
+    "commit_hash": "fa186792281de61333a09ed8477d865d96cb3ae8",
+    "commit_message": "feat(lambda): Implement GitHub Actions runner token refresh Lambda function\n\n- Added `token_refresh.py` to handle the token refresh logic.\n- Integrated AWS Secrets Manager for storing the GitHub registration token.\n- Utilized GitHub App authentication for secure API access.\n- Scheduled Lambda function to run every 30 minutes using CloudWatch Events.\n- Created necessary IAM roles and policies for Lambda execution.\n\nchore(lambda): Add requirements for token refresh Lambda\n\n- Added `requirements.txt` with dependencies: PyJWT and cryptography.\n\nfeat(terraform): Configure Lambda function and CloudWatch Events\n\n- Created Terraform configuration for the Lambda function and its dependencies.\n- Set up CloudWatch Event Rule to trigger the Lambda function every 30 minutes.\n- Configured IAM roles and policies for Lambda execution and Secrets Manager access.\n\ndocs(scripts): Add monitoring tools for GitHub Runner ECS services\n\n- Created monitoring scripts to track ECS service health and CloudWatch logs.\n- Added README with usage instructions and troubleshooting tips.\n- Implemented a continuous monitoring script using rich for better output formatting.\n\nchore(scripts): Add requirements for monitoring scripts\n\n- Added `requirements.txt` for monitoring scripts with dependencies: boto3, botocore, and rich.\n\nfix(scripts): Update monitoring script to use Terraform outputs\n\n- Modified `monitor_runners.py` to fetch necessary configuration from Terraform outputs.\n- Improved error handling and logging for better visibility.\n\nfeat(varfiles): Add configuration files for Terraform modules\n\n- Created JSON and TFVars files for managing Terraform workspace and GitHub organization settings.",
+    "author": "Your Name",
+    "timestamp": "2026-01-12T14:58:24.831561"
+  },
+  {
+    "commit_hash": "fa186792281de61333a09ed8477d865d96cb3ae8",
+    "commit_message": "feat(lambda): Implement GitHub Actions runner token refresh Lambda function\n\n- Added `token_refresh.py` to handle the token refresh logic.\n- Integrated AWS Secrets Manager for storing the GitHub registration token.\n- Utilized GitHub App authentication for secure API access.\n- Scheduled Lambda function to run every 30 minutes using CloudWatch Events.\n- Created necessary IAM roles and policies for Lambda execution.\n\nchore(lambda): Add requirements for token refresh Lambda\n\n- Added `requirements.txt` with dependencies: PyJWT and cryptography.\n\nfeat(terraform): Configure Lambda function and CloudWatch Events\n\n- Created Terraform configuration for the Lambda function and its dependencies.\n- Set up CloudWatch Event Rule to trigger the Lambda function every 30 minutes.\n- Configured IAM roles and policies for Lambda execution and Secrets Manager access.\n\ndocs(scripts): Add monitoring tools for GitHub Runner ECS services\n\n- Created monitoring scripts to track ECS service health and CloudWatch logs.\n- Added README with usage instructions and troubleshooting tips.\n- Implemented a continuous monitoring script using rich for better output formatting.\n\nchore(scripts): Add requirements for monitoring scripts\n\n- Added `requirements.txt` for monitoring scripts with dependencies: boto3, botocore, and rich.\n\nfix(scripts): Update monitoring script to use Terraform outputs\n\n- Modified `monitor_runners.py` to fetch necessary configuration from Terraform outputs.\n- Improved error handling and logging for better visibility.\n\nfeat(varfiles): Add configuration files for Terraform modules\n\n- Created JSON and TFVars files for managing Terraform workspace and GitHub organization settings.",
+    "author": "Your Name",
+    "timestamp": "2026-01-15T17:53:12.576503"
   }
 ]
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..0aeacfb
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,11 @@
+{
+  "github.copilot.chat.mcpServers": {
+    "terraform": {
+      "command": "/home/a/arnol377/.local/bin/terraform-mcp-server",
+      "args": [],
+      "env": {
+        "TF_WORKSPACE_DIR": "/home/a/arnol377/git/ghe-runner"
+      }
+    }
+  }
+}
diff --git a/DOCUMENTATION_REVIEW_2026-01-15.md b/DOCUMENTATION_REVIEW_2026-01-15.md
new file mode 100644
index 0000000..f65415d
--- /dev/null
+++ b/DOCUMENTATION_REVIEW_2026-01-15.md
@@ -0,0 +1,159 @@
+# Documentation Review - January 15, 2026
+
+## Summary of Updates
+
+Updated documentation to accurately reflect the **persistent, long-running runner architecture** rather than describing them as ephemeral/dynamic containers.
+
+## Key Architectural Clarifications
+
+### Runner Model
+- ✅ **CORRECT**: Runners are persistent, long-running ECS Fargate containers
+- ✅ **CORRECT**: Runners stay active 24/7, polling GitHub for jobs
+- ✅ **CORRECT**: Same runner handles multiple workflow jobs over its lifetime
+- ✅ **CORRECT**: Runners only restart on: crash, manual stop, service deployment
+- ❌ **INCORRECT** (Previous): Runners spin up dynamically per job
+
+### Token Lifecycle Understanding
+- ✅ **CORRECT**: Registration token is ONLY used during container startup
+- ✅ **CORRECT**: Running runners don't need token refresh (already registered)
+- ✅ **CORRECT**: Lambda token refresh is insurance for ECS task restarts
+- ✅ **CORRECT**: Deadlock occurs when: all runners down + token expired
+- ❌ **INCORRECT** (Previous): Implied tokens are needed continuously
+
+### ECS Auto-Recovery Behavior
+- ✅ **CORRECT**: If a task dies, ECS automatically starts a replacement
+- ✅ **CORRECT**: Replacement task needs valid token from Secrets Manager
+- ✅ **CORRECT**: Lambda ensures fresh token available for automatic recovery
+- ✅ **CORRECT**: Without valid token, ECS enters crash loop
+
+## Files Updated
+
+### 1. README.md
+**Changes:**
+- Added "Runner Model" note emphasizing persistent containers
+- Updated "Key Features" to include "Persistent Runners" and "Automated Token Refresh"
+- Rewrote "Architecture" section with "Runner Lifecycle Model"
+- Added detailed explanation of startup → active → job execution → restart cycle
+- Updated architecture diagram to show "Persistent Runner" and lifecycle states
+- Added Lambda Token Refresh component to diagram
+
+**Key Additions:**
+```
+Runner Lifecycle Model:
+1. Startup: Reads token, registers with GitHub
+2. Active State: Stays online, polls for jobs
+3. Job Execution: Executes jobs, returns to polling
+4. Restart: Only on failure, manual stop, or update
+5. Auto-Recovery: ECS restarts tasks (requires valid token)
+```
+
+### 2. RUNBOOK.md
+**Changes:**
+- Renamed section from "Token Lifecycle Dependency" to "Persistent Runners & Token Lifecycle"
+- Added "Runner Architecture" subsection explaining 24/7 operation
+- Clarified "Token Lifecycle & Deadlock Risk" with focus on startup-only token use
+- Added "Why Lambda Token Refresh Matters" section
+- Updated Scenario 1 impact assessment to clarify running vs. new runners
+- Updated Scenario 2 impact assessment to explain reduced capacity implications
+- Expanded Scenario 3 deadlock warning with detailed explanation
+- Added "Task Crash Loop" to common root causes table
+
+**Key Additions:**
+```
+Running runners: Already registered, don't need token refresh
+Token refresh purpose: Ensures valid token for ECS task restarts
+Deadlock scenario: ECS tries to restart → token expired → tasks fail → retry loop
+```
+
+### 3. .github/copilot-instructions.md
+**Changes:**
+- Updated "Critical Understanding" section title and content
+- Clarified that runners are "persistent, long-running containers (not ephemeral)"
+- Explained registration token is "only during container startup (one-time registration)"
+- Specified Lambda refreshes for "ECS task restarts" not continuous runner operation
+- Detailed deadlock risk with distinction between running vs. restarting tasks
+
+**Key Additions:**
+```
+Runners run continuously 24/7, handling multiple jobs
+Registration token used only during container startup
+Running tasks don't need token refresh (already registered)
+Failed tasks being restarted by ECS need valid token
+```
+
+### 4. lambda_token_refresh.tf
+**Changes:**
+- Expanded header comment from 2 lines to 12 lines
+- Added "IMPORTANT" note about persistent runners
+- Explained token lifecycle in detail
+- Clarified purpose as "insurance for ECS automatic task recovery"
+- Added critical scenario explanation
+
+**Key Additions:**
+```
+IMPORTANT: Runners are persistent, long-running containers
+Registration token ONLY needed during container startup
+Token refresh purpose: Insurance for ECS automatic task recovery
+Critical for: Preventing deadlock when all runners down + token expires
+```
+
+### 5. lambda/token_refresh_pat.py
+**Changes:**
+- Expanded docstring from 7 lines to 17 lines
+- Added "CRITICAL CONTEXT" section
+- Detailed persistent runner architecture
+- Explained deadlock scenario step-by-step
+- Added schedule and authentication details
+
+**Key Additions:**
+```
+CRITICAL CONTEXT:
+- Runners are persistent, long-running ECS containers (not ephemeral)
+- Registration tokens ONLY used during container startup
+- Running runners don't need token refresh
+- Purpose: Prevent deadlock scenario [detailed explanation]
+```
+
+### 6. monitoring.tf
+**Changes:**
+- Expanded header comment from 7 lines to 13 lines
+- Added "RUNNER MODEL" section
+- Clarified monitoring tracks container health, not job execution
+- Updated monitoring area descriptions
+
+**Key Additions:**
+```
+RUNNER MODEL: Persistent, long-running containers (not ephemeral)
+- Runners stay online 24/7, handling multiple jobs
+- Only restart on: task failure, manual stop, service deployment
+- Monitoring tracks runner CONTAINER health, not individual job execution
+```
+
+## Documentation Consistency
+
+All documentation now consistently reflects:
+
+1. **Runner Persistence**: Emphasized that runners are NOT ephemeral
+2. **Token Usage**: Clear that tokens are only for startup, not continuous operation
+3. **Lambda Purpose**: Reframed as "insurance" for ECS auto-recovery
+4. **Deadlock Risk**: Detailed explanation with precise conditions
+5. **ECS Behavior**: Clarified automatic task replacement mechanism
+6. **Monitoring Context**: Metrics track container health, not job execution
+
+## Benefits of These Updates
+
+1. **Operational Understanding**: Clearer picture of how the system actually works
+2. **Troubleshooting**: Better context for investigating runner issues
+3. **Cost Implications**: Understanding that runners run 24/7 (not per-job)
+4. **Monitoring Interpretation**: Metrics represent container state, not workflow state
+5. **Emergency Response**: More accurate mental model for incident response
+
+## No Configuration Changes
+
+These updates are **documentation-only**. No infrastructure, code logic, or configuration was modified. The system operates exactly as before - we've simply corrected the documentation to match reality.
+
+---
+
+**Review Date**: January 15, 2026  
+**Reviewer**: GitHub Copilot (with user guidance)  
+**Status**: Complete ✅
diff --git a/MONITORING_DEPLOYMENT_SUMMARY.md b/MONITORING_DEPLOYMENT_SUMMARY.md
new file mode 100644
index 0000000..a78b599
--- /dev/null
+++ b/MONITORING_DEPLOYMENT_SUMMARY.md
@@ -0,0 +1,310 @@
+# Monitoring Implementation - Deployment Summary
+
+**Date**: January 15, 2026  
+**Status**: Ready for Deployment  
+**Estimated Implementation Time**: 6-8 hours (actual: ~2 hours for code)
+
+## Overview
+
+All monitoring infrastructure has been implemented and validated via `terraform plan`. The implementation includes critical alerting for GitHub Actions runners on ECS Fargate with a focus on preventing the token lifecycle deadlock.
+
+## Implemented Resources
+
+### 1. SNS Topic and Email Subscription
+- **Resource**: `aws_sns_topic.github_runner_critical_alerts`
+- **Name**: `github-runner-critical-alerts-229685449397-us-gov-west-1`
+- **Email**: david.j.arnold.jr@census.gov
+- **Purpose**: Central notification channel for all critical alerts
+
+### 2. Runner Availability Alarms
+
+#### Critical Capacity Alarm
+- **Resource**: `aws_cloudwatch_metric_alarm.runners_critical`
+- **Name**: `github-runners-critical-capacity-cvsd-dev-ew`
+- **Threshold**: 1 runner (50% of desired count of 1, rounded up)
+- **Evaluation**: 2 periods of 5 minutes (10 minutes total)
+- **Trigger**: When running tasks < 50% capacity
+- **Actions**: Send notification to SNS topic
+
+#### Emergency All-Down Alarm
+- **Resource**: `aws_cloudwatch_metric_alarm.runners_emergency`
+- **Name**: `github-runners-emergency-all-down-cvsd-dev-ew`
+- **Threshold**: 0 runners
+- **Evaluation**: 1 period of 1 minute (immediate)
+- **Trigger**: When all runners are down
+- **Actions**: Send notification to SNS topic
+- **Critical**: This is the deadlock scenario - immediate attention required
+
+### 3. CloudWatch Dashboard
+- **Resource**: `aws_cloudwatch_dashboard.github_runners`
+- **Name**: `github-runners-cvsd-dev-ew`
+- **URL**: https://console.amazonaws-us-gov.com/cloudwatch/home?region=us-gov-west-1#dashboards:name=github-runners-cvsd-dev-ew
+
+#### Dashboard Widgets:
+1. **Runner Count Trend** (12x6)
+   - Shows average, maximum, and minimum running task count
+   - Includes annotations for desired count and critical threshold
+   - 5-minute periods
+
+2. **Resource Utilization** (12x6)
+   - CPU utilization (vCPU) on left axis
+   - Memory utilization (MB) on right axis
+   - Helps identify resource constraints
+
+3. **Alarm Status** (8x4)
+   - Visual status of both critical alarms
+   - Quick at-a-glance health check
+
+4. **Recent Error Events** (16x4)
+   - Log query showing recent errors/failures
+   - Last 20 events sorted by timestamp
+
+### 4. Emergency Runbook
+- **File**: `RUNBOOK.md`
+- **Scenarios Documented**:
+  1. Lambda Token Refresh Failing
+  2. Runners at 50% Capacity
+  3. All Runners Down (Emergency)
+
+Each scenario includes:
+- Detection methods
+- Impact assessment
+- Investigation steps with AWS CLI commands
+- Common root causes and resolutions
+- Post-resolution checklist
+
+## Deployment Steps
+
+### Prerequisites
+✅ AWS credentials configured and valid  
+✅ GITHUB_TOKEN environment variable set  
+✅ Terraform initialized (`tf init`)  
+✅ Configuration validated (`tf plan` successful)
+
+### Deploy Monitoring
+
+```bash
+# 1. Review the plan one more time
+cd /path/to/ghe-runner  # root of your local clone of this repository
+tf plan
+
+# 2. Apply the changes
+tf apply
+
+# 3. Confirm the resources to be created (6 resources)
+#    - aws_sns_topic.github_runner_critical_alerts
+#    - aws_sns_topic_subscription.alert_email
+#    - aws_cloudwatch_metric_alarm.runners_critical
+#    - aws_cloudwatch_metric_alarm.runners_emergency
+#    - aws_cloudwatch_dashboard.github_runners
+#    - (1 secret version replacement)
+
+# Type 'yes' when prompted
+```
+
+### Post-Deployment Verification
+
+1. **Confirm Email Subscription**
+   ```bash
+   # You will receive an email from AWS Notifications
+   # Subject: "AWS Notification - Subscription Confirmation"
+   # Click the "Confirm subscription" link in the email
+   ```
+
+2. **Verify SNS Topic**
+   ```bash
+   aws sns list-subscriptions-by-topic \
+     --topic-arn $(tf output -raw sns_topic_arn) \
+     --region us-gov-west-1
+   
+   # SubscriptionArn reads "PendingConfirmation" until you click the email link, then shows the full subscription ARN
+   ```
+
+3. **Check Alarms Status**
+   ```bash
+   aws cloudwatch describe-alarms \
+     --alarm-names \
+       "github-runners-critical-capacity-cvsd-dev-ew" \
+       "github-runners-emergency-all-down-cvsd-dev-ew" \
+     --region us-gov-west-1
+   ```
+
+4. **View Dashboard**
+   ```bash
+   # Get the dashboard URL
+   tf output dashboard_url
+   
+   # Open in browser (you may need to be on VPN/internal network)
+   ```
+
+5. **Test Alarm (Optional)**
+   ```bash
+   # Manually trigger the emergency alarm by setting state
+   aws cloudwatch set-alarm-state \
+     --alarm-name "github-runners-emergency-all-down-cvsd-dev-ew" \
+     --state-value ALARM \
+     --state-reason "Testing alert notification" \
+     --region us-gov-west-1
+   
+   # You should receive an email within 1-2 minutes
+   # Reset to OK after testing
+   aws cloudwatch set-alarm-state \
+     --alarm-name "github-runners-emergency-all-down-cvsd-dev-ew" \
+     --state-value OK \
+     --state-reason "Test complete" \
+     --region us-gov-west-1
+   ```
+
+## Files Modified/Created
+
+### New Files
+- ✅ `monitoring.tf` - All monitoring infrastructure
+- ✅ `RUNBOOK.md` - Emergency response procedures
+- ✅ `MONITORING_DEPLOYMENT_SUMMARY.md` - This file
+
+### Modified Files
+- ✅ `providers.tf` - Added region to AWS provider
+- ✅ `variables.tf` - Made GitHub App variables optional (token-only auth)
+- ✅ `default.auto.tfvars` - Contains alert_email configuration
+- ✅ `example.tfvars.template` - Renamed from example.auto.tfvars
+
+### Existing Files (Not Modified)
+- `lambda_token_refresh.tf.tmp` - Lambda token refresh (currently disabled)
+- `main.tf` - Main module configuration
+- `ecs_cluster.tf` - ECS cluster setup
+
+## Monitoring Coverage
+
+### What's Monitored ✅
+- Runner availability (count of running tasks)
+- Critical capacity threshold (50%)
+- Emergency all-down scenario (0 runners)
+- Resource utilization (CPU and memory)
+- Recent error events in logs
+
+### What's NOT Monitored (Future Enhancements)
+- ❌ Lambda token refresh failures (Lambda not currently deployed)
+- ❌ Task failure rate over time
+- ❌ Workflow queue depth
+- ❌ Average task startup time
+- ❌ Network connectivity issues
+- ❌ ECR image pull latency
+
+## Cost Estimate
+
+**Monthly Costs** (approximate, us-gov-west-1 pricing):
+- SNS topic: $0.00 (under free tier for email)
+- CloudWatch Alarms (2): $0.20 per alarm = $0.40/month
+- CloudWatch Dashboard (1): $3.00/month
+- Log Insights queries: ~$0.50/month (occasional use)
+
+**Total Estimated Cost**: ~$4/month
+
+## Known Limitations
+
+1. **ECS Container Insights Required**
+   - The alarms rely on `ECS/ContainerInsights` metrics
+   - Verify Container Insights is enabled for the cluster:
+     ```bash
+     aws ecs describe-clusters \
+       --clusters ecs-ghe-runners-us-gov-west-1 \
+       --region us-gov-west-1 \
+       --include SETTINGS
+     ```
+   - If not enabled, alarms may not have data
+
+2. **Lambda Token Refresh**
+   - Lambda function exists in `lambda_token_refresh.tf.tmp` but is not deployed
+   - To enable Lambda monitoring:
+     - Rename `lambda_token_refresh.tf.tmp` to `lambda_token_refresh.tf`
+     - Update Lambda alarm to include SNS actions
+     - Deploy with `tf apply`
+
+3. **Alarm Sensitivity**
+   - With desired_count=1, critical alarm triggers at <1 runner (i.e., 0 runners)
+   - This means critical and emergency alarms will trigger simultaneously
+   - Consider increasing desired_count to 2+ for better gradation
+
+4. **Email Delays**
+   - SNS email notifications can take 1-5 minutes to arrive
+   - For faster alerting, consider integrating with PagerDuty or Slack
+
+## Next Steps
+
+### Immediate (Deploy Today)
+1. ✅ Review this deployment summary
+2. ⏳ Run `tf apply` to deploy monitoring
+3. ⏳ Confirm SNS email subscription
+4. ⏳ Verify alarms are created and healthy
+5. ⏳ Bookmark CloudWatch dashboard URL
+
+### Short Term (This Week)
+1. Test alarm notifications (manual trigger)
+2. Review runbook with team
+3. Add runbook to on-call documentation
+4. Consider enabling Lambda token refresh monitoring
+
+### Medium Term (Next Sprint)
+1. Increase desired_count to 2+ for better alarm gradation
+2. Enable ECS Container Insights if not already enabled
+3. Create CloudWatch Logs metric filters for specific error patterns
+4. Integrate with PagerDuty or other alerting platform
+
+### Long Term (Next Quarter)
+1. Implement automated remediation for common issues
+2. Add workflow queue depth monitoring (requires GitHub API integration)
+3. Create custom CloudWatch metrics for workflow execution times
+4. Develop capacity planning dashboard based on historical data
+
+## Rollback Plan
+
+If issues arise after deployment:
+
+```bash
+# Remove all monitoring resources
+tf destroy \
+  -target=aws_sns_topic.github_runner_critical_alerts \
+  -target=aws_sns_topic_subscription.alert_email \
+  -target=aws_cloudwatch_metric_alarm.runners_critical \
+  -target=aws_cloudwatch_metric_alarm.runners_emergency \
+  -target=aws_cloudwatch_dashboard.github_runners
+
+# Or remove just the alarms if they're causing issues
+tf destroy \
+  -target=aws_cloudwatch_metric_alarm.runners_critical \
+  -target=aws_cloudwatch_metric_alarm.runners_emergency
+```
+
+## Support and Questions
+
+- **Technical Questions**: Refer to `RUNBOOK.md` for operational procedures
+- **Terraform Issues**: Check `tf plan` output, review AWS provider version
+- **AWS CloudWatch Help**: https://docs.aws.amazon.com/cloudwatch/
+- **ECS Monitoring**: https://docs.aws.amazon.com/AmazonECS/latest/developerguide/cloudwatch-metrics.html
+
+## Success Criteria
+
+The monitoring implementation is successful when:
+- ✅ All 6 resources created successfully
+- ✅ SNS email subscription confirmed
+- ✅ Both alarms are in "OK" state (runners healthy)
+- ✅ CloudWatch dashboard is accessible and showing data
+- ✅ Team members can access and understand the runbook
+- ✅ Test alarm notification received successfully
+
+---
+
+**Implementation Status**: ✅ READY FOR DEPLOYMENT
+
+**Terraform Plan**: ✅ VALIDATED (6 resources to add)
+
+**Documentation**: ✅ COMPLETE
+- Monitoring infrastructure code
+- Emergency runbook
+- Deployment summary
+
+**Next Action**: Run `tf apply` to deploy monitoring infrastructure
+
+---
+
+*For questions or issues during deployment, refer to RUNBOOK.md or contact the CSVD team lead.*
diff --git a/MONITORING_IMPLEMENTATION_PLAN.md b/MONITORING_IMPLEMENTATION_PLAN.md
new file mode 100644
index 0000000..b872655
--- /dev/null
+++ b/MONITORING_IMPLEMENTATION_PLAN.md
@@ -0,0 +1,363 @@
+# GitHub Runner Monitoring Implementation Plan (ESSENTIAL ONLY)
+
+## Epic: Critical Monitoring for ECS GitHub Runners
+
+**Epic Summary**: Implement essential monitoring for GitHub Actions ECS Fargate runners focused on preventing token deadlock scenarios and providing actionable alerts.
+
+**Business Value**: Prevent critical outages caused by token expiration when all runners are down. Get notified before problems become incidents.
+
+**Priority**: Critical
+**Target Release**: Q1 2026 (Week 1)
+**Estimated Effort**: 6-8 hours (Single week implementation)
+
+---
+
+## Critical Stories (MUST HAVE)
+
+### Story 1: Lambda Token Refresh Monitoring (CRITICAL - 2 hours)
+
+**Story**: As a DevOps engineer, I need to know immediately when the Lambda token refresh fails so I can intervene before token expires and causes deadlock.
+
+**Acceptance Criteria**:
+- [ ] SNS topic created for critical alerts
+- [ ] Email notifications configured
+- [ ] Existing Lambda error alarm connected to SNS
+- [ ] Alarm fires on Lambda failures (2+ consecutive errors)
+- [ ] Test email received and verified
+
+**Technical Tasks**:
+- Create SNS topic: `github_runner_critical_alerts`
+- Add email subscription to SNS topic
+- Update existing `aws_cloudwatch_metric_alarm.lambda_errors` with SNS action
+- Test alarm by simulating Lambda failure
+
+**Estimated Effort**: 2 hours
+
+**Definition of Done**:
+- SNS topic created and email verified
+- Alarm fires and email received during test
+- Simple documentation added to README
+
+---
+
+### Story 2: Runner Availability Alerts (CRITICAL - 2 hours)
+
+**Story**: As a DevOps engineer, I need to be alerted when runner count drops so I can investigate before all runners die and token refresh stops working.
+
+**Acceptance Criteria**:
+- [ ] CRITICAL alarm fires when running count < 50% of desired
+- [ ] EMERGENCY alarm fires when running count = 0
+- [ ] Both alarms send to SNS topic
+- [ ] Alarms tested with threshold simulation
+
+**Technical Tasks**:
+- Create `monitoring_alerts.tf` file
+- Create alarm: `runners_critical` (threshold: `var.desired_count * 0.5`)
+- Create alarm: `runners_emergency` (threshold: 0)
+- Connect both alarms to existing SNS topic from Story 1
+- Reference existing `var.desired_count` variable (no new variables needed!)
+
+**Estimated Effort**: 2 hours
+
+**Definition of Done**:
+- Both alarms created
+- Alarms tested (manually adjust desired count to trigger)
+- Email notifications received
+
+---
+
+### Story 3: Basic Monitoring Dashboard (NICE TO HAVE - 2-3 hours)
+
+**Story**: As a DevOps engineer, I want a simple dashboard showing runner health and Lambda status so I can quickly check system state.
+
+**Acceptance Criteria**:
+- [ ] Dashboard shows runner count over last 24 hours
+- [ ] Dashboard shows Lambda invocations and errors
+- [ ] Dashboard shows alarm states
+- [ ] Dashboard URL added to outputs
+
+**Technical Tasks**:
+- Create `monitoring_dashboard.tf` file
+- Create CloudWatch Dashboard with 4 widgets:
+  1. ECS RunningTaskCount (line graph)
+  2. Lambda Invocations + Errors (line graph)
+  3. Alarm status (number widget)
+  4. Recent ECS events (log widget)
+- Add dashboard URL to outputs.tf
+
+**Estimated Effort**: 2-3 hours
+
+**Definition of Done**:
+- Dashboard accessible and displays data
+- Dashboard URL documented
+
+---
+
+### Story 4: Emergency Runbook (CRITICAL - 1-2 hours)
+
+**Story**: As a DevOps engineer, I need clear instructions for what to do when I get an alert so I can resolve issues quickly.
+
+**Acceptance Criteria**:
+- [ ] Runbook created with 3 key scenarios
+- [ ] Each scenario has step-by-step instructions
+- [ ] Contact escalation documented
+- [ ] Runbook linked in alarm descriptions
+
+**Technical Tasks**:
+- Create `MONITORING_RUNBOOK.md` with:
+  - **Scenario 1**: "Lambda failing - runners still up" (fix Lambda)
+  - **Scenario 2**: "Runners at 50% or below" (investigate task failures)
+  - **Scenario 3**: "All runners down" (emergency token regeneration steps)
+- Add diagnostic commands for each scenario
+- Add escalation contact info
+- Update alarm descriptions to reference runbook sections
+
+**Estimated Effort**: 1-2 hours
+
+**Definition of Done**:
+- Runbook completed and reviewed
+- Alarm descriptions updated with runbook links
+
+---
+
+## Technical Architecture (Simplified)
+
+### Infrastructure Components
+
+```
+New Terraform Files (ESSENTIAL ONLY):
+├── monitoring_alerts.tf               (Stories 1-2: ~100 lines)
+│   ├── aws_sns_topic.critical_alerts
+│   ├── aws_sns_topic_subscription.email
+│   ├── aws_cloudwatch_metric_alarm.runners_critical
+│   └── aws_cloudwatch_metric_alarm.runners_emergency
+│
+└── monitoring_dashboard.tf            (Story 3: ~80 lines)
+    └── aws_cloudwatch_dashboard.runner_health
+
+Updated Terraform Files:
+├── lambda_token_refresh.tf            (Story 1: 2 line change)
+│   └── Add alarm_actions to existing lambda_errors alarm
+│
+├── outputs.tf                         (Story 3: ~10 lines)
+│   ├── output.monitoring_dashboard_url
+│   └── output.sns_topic_arn
+│
+└── variables.tf                       (Story 1: ~5 lines)
+    └── variable.alert_email (only new variable needed)
+    
+    Note: Uses existing var.desired_count for alarm thresholds!
+
+New Documentation:
+└── MONITORING_RUNBOOK.md              (Story 4: Simple procedures)
+```
+
+### CloudWatch Alarms (3 Total - ESSENTIAL)
+
+**CRITICAL Alarms**:
+1. `lambda_errors` - Lambda token refresh failures (ALREADY EXISTS - just add SNS)
+2. `runners_critical` - <50% runners available (NEW)
+3. `runners_emergency` - Zero runners (NEW)
+
+---
+
+## Implementation Sequence (STREAMLINED)
+
+### Day 1 (3-4 hours)
+- **Morning**: Story 1 - Lambda alert SNS setup (2 hours)
+  - Create SNS topic
+  - Add email subscription
+  - Update Lambda alarm
+  - Test notification
+- **Afternoon**: Story 2 - Runner availability alarms (2 hours)
+  - Create runner count alarms
+  - Test alarms
+  - Verify emails
+
+### Day 2 (2-3 hours)
+- **Morning**: Story 3 - Basic dashboard (2-3 hours)
+  - Create simple 4-widget dashboard
+  - Test and validate
+  - Document URL
+
+### Day 3 (1-2 hours)
+- **Morning**: Story 4 - Runbook (1-2 hours)
+  - Write emergency procedures
+  - Document 3 scenarios
+  - Add to repository
+
+---
+
+## Risk Assessment (Simplified)
+
+| Risk | Probability | Impact | Mitigation |
+|------|------------|--------|------------|
+| Token expires before alert received | Medium | Critical | Lambda alarm fires immediately on failure |
+| False alarm on runner count | Low | Low | 50% threshold gives buffer; test thresholds |
+| Email delivery delay | Low | Medium | Use confirmed email; consider SMS backup later |
+
+---
+
+## Success Metrics (Essential)
+
+**Key Goals**:
+- ✅ Get notified within 5 minutes when Lambda fails
+- ✅ Get notified when runners drop to critical levels
+- ✅ Have clear procedures to follow when alerts fire
+- ✅ Zero token deadlock incidents
+
+**Cost Target**: <$2/month (3 alarms × $0.10 = $0.30 + SNS negligible)
+
+---
+
+## Configuration
+
+1. **Alert Email**: ✅ **CONFIRMED**
+   - **Email**: david.j.arnold.jr@census.gov
+
+2. **Implementation Approach**:
+   - [ ] Single PR with all changes, OR
+   - [ ] Multiple PRs (one per story for easier review)
+
+**Note**: Runner thresholds will automatically use your existing `var.desired_count` variable!  
+- Critical alarm = `var.desired_count * 0.5` (50% threshold)
+- Emergency alarm = `0` (no runners)
+
+---
+
+## Acceptance Criteria for Epic Completion
+
+### MUST HAVE (Week 1)
+- [ ] SNS topic created and email verified
+- [ ] Lambda alarm sends notifications
+- [ ] Runner count alarms created and tested
+- [ ] Basic dashboard accessible
+- [ ] Runbook document created
+
+### VALIDATION
+- [ ] Simulate Lambda failure → receive email
+- [ ] Manually scale runners down → receive email at thresholds
+- [ ] Dashboard displays current state
+- [ ] Team knows where to find runbook
+
+---
+
+## What We're NOT Doing (Defer to Later)
+
+❌ **Token age monitoring** (complex, Lambda alarm is sufficient)  
+❌ **Log metric filters** (nice to have, not critical)  
+❌ **CloudTrail integration** (adds complexity)  
+❌ **Enhanced monitoring script** (existing script works)  
+❌ **Emergency automation scripts** (manual procedures in runbook are sufficient)  
+❌ **Predictive/anomaly detection** (future enhancement)  
+❌ **Multiple SNS topics per severity** (single topic is fine)  
+❌ **Saved Log Insights queries** (can add later)  
+❌ **CPU/Memory alarms** (not critical for token deadlock prevention)
+
+---
+
+## File Changes Summary
+
+**New Files (3)**:
+```
+monitoring_alerts.tf      (~100 lines - 3 alarms + SNS)
+monitoring_dashboard.tf   (~80 lines - simple dashboard)
+MONITORING_RUNBOOK.md     (~50 lines - procedures)
+```
+
+**Modified Files (3)**:
+```
+lambda_token_refresh.tf   (Add 1 line: alarm_actions)
+outputs.tf                (Add 2 outputs: dashboard URL, SNS ARN)
+variables.tf              (Add 1 variable: alert_email only!)
+```
+
+**Note**: Reuses existing `var.desired_count` for alarm thresholds - no new runner variables needed!
+
+**Total New Code**: ~230 lines of Terraform + 1 runbook doc
+
+---
+
+## Implementation Checklist
+
+### Pre-Implementation
+- [x] Confirm alert email address: **david.j.arnold.jr@census.gov**
+- [ ] Review plan with team
+- [ ] Choose implementation approach (single PR vs multiple PRs)
+
+**Note**: No need to configure runner count thresholds - will use existing `var.desired_count` automatically!
+
+### Story 1: Lambda Alerts (2 hours)
+- [ ] Create `monitoring_alerts.tf`
+- [ ] Add SNS topic resource
+- [ ] Add SNS email subscription
+- [ ] Update `lambda_token_refresh.tf` with alarm action
+- [ ] Add variables to `variables.tf`
+- [ ] `terraform plan` and review changes
+- [ ] `terraform apply`
+- [ ] Confirm email subscription in inbox
+- [ ] Test: Trigger alarm manually (set threshold to 0)
+- [ ] Verify email received
+
+### Story 2: Runner Alerts (2 hours)
+- [ ] Add runner alarms to `monitoring_alerts.tf`
+- [ ] Use `var.desired_count * 0.5` for critical threshold
+- [ ] Use `0` for emergency threshold
+- [ ] `terraform plan` and review
+- [ ] `terraform apply`
+- [ ] Test: Scale runners down to trigger critical alarm
+- [ ] Verify email received
+- [ ] Test: Scale to 0 to trigger emergency alarm
+- [ ] Verify email received
+- [ ] Scale back to normal
+
+### Story 3: Dashboard (2-3 hours)
+- [ ] Create `monitoring_dashboard.tf`
+- [ ] Add 4 basic widgets (runner count, Lambda, alarms, logs)
+- [ ] Add outputs to `outputs.tf`
+- [ ] `terraform plan` and review
+- [ ] `terraform apply`
+- [ ] Access dashboard URL from outputs
+- [ ] Verify all widgets display data
+- [ ] Bookmark dashboard URL
+
+### Story 4: Runbook (1-2 hours)
+- [ ] Create `MONITORING_RUNBOOK.md`
+- [ ] Document Scenario 1: Lambda failing
+- [ ] Document Scenario 2: Runners at 50%
+- [ ] Document Scenario 3: All runners down
+- [ ] Add diagnostic commands for each scenario
+- [ ] Add escalation contacts
+- [ ] Update alarm descriptions with runbook reference
+- [ ] Commit and push documentation
+
+### Final Validation
+- [ ] All alarms in OK state
+- [ ] Dashboard accessible
+- [ ] Email alerts working
+- [ ] Runbook reviewed by team
+- [ ] Update main README.md with monitoring section
+
+---
+
+## Notes
+
+- **Focus**: Prevent token deadlock via early Lambda failure detection
+- **Simplicity**: Minimal alarms, single email destination
+- **Quick Win**: Can be done in 1 week by 1 person
+- **Extensible**: Easy to add more sophisticated monitoring later
+
+---
+
+## Related Documentation
+
+- [lambda_token_refresh.tf](./lambda_token_refresh.tf) - Existing Lambda alarm to be enhanced
+- [README.md](./README.md) - Will add monitoring section after implementation
+- [RUNBOOK.md](./RUNBOOK.md) - Emergency runbook (planned in Story 4 as `MONITORING_RUNBOOK.md`)
+
+---
+
+**Document Version**: 2.0 (Streamlined)  
+**Last Updated**: January 9, 2026  
+**Implementation Timeline**: 3 days (6-8 hours total)  
+**Status**: Ready for Implementation
diff --git a/README.md b/README.md
index 9dd912a..05d96ad 100644
--- a/README.md
+++ b/README.md
@@ -6,34 +6,55 @@ Infrastructure as Code (Terraform) for deploying self-hosted GitHub Actions runn
 
 This repository manages the deployment of **organization-level self-hosted GitHub Actions runners** using AWS ECS Fargate. Runners are deployed per AWS account and automatically register with your GitHub Enterprise organization, providing secure, scalable, and cost-effective CI/CD execution environments.
 
+**Runner Model**: Runners are **persistent, long-running containers** that stay active 24/7, continuously polling GitHub for jobs. They are not ephemeral - the same runner handles multiple workflow jobs over its lifetime.
+
 ### Key Features
 
-- **Serverless Architecture**: ECS Fargate eliminates server management overhead
+- **Persistent Runners**: Long-running containers that stay online and handle multiple jobs
+- **Serverless Infrastructure**: ECS Fargate eliminates server management overhead
 - **Account-Based Isolation**: Each AWS account has its own dedicated runners
 - **Automatic IAM Authentication**: ECS Task Roles provide seamless AWS access
 - **Multi-Label Support**: Runners tagged with account ID, name, region, and more
 - **Proxy-Enabled**: Pre-configured for enterprise proxy environments
 - **CloudWatch Integration**: Centralized logging and monitoring
+- **Automated Token Refresh**: Lambda keeps registration tokens fresh for task restarts
 - **Scalable**: Adjust runner count based on workload demands
 
 ## Architecture
 
+### Runner Lifecycle Model
+
+Runners are **persistent, long-running containers** that operate continuously:
+
+1. **Startup**: Container starts, reads registration token from Secrets Manager, registers with GitHub
+2. **Active State**: Runner stays online indefinitely, polling GitHub for workflow jobs
+3. **Job Execution**: When a job arrives, runner executes it and returns to polling
+4. **Restart**: The runner restarts only when its task fails, is manually terminated, or the service is updated
+5. **Auto-Recovery**: If a task dies, ECS automatically starts a replacement (requires valid token)
+
 ```
 GitHub Enterprise (github.e.it.census.gov)
             │
-            │ (OAuth App Authentication)
+            │ (Token Authentication)
             ▼
     ECS Cluster (per account/region)
     ecs-ghe-runners-{region}
             │
-            ├── Fargate Task (Runner 1)
+            ├── Fargate Task (Persistent Runner 1)
             │   ├── Container: github-runner:{version}
+            │   ├── Lifecycle: Long-running (24/7)
+            │   ├── Registration: One-time at startup
             │   ├── IAM Task Role (AWS Auth)
             │   ├── Labels: Account ID, Name, Region
             │   └── Logs → CloudWatch
             │
-            ├── Fargate Task (Runner 2)
-            └── Fargate Task (Runner N)
+            ├── Fargate Task (Persistent Runner 2)
+            └── Fargate Task (Persistent Runner N)
+            
+    Lambda Token Refresh (Every 30 min)
+            │
+            └──> AWS Secrets Manager
+                 (Keeps registration token fresh for task restarts)
 ```
 
 **Network Architecture:**
diff --git a/RUNBOOK.md b/RUNBOOK.md
new file mode 100644
index 0000000..b319930
--- /dev/null
+++ b/RUNBOOK.md
@@ -0,0 +1,500 @@
+# GitHub Actions Runner Emergency Runbook
+
+**Last Updated**: January 15, 2026  
+**Owner**: CSVD Team  
+**Alert Email**: david.j.arnold.jr@census.gov
+
+## Purpose
+
+This runbook provides step-by-step procedures for responding to critical GitHub Actions runner incidents. GitHub runners are essential for CI/CD workflows - when they're down, no workflows can execute.
+
+## Critical Understanding: Persistent Runners & Token Lifecycle
+
+### Runner Architecture
+Our GitHub Actions runners are **persistent, long-running ECS Fargate containers** that:
+- Run continuously 24/7, not per-job
+- Register **once** at startup and maintain connection
+- Handle multiple workflow jobs over their lifetime
+- Only restart when: task fails, manual termination, or service update
+
+### Token Lifecycle & Deadlock Risk
+⚠️ **CRITICAL**: Registration tokens are only needed during **runner startup**:
+- **Running runners**: Already registered, don't need token refresh
+- **Token refresh purpose**: Ensures valid token available when ECS restarts failed tasks
+- **Deadlock scenario**: If all runners die AND token expires, ECS cannot auto-recover
+  - ECS tries to start replacement tasks
+  - New tasks fail registration (expired token)
+  - Token refresh workflow can't run (no runners available)
+  - **Manual intervention required**
+
+### Why Lambda Token Refresh Matters
+- Lambda refreshes registration token every 30 minutes (tokens expire in 1 hour)
+- This ensures **whenever a task restarts** (crash, deployment, scale-up), a valid token exists
+- Running runners are unaffected by token refresh - they're already registered
+- This is **insurance for automated ECS task recovery**
+
+---
+
+## Scenario 1: Lambda Token Refresh Failing
+
+### Detection
+- **CloudWatch Alarm**: `github-runner-token-refresh-errors`
+- **Symptoms**: Lambda function experiencing errors or timeouts
+- **Impact**: If token expires and all runners go down, we cannot recover automatically
+
+### Impact Assessment
+- **Severity**: CRITICAL
+- **Time to Impact**: Tokens expire after 1 hour from last refresh
+- **Affected Systems**: 
+  - Running runners are NOT affected (already registered)
+  - New task starts will fail (scale-up, ECS auto-recovery after crash)
+  - Service deployments will fail (tasks can't register)
+  - If all runners die during token expiration, deadlock occurs
+
+### Investigation Steps
+
+1. **Check Lambda Function Logs**
+   ```bash
+   aws logs tail /aws/lambda/github-runner-token-refresh-<account-id> --follow --region us-gov-west-1
+   ```
+
+2. **Verify Lambda Execution**
+   ```bash
+   aws lambda invoke --function-name github-runner-token-refresh-<account-id> \
+     --region us-gov-west-1 \
+     --log-type Tail \
+     /tmp/lambda-output.json
+   ```
+
+3. **Check GitHub API Connectivity**
+   ```bash
+   curl -v https://github.e.it.census.gov/api/v3/
+   ```
+
+4. **Verify Secrets Manager Access**
+   ```bash
+   aws secretsmanager get-secret-value \
+     --secret-id /github-runners/sct-engineering-<account>-<region>/SCT-Engineering-* \
+     --region us-gov-west-1
+   ```
+
+### Common Root Causes
+
+| Cause | Detection | Resolution |
+|-------|-----------|------------|
+| **GitHub API Rate Limiting** | HTTP 429 errors in logs | Wait for rate limit reset (check X-RateLimit-Reset header) |
+| **Network Connectivity** | Connection timeout errors | Verify VPC endpoints, security groups, proxy configuration |
+| **IAM Permission Issues** | Access Denied errors | Review Lambda execution role permissions |
+| **GitHub Token Invalid** | HTTP 401/403 errors | Manually refresh GITHUB_TOKEN environment variable |
+| **Secrets Manager Issues** | SecretNotFound errors | Verify secret exists and Lambda has access |
+
+### Resolution Steps
+
+#### If GitHub Token is Invalid:
+```bash
+# 1. Generate new token from GitHub
+#    Navigate to: https://github.e.it.census.gov/settings/tokens
+
+# 2. Update environment variable (where Terraform runs)
+export GITHUB_TOKEN="your-new-token"
+
+# 3. Trigger Lambda manually to refresh registration token
+aws lambda invoke --function-name github-runner-token-refresh-<account-id> \
+  --region us-gov-west-1 \
+  /tmp/lambda-output.json
+
+# 4. Verify token was updated in Secrets Manager
+aws secretsmanager get-secret-value \
+  --secret-id <secret-arn> \
+  --region us-gov-west-1
+```
+
+#### If Network Connectivity Issue:
+```bash
+# 1. Check security group rules
+aws ec2 describe-security-groups \
+  --group-ids <lambda-security-group> \
+  --region us-gov-west-1
+
+# 2. Verify VPC endpoint connectivity (if applicable)
+aws ec2 describe-vpc-endpoints --region us-gov-west-1
+
+# 3. Test from Lambda VPC context if needed
+```
+
+#### If IAM Permission Issue:
+```bash
+# 1. Review Lambda execution role
+aws iam get-role-policy \
+  --role-name github-runner-token-refresh-<account>-role \
+  --policy-name token-refresh-policy
+
+# 2. Add missing permissions via Terraform (if needed)
+# Edit lambda_token_refresh.tf (Terraform ignores the .tf.tmp copy) and apply
+```
+
+### Post-Resolution
+- [ ] Verify Lambda is executing successfully (check CloudWatch Logs)
+- [ ] Confirm alarm has cleared
+- [ ] Check that runners can still start successfully
+- [ ] Document root cause in incident tracking system
+
+---
+
+## Scenario 2: Runners at 50% Capacity
+
+### Detection
+- **CloudWatch Alarm**: `github-runners-critical-capacity`
+- **Symptoms**: Running task count below 50% of desired count
+- **Impact**: Reduced workflow capacity, potential build queue buildup
+
+### Impact Assessment
+- **Severity**: HIGH
+- **Time to Impact**: Immediate (reduced capacity for workflow execution)
+- **Affected Systems**: 
+  - Reduced CI/CD pipeline throughput
+  - Some workflows may queue waiting for available runners
+  - **Critical**: Fewer runners = higher risk if remaining runners fail
+
+### Investigation Steps
+
+1. **Check Current Runner Status**
+   ```bash
+   aws ecs describe-services \
+     --cluster ecs-ghe-runners-us-gov-west-1 \
+     --services SCT-Engineering \
+     --region us-gov-west-1
+   ```
+
+2. **Check Task Status and Failures**
+   ```bash
+   # Get recent stopped tasks
+   aws ecs list-tasks \
+     --cluster ecs-ghe-runners-us-gov-west-1 \
+     --service-name SCT-Engineering \
+     --desired-status STOPPED \
+     --region us-gov-west-1 \
+     --max-items 10
+   
+   # Describe a stopped task to see reason
+   aws ecs describe-tasks \
+     --cluster ecs-ghe-runners-us-gov-west-1 \
+     --tasks <task-arn> \
+     --region us-gov-west-1
+   ```
+
+3. **Check Container Logs**
+   ```bash
+   aws logs tail /ecs-ghe-runners/<org>-<account>-<region> \
+     --follow \
+     --region us-gov-west-1 \
+     --filter-pattern "error"
+   ```
+
+4. **Verify ECS Service Health**
+   ```bash
+   aws ecs describe-services \
+     --cluster ecs-ghe-runners-us-gov-west-1 \
+     --services SCT-Engineering \
+     --region us-gov-west-1 \
+     --query 'services[0].events[:10]'
+   ```
+
+### Common Root Causes
+
+| Cause | Detection | Resolution |
+|-------|-----------|------------|
+| **Image Pull Failures** | `CannotPullContainerError` | Verify ECR image exists, check IAM permissions, enable ECR clone |
+| **Resource Constraints** | `OutOfMemoryError` or CPU throttling | Increase task CPU/memory in terraform |
+| **Network Issues** | Tasks fail to start | Check VPC, subnets, security groups, NAT gateway |
+| **Registration Token Expired** | Tasks start but fail to register with GitHub | Check Lambda token refresh, verify token in Secrets Manager |
+| **ECS Service Issues** | Deployment failures, task placement errors | Check service events, task definition validity |
+| **Task Crash Loop** | Tasks repeatedly starting and stopping | Check container logs for application errors, verify GitHub connectivity |
+
+### Resolution Steps
+
+#### If Image Pull Failures:
+```bash
+# 1. Verify ECR image exists
+aws ecr describe-images \
+  --repository-name github-runners/github-runner \
+  --region us-gov-west-1
+
+# 2. If image doesn't exist, enable ECR cloning in Terraform
+# Edit default.auto.tfvars:
+#   enable_ecr_clone = true
+#   ecr_clone_images = ["github-runner"]
+
+# 3. Apply Terraform
+terraform apply -target=module.ecr-clone
+```
+
+#### If Resource Constraints:
+```bash
+# 1. Edit default.auto.tfvars
+#   task_cpu    = 4096  # Increase from 2048
+#   task_memory = 8192  # Increase from 4096
+
+# 2. Apply Terraform
+terraform apply
+```
+
+#### If Registration Token Issue:
+```bash
+# 1. Check token in Secrets Manager
+aws secretsmanager get-secret-value \
+  --secret-id <secret-arn> \
+  --region us-gov-west-1
+
+# 2. Manually trigger token refresh
+aws lambda invoke --function-name github-runner-token-refresh-<account> \
+  --region us-gov-west-1 \
+  /tmp/output.json
+
+# 3. Force service update to pick up new token
+aws ecs update-service \
+  --cluster ecs-ghe-runners-us-gov-west-1 \
+  --service SCT-Engineering \
+  --force-new-deployment \
+  --region us-gov-west-1
+```
+
+### Post-Resolution
+- [ ] Verify runner count has returned to desired count
+- [ ] Check alarm has cleared
+- [ ] Monitor for 30 minutes to ensure stability
+- [ ] Review and address any similar issues in logs
+
+---
+
+## Scenario 3: All Runners Down (EMERGENCY)
+
+### Detection
+- **CloudWatch Alarm**: `github-runners-emergency-all-down`
+- **Symptoms**: Zero running tasks, all workflows blocked
+- **Impact**: Complete CI/CD outage
+
+### Impact Assessment
+- **Severity**: EMERGENCY
+- **Time to Impact**: IMMEDIATE
+- **Affected Systems**: All GitHub workflows, entire CI/CD pipeline
+
+⚠️ **CRITICAL WARNING**: Deadlock Scenario Risk
+- **Normal operation**: ECS automatically restarts failed tasks
+- **Deadlock occurs when**: All runners down + registration token expired
+- **Why deadlock happens**: 
+  - ECS tries to start replacement tasks
+  - Tasks need valid registration token from Secrets Manager
+  - If token expired, new tasks fail to register with GitHub
+  - Tasks crash, ECS retries indefinitely with same expired token
+  - Manual intervention required to break the loop
+- **Time window**: Token expires 1 hour after last Lambda refresh
+- **Prevention**: Lambda refreshes every 30 min (50% safety margin)
+
+### Immediate Response (First 5 Minutes)
+
+1. **Assess Token Expiration Risk**
+   ```bash
+   # Check when token was last refreshed
+   aws secretsmanager describe-secret \
+     --secret-id <secret-arn> \
+     --region us-gov-west-1 \
+     --query 'LastChangedDate'
+   
+   # If token is >50 minutes old, we have ~10 minutes before deadlock
+   ```
+
+2. **Quick Status Check**
+   ```bash
+   # Check service status
+   aws ecs describe-services \
+     --cluster ecs-ghe-runners-us-gov-west-1 \
+     --services SCT-Engineering \
+     --region us-gov-west-1 \
+     --query 'services[0].[runningCount,desiredCount,deployments[0].status]'
+   ```
+
+3. **Check Recent Task Failures**
+   ```bash
+   # Get most recent task failure
+   aws ecs describe-tasks \
+     --cluster ecs-ghe-runners-us-gov-west-1 \
+     --tasks $(aws ecs list-tasks --cluster ecs-ghe-runners-us-gov-west-1 \
+       --service-name SCT-Engineering --desired-status STOPPED \
+       --region us-gov-west-1 --max-items 1 --query 'taskArns[0]' --output text) \
+     --region us-gov-west-1
+   ```
+
+### Investigation & Resolution
+
+#### Phase 1: Identify Root Cause
+```bash
+# Check ECS service events (last 20)
+aws ecs describe-services \
+  --cluster ecs-ghe-runners-us-gov-west-1 \
+  --services SCT-Engineering \
+  --region us-gov-west-1 \
+  --query 'services[0].events[:20]'
+
+# Check CloudWatch Logs for errors
+aws logs tail /ecs-ghe-runners/<org>-<account>-<region> \
+  --since 1h \
+  --region us-gov-west-1 \
+  --filter-pattern "ERROR"
+```
+
+#### Phase 2: Emergency Token Refresh
+If token is at risk of expiring:
+```bash
+# 1. Ensure GITHUB_TOKEN is valid
+[ -n "$GITHUB_TOKEN" ] && echo "GITHUB_TOKEN is set" || echo "GITHUB_TOKEN is NOT set"  # Do not echo the token itself
+
+# 2. Force Lambda token refresh
+aws lambda invoke \
+  --function-name github-runner-token-refresh-<account> \
+  --region us-gov-west-1 \
+  --log-type Tail \
+  /tmp/lambda-output.json && cat /tmp/lambda-output.json
+
+# 3. Verify new token in Secrets Manager
+aws secretsmanager get-secret-value \
+  --secret-id <secret-arn> \
+  --region us-gov-west-1 \
+  --query 'SecretString' \
+  --output text | jq -r '.token' | wc -c  # Should be ~255 chars
+```
+
+#### Phase 3: Resolve Service Issues
+
+**If Image Pull Failure:**
+```bash
+# Quick fix: Use known good image version
+# Edit default.auto.tfvars: image_version = "1.67.0"
+terraform apply -auto-approve
+```
+
+**If Resource Issues:**
+```bash
+# Check ECS cluster capacity
+aws ecs describe-clusters \
+  --clusters ecs-ghe-runners-us-gov-west-1 \
+  --region us-gov-west-1 \
+  --include ATTACHMENTS
+
+# Verify Fargate capacity providers
+aws ecs describe-capacity-providers \
+  --region us-gov-west-1
+```
+
+**If Network Issues:**
+```bash
+# Verify security group allows outbound HTTPS
+aws ec2 describe-security-groups \
+  --group-ids <security-group-id> \
+  --region us-gov-west-1
+
+# Check subnet has NAT gateway or internet access
+aws ec2 describe-subnets \
+  --subnet-ids <subnet-id> \
+  --region us-gov-west-1
+```
+
+**If Service Definition Issues:**
+```bash
+# Force new deployment with current configuration
+aws ecs update-service \
+  --cluster ecs-ghe-runners-us-gov-west-1 \
+  --service SCT-Engineering \
+  --force-new-deployment \
+  --region us-gov-west-1
+
+# Monitor deployment
+watch -n 5 'aws ecs describe-services \
+  --cluster ecs-ghe-runners-us-gov-west-1 \
+  --services SCT-Engineering \
+  --region us-gov-west-1 \
+  --query "services[0].[runningCount,desiredCount]" \
+  --output table'
+```
+
+### Escalation Criteria
+
+Escalate to senior engineering if:
+- All resolution attempts fail after 30 minutes
+- Token is <5 minutes from expiration and runners won't start
+- Infrastructure issues beyond runner service (AWS outage, VPC issues)
+- Security or compliance concerns
+
+### Post-Incident Requirements
+
+After service is restored:
+- [ ] **Document timeline** in incident tracking system
+- [ ] **Identify root cause** and contributing factors
+- [ ] **Update monitoring** if gaps were identified
+- [ ] **Create prevention tasks** (Jira tickets, backlog items)
+- [ ] **Conduct blameless postmortem** within 48 hours
+- [ ] **Update this runbook** with lessons learned
+
+---
+
+## Quick Reference Commands
+
+### Check Runner Status
+```bash
+aws ecs describe-services \
+  --cluster ecs-ghe-runners-us-gov-west-1 \
+  --services SCT-Engineering \
+  --region us-gov-west-1 \
+  --query 'services[0].[runningCount,desiredCount,deployments[0].status]' \
+  --output table
+```
+
+### View Recent Logs
+```bash
+aws logs tail /ecs-ghe-runners/<org>-<account>-<region> \
+  --follow --region us-gov-west-1
+```
+
+### Force Token Refresh
+```bash
+aws lambda invoke \
+  --function-name github-runner-token-refresh-<account> \
+  --region us-gov-west-1 \
+  /tmp/output.json && cat /tmp/output.json
+```
+
+### Force Service Redeployment
+```bash
+aws ecs update-service \
+  --cluster ecs-ghe-runners-us-gov-west-1 \
+  --service SCT-Engineering \
+  --force-new-deployment \
+  --region us-gov-west-1
+```
+
+### View CloudWatch Dashboard
+```bash
+# Get dashboard URL from Terraform output
+terraform output dashboard_url
+```
+
+---
+
+## Contacts
+
+- **Primary On-Call**: Check PagerDuty/on-call schedule
+- **CSVD Team Lead**: [Name/Contact]
+- **AWS Support**: Enterprise support portal
+- **GitHub Enterprise Support**: github.e.it.census.gov support
+
+## Related Documentation
+
+- [Monitoring Implementation Plan](./MONITORING_IMPLEMENTATION_PLAN.md)
+- [GitHub App Setup](./GITHUB_APP_SETUP.md)
+- [AWS Permissions](./AWS_PERMISSIONS.md)
+- [Security Review](./SECURITY_REVIEW.md)
+
+---
+
+**Version History**
+- 2026-01-15: Initial version created with monitoring implementation
diff --git a/aws_ecs_cluster_capacity_providers.fargate b/aws_ecs_cluster_capacity_providers.fargate
new file mode 100644
index 0000000..b72a076
--- /dev/null
+++ b/aws_ecs_cluster_capacity_providers.fargate
@@ -0,0 +1,42 @@
+{
+  "version": 4,
+  "terraform_version": "1.9.1",
+  "serial": 1,
+  "lineage": "a5f87bd9-a052-6497-cea8-6beed3ccdc9d",
+  "outputs": {},
+  "resources": [
+    {
+      "mode": "managed",
+      "type": "aws_ecs_cluster_capacity_providers",
+      "name": "fargate",
+      "provider": "provider[\"registry.terraform.io/hashicorp/aws\"]",
+      "instances": [
+        {
+          "schema_version": 0,
+          "attributes": {
+            "capacity_providers": [
+              "FARGATE"
+            ],
+            "cluster_name": "ecs-ghe-runners-us-gov-west-1",
+            "default_capacity_provider_strategy": [
+              {
+                "base": 1,
+                "capacity_provider": "FARGATE",
+                "weight": 100
+              }
+            ],
+            "id": "ecs-ghe-runners-us-gov-west-1"
+          },
+          "sensitive_attributes": [],
+          "private": "bnVsbA==",
+          "dependencies": [
+            "aws_ecs_cluster.github-runner",
+            "data.aws_ecs_cluster.github-runner",
+            "data.aws_region.current"
+          ]
+        }
+      ]
+    }
+  ],
+  "check_results": null
+}
diff --git a/default.auto.tfvars b/default.auto.tfvars
index 05e0979..818c65c 100644
--- a/default.auto.tfvars
+++ b/default.auto.tfvars
@@ -27,3 +27,6 @@ certs = {
 
 aws_account = "csvd-dev-ew"
 repo_org    = "SCT-Engineering"
+
+# Monitoring Configuration
+alert_email = "david.j.arnold.jr@census.gov"
diff --git a/example.auto.tfvars b/example.auto.tfvars
deleted file mode 100644
index 013d383..0000000
--- a/example.auto.tfvars
+++ /dev/null
@@ -1,90 +0,0 @@
-# Example Terraform Variables Configuration
-# Copy this file to a workspace-specific .tfvars file and customize
-# Example: csvd-229685449397-us-gov-east-1.auto.tfvars
-
-# =============================================================================
-# GitHub App Authentication (Required)
-# =============================================================================
-# See GITHUB_APP_SETUP.md for setup instructions
-# These values are organization-specific and must be configured per workspace
-
-github_app_id              = "123456"                         # Your GitHub App ID
-github_app_installation_id = "12345678"                       # Installation ID for your org
-github_app_pem_file        = "~/.github-apps/runner-mgmt.pem" # Path to private key
-
-# =============================================================================
-# GitHub Configuration (Required)
-# =============================================================================
-
-repo_org   = "CSVD"                           # GitHub organization name
-server_url = "https://github.e.it.census.gov" # GitHub Enterprise URL
-
-# =============================================================================
-# AWS Configuration (Required)
-# =============================================================================
-
-aws_account = "csvd-dev-ew"   # AWS account identifier
-region      = "us-gov-east-1" # AWS region
-
-# Network Configuration
-vpc_id = "vpc-0abc123def456789" # VPC ID for runner deployment
-subnets = [                     # Private subnet IDs
-  "subnet-0abc123",
-  "subnet-0def456"
-]
-security_groups = ["sg-0xyz789abc"] # Security group IDs
-
-# =============================================================================
-# Runner Configuration (Required)
-# =============================================================================
-
-image_name    = "github-runner" # Container image name
-image_version = "2.311.0"       # GitHub Actions runner version
-desired_count = 3               # Number of concurrent runners
-
-# =============================================================================
-# Task Configuration (Optional)
-# =============================================================================
-
-task_cpu    = 1024 # Task CPU (1 vCPU = 1024)
-task_memory = 2048 # Task memory in MB
-
-# =============================================================================
-# Labels Configuration (Optional)
-# =============================================================================
-# Additional labels for runner identification in workflows
-# Default labels are automatically added: account ID, account name, region
-
-labels = [
-  "ecs",
-  "fargate",
-  "self-hosted"
-]
-
-# =============================================================================
-# Network Configuration (Optional)
-# =============================================================================
-
-assign_public_ip = false                       # Assign public IP to tasks
-proxy_enabled    = true                        # Enable corporate proxy
-proxy_url        = "proxy.tco.census.gov:3128" # Proxy URL
-
-# VPC Endpoints (reduces NAT Gateway costs)
-create_vpc_endpoint = false # Create VPC endpoints for AWS services
-
-# =============================================================================
-# Monitoring Configuration (Optional)
-# =============================================================================
-
-log_retention_days = 7 # CloudWatch log retention
-
-# =============================================================================
-# Tags (Optional)
-# =============================================================================
-
-tags = {
-  Environment = "development"
-  ManagedBy   = "Terraform"
-  Project     = "GitHub Actions Runners"
-  Owner       = "DevOps Team"
-}
diff --git a/example.tfvars.template b/example.tfvars.template
new file mode 100644
index 0000000..013b614
--- /dev/null
+++ b/example.tfvars.template
@@ -0,0 +1,111 @@
+# Example Terraform Variables Configuration
+# Populated from running service: arn:aws-us-gov:ecs:us-gov-west-1:229685449397:service/ecs-ghe-runners-us-gov-west-1/SCT-Engineering
+# Last updated: 2026-01-12
+
+# =============================================================================
+# GitHub App Authentication (Required)
+# =============================================================================
+# See GITHUB_APP_SETUP.md for setup instructions
+# These values are organization-specific and must be configured per workspace
+# NOTE: Configure these with your actual GitHub App credentials
+
+github_app_id              = "YOUR_GITHUB_APP_ID"        # Your GitHub App ID
+github_app_installation_id = "YOUR_INSTALLATION_ID"      # Installation ID for your org
+github_app_pem_file        = "~/.github-apps/runner-mgmt.pem" # Path to private key
+
+# =============================================================================
+# GitHub Configuration (Required)
+# =============================================================================
+
+repo_org   = "SCT-Engineering"                 # GitHub organization name (from running service)
+server_url = "https://github.e.it.census.gov" # GitHub Enterprise URL (from running service)
+
+# =============================================================================
+# AWS Configuration (Required)
+# =============================================================================
+
+aws_account = "csvd-dev-ew"      # AWS account identifier; spelling aligned with default.auto.tfvars (was "cvsd-dev-ew")
+region      = "us-gov-west-1"    # AWS region (from running service)
+
+# Network Configuration (from running service)
+vpc_id = "vpc-00576a396ec570b94"    # VPC ID for runner deployment
+subnets = [                          # Private subnet IDs
+  "subnet-0b1992a84536c581b"
+]
+security_groups = ["sg-0641c697588b9aa6b"]  # Security group IDs
+
+# =============================================================================
+# Runner Configuration (Required)
+# =============================================================================
+
+image_name    = "github-runner"  # Container image name (from running service)
+image_version = "1.69.0"         # GitHub Actions runner version (from running service)
+desired_count = 1                # Number of concurrent runners (from running service)
+
+# =============================================================================
+# Task Configuration (from running service)
+# =============================================================================
+
+task_cpu    = 2048  # Task CPU (2 vCPUs) - from running service
+task_memory = 4096  # Task memory in MB (4 GB) - from running service
+
+# =============================================================================
+# Labels Configuration (from running service)
+# =============================================================================
+# Labels extracted from running service:
+# cvsd-dev-ew, sct-engineering, 229685449397-us-gov-west-1, 229685449397,
+# us-gov-west-1, ecs-github-runner, ubuntu-latest
+
+labels = [
+  "ecs",
+  "fargate",
+  "self-hosted"
+]
+
+# =============================================================================
+# Network Configuration (from running service)
+# =============================================================================
+
+assign_public_ip = false                       # Assign public IP to tasks (DISABLED in service)
+proxy_enabled    = true                        # Enable corporate proxy
+proxy_url        = "proxy.tco.census.gov:3128" # Proxy URL (from running service)
+
+# VPC Endpoints (reduces NAT Gateway costs)
+create_vpc_endpoint = false # VPC endpoints not used in running service
+
+# =============================================================================
+# Monitoring Configuration
+# =============================================================================
+
+log_retention_days = 7  # CloudWatch log retention
+alert_email        = "david.j.arnold.jr@census.gov"  # Email for monitoring alerts
+
+# =============================================================================
+# ECS Cluster Configuration
+# =============================================================================
+
+ecs_cluster_name   = "ecs-ghe-runners"  # Cluster name (derived from service)
+create_ecs_cluster = true               # Create ECS cluster if it doesn't exist
+
+# =============================================================================
+# Certificates Configuration (from running service)
+# =============================================================================
+
+certs = {
+  bucket = "csvd-dev-ew-github-actions"  # S3 bucket for certificates
+  key    = "katello-server-ca.pem"       # Certificate file key
+}
+
+# =============================================================================
+# Tags (Optional)
+# =============================================================================
+
+tags = {
+  Environment = "development"
+  ManagedBy   = "Terraform"
+  Project     = "GitHub Actions Runners"
+  Owner       = "DevOps Team"
+  Account     = "229685449397"
+  Region      = "us-gov-west-1"
+}
+
diff --git a/lambda/requirements_pat.txt b/lambda/requirements_pat.txt
new file mode 100644
index 0000000..6de9a68
--- /dev/null
+++ b/lambda/requirements_pat.txt
@@ -0,0 +1,4 @@
+# Python dependencies for GitHub Runner Token Refresh Lambda
+# PAT-based authentication (simplified)
+
+urllib3>=2.0.0
diff --git a/lambda/token_refresh_pat.py b/lambda/token_refresh_pat.py
new file mode 100644
index 0000000..4b15a1c
--- /dev/null
+++ b/lambda/token_refresh_pat.py
@@ -0,0 +1,191 @@
+"""
+Lambda function to refresh GitHub Actions runner registration tokens.
+
+CRITICAL CONTEXT:
+- Runners are persistent, long-running ECS containers (not ephemeral)
+- Registration tokens are ONLY used during container startup
+- Running runners don't need token refresh (already registered)
+- This Lambda ensures valid tokens are available for ECS task restarts
+
+Purpose: Prevent deadlock scenario where:
+1. All runner containers die (crash, deployment failure, etc.)
+2. Registration token expires (1 hour lifetime)
+3. ECS tries to start replacement tasks but they fail (expired token)
+4. Manual intervention required to break the loop
+
+Schedule: Runs every 30 minutes via EventBridge (50% safety margin)
+Authentication: Uses Personal Access Token (PAT) from environment variable
+"""
+
+import boto3
+import json
+import os
+import urllib3
+from typing import Dict, Any
+
+# Initialize AWS clients
+secrets_manager = boto3.client('secretsmanager')
+http = urllib3.PoolManager()
+
+
+def get_github_registration_token(github_url: str, org: str, access_token: str) -> Dict[str, str]:
+    """
+    Retrieve a fresh GitHub Actions registration token from the GitHub API.
+    
+    Args:
+        github_url: Base GitHub Enterprise URL
+        org: GitHub organization name
+        access_token: GitHub Personal Access Token
+        
+    Returns:
+        Dict with 'token' and 'expires_at' keys
+        
+    Raises:
+        Exception: If GitHub API request fails
+    """
+    api_url = f"{github_url}/api/v3/orgs/{org}/actions/runners/registration-token"
+    
+    headers = {
+        'Authorization': f'token {access_token}',
+        'Accept': 'application/vnd.github.v3+json',
+        'User-Agent': 'AWS-Lambda-GitHub-Runner-Token-Refresh'
+    }
+    
+    print(f"Requesting registration token from: {api_url}")
+    
+    response = http.request(
+        'POST',
+        api_url,
+        headers=headers
+    )
+    
+    if response.status == 201:
+        data = json.loads(response.data.decode('utf-8'))
+        token = data.get('token')
+        expires_at = data.get('expires_at')
+        
+        print(f"Successfully retrieved registration token (expires: {expires_at})")
+        return {
+            'token': token,
+            'expires_at': expires_at
+        }
+    else:
+        error_msg = f"GitHub API request failed with status {response.status}: {response.data.decode('utf-8')}"
+        print(error_msg)
+        raise Exception(error_msg)
+
+
+def update_secrets_manager(secret_name: str, token_data: Dict[str, str]) -> None:
+    """
+    Update the GitHub registration token in AWS Secrets Manager.
+    
+    Args:
+        secret_name: Name/ARN of the secret in Secrets Manager
+        token_data: Dict with token and expiration info
+        
+    Raises:
+        Exception: If Secrets Manager update fails
+    """
+    try:
+        print(f"Updating secret: {secret_name}")
+        
+        # Store as JSON with token and metadata
+        secret_value = json.dumps(token_data)
+        
+        secrets_manager.put_secret_value(
+            SecretId=secret_name,
+            SecretString=secret_value
+        )
+        
+        print(f"Successfully updated secret: {secret_name}")
+        
+    except Exception as e:
+        error_msg = f"Failed to update Secrets Manager: {str(e)}"
+        print(error_msg)
+        raise Exception(error_msg)
+
+
+def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
+    """
+    Lambda handler function triggered by EventBridge.
+    
+    Environment Variables Required:
+        GITHUB_TOKEN: GitHub Personal Access Token with admin:org scope
+        GITHUB_ORG: GitHub organization name
+        GITHUB_URL: GitHub Enterprise base URL
+        SECRET_NAME: Name/ARN of the secret in Secrets Manager
+        
+    Args:
+        event: EventBridge event (not used)
+        context: Lambda context object
+        
+    Returns:
+        Response dict with status code and message
+    """
+    print("=== GitHub Runner Token Refresh Lambda ===")
+    print(f"Request ID: {context.request_id}")
+    print(f"Function: {context.function_name}")
+    
+    # Get environment variables
+    github_token = os.environ.get('GITHUB_TOKEN')
+    github_org = os.environ.get('GITHUB_ORG')
+    github_url = os.environ.get('GITHUB_URL')
+    secret_name = os.environ.get('SECRET_NAME')
+    
+    # Validate environment variables
+    required_vars = {
+        'GITHUB_TOKEN': github_token,
+        'GITHUB_ORG': github_org,
+        'GITHUB_URL': github_url,
+        'SECRET_NAME': secret_name
+    }
+    
+    missing = [var for var, val in required_vars.items() if not val]
+    if missing:
+        error_msg = f"Missing required environment variables: {', '.join(missing)}"
+        print(error_msg)
+        return {
+            'statusCode': 500,
+            'body': json.dumps({'error': error_msg})
+        }
+
+    # All required environment variables are present
+    assert github_token is not None
+    assert github_org is not None
+    assert github_url is not None
+    assert secret_name is not None
+
+    print(f"GitHub Org: {github_org}")
+    print(f"GitHub URL: {github_url}")
+    print(f"Secret Name: {secret_name}")
+    print(f"GitHub Token: {'*' * (len(github_token) - 4)}{github_token[-4:] if len(github_token) > 4 else '****'}")
+    
+    try:
+        # Get fresh registration token from GitHub using PAT
+        print("Requesting runner registration token...")
+        token_data = get_github_registration_token(github_url, github_org, github_token)
+
+        # Update Secrets Manager with new token
+        update_secrets_manager(secret_name, token_data)
+
+        success_msg = "Token refreshed successfully"
+        print(f"=== {success_msg} ===")
+
+        return {
+            'statusCode': 200,
+            'body': json.dumps({
+                'message': success_msg,
+                'secret_name': secret_name,
+                'github_org': github_org,
+                'expires_at': token_data.get('expires_at')
+            })
+        }
+
+    except Exception as e:
+        error_msg = f"Token refresh failed: {str(e)}"
+        print(f"=== ERROR: {error_msg} ===")
+
+        return {
+            'statusCode': 500,
+            'body': json.dumps({'error': error_msg})
+        }
diff --git a/lambda_token_refresh.tf b/lambda_token_refresh.tf
index ad8b466..76afb3f 100644
--- a/lambda_token_refresh.tf
+++ b/lambda_token_refresh.tf
@@ -1,5 +1,16 @@
 # Lambda function to automatically refresh GitHub Actions registration tokens
-# This prevents token expiration issues by refreshing the token every 30 minutes
+# 
+# IMPORTANT: Runners are persistent, long-running containers. The registration token
+# is ONLY needed during container startup. This Lambda ensures that whenever ECS
+# needs to restart a task (crash, deployment, scale-up), a valid token is available.
+#
+# Token Lifecycle:
+# - Running runners: Already registered, don't need token refresh
+# - Token refresh purpose: Insurance for ECS automatic task recovery
+# - Refresh interval: Every 30 minutes (tokens expire in 1 hour)
+# - Critical for: Preventing deadlock when all runners down + token expires
+#
+# Authentication: Uses Personal Access Token (GITHUB_TOKEN environment variable)
 
 locals {
   lambda_function_name = "github-runner-token-refresh-${var.aws_account}"
@@ -8,8 +19,8 @@ locals {
 # Install Python dependencies locally for Lambda packaging
 resource "null_resource" "lambda_dependencies" {
   triggers = {
-    requirements = filemd5("${path.module}/lambda/requirements.txt")
-    source_code  = filemd5("${path.module}/lambda/token_refresh.py")
+    requirements = filemd5("${path.module}/lambda/requirements_pat.txt")
+    source_code  = filemd5("${path.module}/lambda/token_refresh_pat.py")
   }
 
   provisioner "local-exec" {
@@ -17,8 +28,8 @@ resource "null_resource" "lambda_dependencies" {
       cd ${path.module}/lambda
       rm -rf package
       mkdir -p package
-      pip3 install --target package -r requirements.txt --platform manylinux2014_x86_64 --only-binary=:all:
-      cp token_refresh.py package/
+      pip3 install --target package -r requirements_pat.txt --platform manylinux2014_x86_64 --only-binary=:all:
+      cp token_refresh_pat.py package/token_refresh.py
     EOT
   }
 }
@@ -44,12 +55,10 @@ resource "aws_lambda_function" "token_refresh" {
 
   environment {
     variables = {
-      GITHUB_APP_ID              = var.github_app_id
-      GITHUB_APP_INSTALLATION_ID = var.github_app_installation_id
-      GITHUB_APP_PEM_FILE        = var.github_app_pem_file
-      GITHUB_ORG                 = var.repo_org
-      GITHUB_URL                 = var.server_url
-      SECRET_NAME                = aws_secretsmanager_secret.secret.name
+      GITHUB_TOKEN = var.github_token
+      GITHUB_ORG   = var.repo_org
+      GITHUB_URL   = var.server_url
+      SECRET_NAME  = module.github-runner.secret_name
     }
   }
 
@@ -124,7 +133,7 @@ resource "aws_iam_role_policy" "lambda_refresh_policy" {
           "secretsmanager:GetSecretValue",
           "secretsmanager:PutSecretValue"
         ]
-        Resource = aws_secretsmanager_secret.secret.arn
+        Resource = module.github-runner.secret_arn
       },
       {
         Effect = "Allow"
@@ -133,7 +142,7 @@ resource "aws_iam_role_policy" "lambda_refresh_policy" {
           "logs:CreateLogStream",
           "logs:PutLogEvents"
         ]
-        Resource = "arn:aws:logs:*:*:log-group:/aws/lambda/${local.lambda_function_name}:*"
+        Resource = "arn:${data.aws_partition.current.partition}:logs:*:*:log-group:/aws/lambda/${local.lambda_function_name}:*"
       }
     ]
   })
@@ -150,7 +159,7 @@ resource "aws_cloudwatch_log_group" "lambda_logs" {
   }
 }
 
-# CloudWatch Alarm for Lambda failures
+# CloudWatch Alarm for Lambda failures - connected to SNS
 resource "aws_cloudwatch_metric_alarm" "lambda_errors" {
   alarm_name          = "${local.lambda_function_name}-errors"
   comparison_operator = "GreaterThanThreshold"
@@ -163,6 +172,9 @@ resource "aws_cloudwatch_metric_alarm" "lambda_errors" {
   alarm_description   = "Alert when Lambda token refresh fails"
   treat_missing_data  = "notBreaching"
 
+  alarm_actions = [aws_sns_topic.github_runner_critical_alerts.arn]
+  ok_actions    = [aws_sns_topic.github_runner_critical_alerts.arn]
+
   dimensions = {
     FunctionName = aws_lambda_function.token_refresh.function_name
   }
@@ -173,13 +185,26 @@ resource "aws_cloudwatch_metric_alarm" "lambda_errors" {
   }
 }
 
+# Data source for AWS partition (for ARN construction)
+data "aws_partition" "current" {}
+
 # Output Lambda function details
 output "lambda_token_refresh_function_name" {
   description = "Name of the Lambda function that refreshes GitHub tokens"
   value       = aws_lambda_function.token_refresh.function_name
 }
 
+output "lambda_token_refresh_function_arn" {
+  description = "ARN of the Lambda function that refreshes GitHub tokens"
+  value       = aws_lambda_function.token_refresh.arn
+}
+
 output "lambda_token_refresh_schedule" {
   description = "Schedule for automatic token refresh"
   value       = aws_cloudwatch_event_rule.token_refresh_schedule.schedule_expression
 }
+
+output "lambda_token_refresh_log_group" {
+  description = "CloudWatch log group for Lambda function"
+  value       = aws_cloudwatch_log_group.lambda_logs.name
+}
diff --git a/lambda_token_refresh.tf.tmp b/lambda_token_refresh.tf.tmp
new file mode 100644
index 0000000..ad8b466
--- /dev/null
+++ b/lambda_token_refresh.tf.tmp
@@ -0,0 +1,185 @@
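+# BACKUP COPY of the previous lambda_token_refresh.tf (GitHub App authentication); Terraform does not load files ending in .tf.tmp.
+# Original purpose: Lambda that refreshes the GitHub Actions registration token every 30 minutes to prevent token expiration issues.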
+
+locals {
+  lambda_function_name = "github-runner-token-refresh-${var.aws_account}"
+}
+
+# Install Python dependencies locally for Lambda packaging
+resource "null_resource" "lambda_dependencies" {
+  triggers = {
+    requirements = filemd5("${path.module}/lambda/requirements.txt")
+    source_code  = filemd5("${path.module}/lambda/token_refresh.py")
+  }
+
+  provisioner "local-exec" {
+    command = <<-EOT
+      cd ${path.module}/lambda
+      rm -rf package
+      mkdir -p package
+      pip3 install --target package -r requirements.txt --platform manylinux2014_x86_64 --only-binary=:all:
+      cp token_refresh.py package/
+    EOT
+  }
+}
+
+# Create ZIP file for Lambda deployment with dependencies
+data "archive_file" "token_refresh_lambda" {
+  type        = "zip"
+  source_dir  = "${path.module}/lambda/package"
+  output_path = "${path.module}/lambda/token_refresh.zip"
+
+  depends_on = [null_resource.lambda_dependencies]
+}
+
+# Lambda function
+resource "aws_lambda_function" "token_refresh" {
+  filename         = data.archive_file.token_refresh_lambda.output_path
+  function_name    = local.lambda_function_name
+  role             = aws_iam_role.lambda_refresh_role.arn
+  handler          = "token_refresh.lambda_handler"
+  source_code_hash = data.archive_file.token_refresh_lambda.output_base64sha256
+  runtime          = "python3.11"
+  timeout          = 60
+
+  environment {
+    variables = {
+      GITHUB_APP_ID              = var.github_app_id
+      GITHUB_APP_INSTALLATION_ID = var.github_app_installation_id
+      GITHUB_APP_PEM_FILE        = var.github_app_pem_file
+      GITHUB_ORG                 = var.repo_org
+      GITHUB_URL                 = var.server_url
+      SECRET_NAME                = aws_secretsmanager_secret.secret.name
+    }
+  }
+
+  tags = {
+    Name        = local.lambda_function_name
+    Environment = var.aws_account
+    Purpose     = "GitHub Runner Token Refresh"
+  }
+}
+
+# CloudWatch Event Rule - trigger every 30 minutes
+resource "aws_cloudwatch_event_rule" "token_refresh_schedule" {
+  name                = "${local.lambda_function_name}-schedule"
+  description         = "Refresh GitHub runner registration token every 30 minutes"
+  schedule_expression = "rate(30 minutes)"
+
+  tags = {
+    Name        = "${local.lambda_function_name}-schedule"
+    Environment = var.aws_account
+  }
+}
+
+# CloudWatch Event Target
+resource "aws_cloudwatch_event_target" "token_refresh_target" {
+  rule      = aws_cloudwatch_event_rule.token_refresh_schedule.name
+  target_id = "RefreshTokenLambda"
+  arn       = aws_lambda_function.token_refresh.arn
+}
+
+# Allow EventBridge to invoke Lambda
+resource "aws_lambda_permission" "allow_eventbridge" {
+  statement_id  = "AllowExecutionFromEventBridge"
+  action        = "lambda:InvokeFunction"
+  function_name = aws_lambda_function.token_refresh.function_name
+  principal     = "events.amazonaws.com"
+  source_arn    = aws_cloudwatch_event_rule.token_refresh_schedule.arn
+}
+
+# IAM Role for Lambda
+resource "aws_iam_role" "lambda_refresh_role" {
+  name = "${local.lambda_function_name}-role"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [{
+      Action = "sts:AssumeRole"
+      Effect = "Allow"
+      Principal = {
+        Service = "lambda.amazonaws.com"
+      }
+    }]
+  })
+
+  tags = {
+    Name        = "${local.lambda_function_name}-role"
+    Environment = var.aws_account
+  }
+}
+
+# IAM Policy for Lambda
+resource "aws_iam_role_policy" "lambda_refresh_policy" { # inline policy attached to the token-refresh Lambda's execution role
+  name = "token-refresh-policy"
+  role = aws_iam_role.lambda_refresh_role.id
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow" # statement 1: read/write access scoped to the single runner-token secret
+        Action = [
+          "secretsmanager:UpdateSecret",
+          "secretsmanager:GetSecretValue",
+          "secretsmanager:PutSecretValue"
+        ]
+        Resource = aws_secretsmanager_secret.secret.arn
+      },
+      {
+        Effect = "Allow" # statement 2: lets the function write to its own /aws/lambda/<function name> log group
+        Action = [
+          "logs:CreateLogGroup",
+          "logs:CreateLogStream",
+          "logs:PutLogEvents"
+        ]
+        Resource = "arn:aws:logs:*:*:log-group:/aws/lambda/${local.lambda_function_name}:*" # NOTE(review): hard-coded "aws" partition; lambda_token_refresh.tf builds this ARN from data.aws_partition.current instead
+      }
+    ]
+  })
+}
+
+# CloudWatch Log Group for Lambda
+resource "aws_cloudwatch_log_group" "lambda_logs" {
+  name              = "/aws/lambda/${local.lambda_function_name}"
+  retention_in_days = 7
+
+  tags = {
+    Name        = "${local.lambda_function_name}-logs"
+    Environment = var.aws_account
+  }
+}
+
+# CloudWatch Alarm for Lambda failures
+resource "aws_cloudwatch_metric_alarm" "lambda_errors" { # alarm on the AWS/Lambda Errors metric of the token-refresh function
+  alarm_name          = "${local.lambda_function_name}-errors"
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = 2 # NOTE(review): two consecutive periods must breach, yet the schedule above is rate(30 minutes) — confirm this can alarm
+  metric_name         = "Errors"
+  namespace           = "AWS/Lambda"
+  period              = 300
+  statistic           = "Sum"
+  threshold           = 1 # strictly greater-than comparison: Sum(Errors) must exceed 1 within a period
+  alarm_description   = "Alert when Lambda token refresh fails" # no alarm_actions/ok_actions are set in this copy
+  treat_missing_data  = "notBreaching"
+
+  dimensions = {
+    FunctionName = aws_lambda_function.token_refresh.function_name # restricts the metric to this one function
+  }
+
+  tags = {
+    Name        = "${local.lambda_function_name}-errors"
+    Environment = var.aws_account
+  }
+}
+
+# Output Lambda function details
+output "lambda_token_refresh_function_name" {
+  description = "Name of the Lambda function that refreshes GitHub tokens"
+  value       = aws_lambda_function.token_refresh.function_name
+}
+
+output "lambda_token_refresh_schedule" {
+  description = "Schedule for automatic token refresh"
+  value       = aws_cloudwatch_event_rule.token_refresh_schedule.schedule_expression
+}
diff --git a/monitoring.tf b/monitoring.tf
new file mode 100644
index 0000000..c2fff76
--- /dev/null
+++ b/monitoring.tf
@@ -0,0 +1,227 @@
+# GitHub Runner Monitoring Infrastructure
+# 
+# This file implements critical monitoring for GitHub Actions runners on ECS Fargate
+#
+# RUNNER MODEL: Persistent, long-running containers (not ephemeral)
+# - Runners stay online 24/7, handling multiple jobs
+# - Only restart on: task failure, manual stop, service deployment
+# - Monitoring tracks runner CONTAINER health, not individual job execution
+#
+# Key monitoring areas:
+# 1. SNS alerting for critical events
+# 2. Lambda token refresh monitoring (no Lambda alarm is defined in this file - TODO confirm where it lives)
+# 3. Runner availability alarms (container count, not job count)
+# 4. CloudWatch dashboard for visibility
+
+# ==============================================================================
+# SNS Topic and Subscriptions for Critical Alerts
+# ==============================================================================
+
+resource "aws_sns_topic" "github_runner_critical_alerts" { # topic the runner alarms in this file publish ALARM and OK transitions to
+  name         = "github-runner-critical-alerts-${data.aws_caller_identity.current.account_id}-${data.aws_region.current.name}" # suffixed with account ID and region
+  display_name = "GitHub Runner Critical Alerts"
+
+  tags = {
+    Name        = "github-runner-critical-alerts"
+    Environment = var.aws_account
+    Purpose     = "Critical alerting for GitHub Actions runners"
+  }
+}
+
+resource "aws_sns_topic_subscription" "alert_email" { # delivers every message on the critical-alerts topic to var.alert_email
+  topic_arn = aws_sns_topic.github_runner_critical_alerts.arn
+  protocol  = "email" # NOTE(review): email subscriptions presumably stay pending until the recipient confirms - verify after apply
+  endpoint  = var.alert_email
+}
+
+# ==============================================================================
+# Runner Availability Alarms
+# ==============================================================================
+
+# Critical alarm: average running task count below half of var.desired_count for two consecutive 5-minute periods
+resource "aws_cloudwatch_metric_alarm" "runners_critical" {
+  alarm_name          = "github-runners-critical-capacity-${var.aws_account}"
+  comparison_operator = "LessThanThreshold"
+  evaluation_periods  = 2
+  metric_name         = "RunningTaskCount" # Container Insights spells it with singular "Task"; a misspelt name never receives data
+  namespace           = "ECS/ContainerInsights"
+  period              = 300
+  statistic           = "Average"
+  threshold           = ceil(var.desired_count * 0.5) # e.g. desired_count = 4 -> alarm when the average drops below 2
+  alarm_description   = "CRITICAL: GitHub runners below 50% capacity. Current capacity may not handle workload."
+  treat_missing_data  = "breaching" # missing datapoints are treated as an outage
+
+  alarm_actions = [aws_sns_topic.github_runner_critical_alerts.arn]
+  ok_actions    = [aws_sns_topic.github_runner_critical_alerts.arn] # also notify on recovery
+
+  dimensions = {
+    ClusterName = var.create_ecs_cluster ? aws_ecs_cluster.github-runner[0].name : data.aws_ecs_cluster.github-runner[0].cluster_name
+    ServiceName = var.repo_org # NOTE(review): assumes the ECS service is named after var.repo_org - confirm against the runner module
+  }
+
+  tags = {
+    Name        = "github-runners-critical-capacity"
+    Environment = var.aws_account
+    Severity    = "critical"
+  }
+}
+
+# Emergency alarm: no runner task was running at any point during a 1-minute period
+resource "aws_cloudwatch_metric_alarm" "runners_emergency" {
+  alarm_name          = "github-runners-emergency-all-down-${var.aws_account}"
+  comparison_operator = "LessThanOrEqualToThreshold"
+  evaluation_periods  = 1
+  metric_name         = "RunningTaskCount" # Container Insights spells it with singular "Task"; a misspelt name never receives data
+  namespace           = "ECS/ContainerInsights"
+  period              = 60
+  statistic           = "Maximum" # Maximum <= 0 means not a single task was seen running in the period
+  threshold           = 0
+  alarm_description   = "EMERGENCY: All GitHub runners are down! Workflows cannot execute. Token refresh may fail."
+  treat_missing_data  = "breaching" # missing datapoints are treated as an outage
+
+  alarm_actions = [aws_sns_topic.github_runner_critical_alerts.arn]
+  ok_actions    = [aws_sns_topic.github_runner_critical_alerts.arn] # also notify on recovery
+
+  dimensions = {
+    ClusterName = var.create_ecs_cluster ? aws_ecs_cluster.github-runner[0].name : data.aws_ecs_cluster.github-runner[0].cluster_name
+    ServiceName = var.repo_org # NOTE(review): assumes the ECS service is named after var.repo_org - confirm against the runner module
+  }
+
+  tags = {
+    Name        = "github-runners-emergency-all-down"
+    Environment = var.aws_account
+    Severity    = "emergency"
+  }
+}
+
+# ==============================================================================
+# CloudWatch Dashboard
+# ==============================================================================
+
+resource "aws_cloudwatch_dashboard" "github_runners" { # four widgets: runner count, CPU/memory, alarm status, recent error logs
+  dashboard_name = "github-runners-${var.aws_account}"
+
+  dashboard_body = jsonencode({
+    widgets = [
+      # Runner count widget: average, maximum and minimum of the running task count
+      {
+        type = "metric"
+        properties = {
+          metrics = [
+            ["ECS/ContainerInsights", "RunningTaskCount", "ClusterName", var.create_ecs_cluster ? aws_ecs_cluster.github-runner[0].name : data.aws_ecs_cluster.github-runner[0].cluster_name, "ServiceName", var.repo_org, { stat = "Average" }],
+            ["...", { stat = "Maximum" }], # "..." repeats the metric and dimensions of the row above
+            ["...", { stat = "Minimum" }]
+          ]
+          title  = "GitHub Runner Count (Running Tasks)"
+          region = data.aws_region.current.name
+          yAxis  = { left = { min = 0 } }
+          period = 300
+          annotations = {
+            horizontal = [
+              {
+                label = "Desired Count"
+                value = var.desired_count
+              },
+              {
+                label = "Critical Threshold (50%)"
+                value = ceil(var.desired_count * 0.5) # same expression as the runners_critical alarm threshold
+              }
+            ]
+          }
+        }
+        width  = 12
+        height = 6
+        x      = 0
+        y      = 0
+      },
+
+      # ECS service CPU (left axis) and memory (right axis)
+      {
+        type = "metric"
+        properties = {
+          metrics = [
+            ["ECS/ContainerInsights", "CpuUtilized", "ClusterName", var.create_ecs_cluster ? aws_ecs_cluster.github-runner[0].name : data.aws_ecs_cluster.github-runner[0].cluster_name, "ServiceName", var.repo_org],
+            [".", "MemoryUtilized", ".", ".", ".", ".", { yAxis = "right" }] # "." repeats the value in the same position of the row above
+          ]
+          title  = "Runner Resource Utilization"
+          region = data.aws_region.current.name
+          yAxis = {
+            left  = { label = "CPU (units)", min = 0 } # CpuUtilized is reported in CPU units (1024 units = 1 vCPU), not vCPUs
+            right = { label = "Memory (MB)", min = 0 }
+          }
+          period = 300
+        }
+        width  = 12
+        height = 6
+        x      = 12
+        y      = 0
+      },
+
+      # Alarm status for the two runner alarms defined in this file
+      {
+        type = "alarm"
+        properties = {
+          title = "Critical Alarms Status"
+          alarms = [
+            aws_cloudwatch_metric_alarm.runners_critical.arn,
+            aws_cloudwatch_metric_alarm.runners_emergency.arn
+          ]
+        }
+        width  = 8
+        height = 4
+        x      = 0
+        y      = 6
+      },
+
+      # Logs Insights query: 20 most recent log lines matching error/fail/exception (case-insensitive)
+      {
+        type = "log"
+        properties = {
+          query  = <<-EOQ
+            SOURCE '/ecs-ghe-runners/${var.repo_org}-${data.aws_caller_identity.current.account_id}-${data.aws_region.current.name}'
+            | fields @timestamp, @message
+            | filter @message like /error|fail|exception/i
+            | sort @timestamp desc
+            | limit 20
+          EOQ
+          region = data.aws_region.current.name
+          title  = "Recent Error Events"
+        }
+        width  = 16
+        height = 4
+        x      = 8
+        y      = 6
+      }
+    ]
+  })
+}
+
+# ==============================================================================
+# Outputs
+# ==============================================================================
+
+output "sns_topic_arn" { # ARN of aws_sns_topic.github_runner_critical_alerts
+  description = "ARN of the SNS topic for critical alerts"
+  value       = aws_sns_topic.github_runner_critical_alerts.arn
+}
+
+output "sns_topic_name" { # name of aws_sns_topic.github_runner_critical_alerts (includes account ID and region)
+  description = "Name of the SNS topic for critical alerts"
+  value       = aws_sns_topic.github_runner_critical_alerts.name
+}
+
+output "dashboard_name" { # resolves to "github-runners-<var.aws_account>"
+  description = "Name of the CloudWatch dashboard"
+  value       = aws_cloudwatch_dashboard.github_runners.dashboard_name
+}
+
+output "dashboard_url" { # console host depends on the partition: GovCloud regions (us-gov-*) use console.amazonaws-us-gov.com
+  description = "URL to access the CloudWatch dashboard"
+  value       = "https://${can(regex("^us-gov-", data.aws_region.current.name)) ? "console.amazonaws-us-gov.com" : "console.aws.amazon.com"}/cloudwatch/home?region=${data.aws_region.current.name}#dashboards:name=${aws_cloudwatch_dashboard.github_runners.dashboard_name}"
+}
+
+output "alert_email" { # echoes var.alert_email, the endpoint of the SNS email subscription
+  description = "Email address receiving alerts"
+  value       = var.alert_email
+  sensitive   = true # marked sensitive so the address is hidden in CLI output
+}
diff --git a/providers.tf b/providers.tf
index 694938d..023b890 100644
--- a/providers.tf
+++ b/providers.tf
@@ -11,20 +11,16 @@ terraform {
   }
 }
 
-# Generate GitHub App token for authentication
-data "github_app_token" "app" {
-  app_id          = var.github_app_id
-  installation_id = var.github_app_installation_id
-  pem_file        = var.github_app_pem_file
-}
-
+# GitHub provider authenticates with the token in the GITHUB_TOKEN environment variable
 provider "github" {
-  organization = var.repo_org
-  base_url     = var.base_url
-  token        = data.github_app_token.app.token
+  owner    = var.repo_org
+  base_url = var.base_url
+  # no token argument is set here; the provider reads it from the GITHUB_TOKEN env var
 }
 
 provider "aws" {
+  region = "us-gov-west-1"
+
   default_tags {
     tags = {
       finops_project_name   = "csvd_github_actions"
@@ -33,4 +29,4 @@ provider "aws" {
       organization          = "census:ocio:csvd"
     }
   }
-}x
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/environment b/terraform_data_dirs/csvd/environment
deleted file mode 100644
index 58bcd92..0000000
--- a/terraform_data_dirs/csvd/environment
+++ /dev/null
@@ -1 +0,0 @@
-csvd
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/modules/ecr-clone b/terraform_data_dirs/csvd/modules/ecr-clone
deleted file mode 160000
index 8fa1857..0000000
--- a/terraform_data_dirs/csvd/modules/ecr-clone
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 8fa1857eb18dcd1a79243743cbecca95b5b06b68
diff --git a/terraform_data_dirs/csvd/modules/github-runner b/terraform_data_dirs/csvd/modules/github-runner
deleted file mode 160000
index 88edaff..0000000
--- a/terraform_data_dirs/csvd/modules/github-runner
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 88edaff4267e5d8e43f42e22124154613e79477b
diff --git a/terraform_data_dirs/csvd/modules/modules.json b/terraform_data_dirs/csvd/modules/modules.json
deleted file mode 100644
index d52bb4c..0000000
--- a/terraform_data_dirs/csvd/modules/modules.json
+++ /dev/null
@@ -1 +0,0 @@
-{"Modules":[{"Key":"","Source":"","Dir":"."},{"Key":"ecr-clone","Source":"registry.terraform.io/HappyPathway/ecr-clone/aws","Version":"0.0.30","Dir":"/data/terraform/workspaces/arnol377/git/ghe-runner/terraform_data_dirs/csvd/modules/ecr-clone"},{"Key":"github-runner","Source":"registry.terraform.io/HappyPathway/github-runner/ecs","Version":"0.0.92","Dir":"/data/terraform/workspaces/arnol377/git/ghe-runner/terraform_data_dirs/csvd/modules/github-runner"}]}
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64
deleted file mode 120000
index 40fb43e..0000000
--- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64
deleted file mode 120000
index 33544c3..0000000
--- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64
deleted file mode 120000
index a6fbdd6..0000000
--- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64
deleted file mode 120000
index d61a361..0000000
--- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64
deleted file mode 120000
index 095d815..0000000
--- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64
deleted file mode 120000
index fe28aef..0000000
--- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64
deleted file mode 120000
index 75282e6..0000000
--- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64
deleted file mode 120000
index 494ac1e..0000000
--- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64
deleted file mode 120000
index f8eee1f..0000000
--- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/5.45.0/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/5.45.0/linux_amd64
deleted file mode 120000
index 15c0b66..0000000
--- a/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/5.45.0/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/integrations/github/5.45.0/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/6.6.0/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/6.6.0/linux_amd64
deleted file mode 120000
index 26dfde5..0000000
--- a/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/6.6.0/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/integrations/github/6.6.0/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/sct-engineering/modules/github-runner b/terraform_data_dirs/sct-engineering/modules/github-runner
deleted file mode 160000
index 88edaff..0000000
--- a/terraform_data_dirs/sct-engineering/modules/github-runner
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 88edaff4267e5d8e43f42e22124154613e79477b
diff --git a/terraform_data_dirs/sct-engineering/modules/modules.json b/terraform_data_dirs/sct-engineering/modules/modules.json
deleted file mode 100644
index 31f0156..0000000
--- a/terraform_data_dirs/sct-engineering/modules/modules.json
+++ /dev/null
@@ -1 +0,0 @@
-{"Modules":[{"Key":"","Source":"","Dir":"."},{"Key":"github-runner","Source":"registry.terraform.io/HappyPathway/github-runner/ecs","Version":"0.0.92","Dir":"/data/terraform/workspaces/arnol377/git/ghe-runner/terraform_data_dirs/sct-engineering/modules/github-runner"}]}
\ No newline at end of file
diff --git a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64 b/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64
deleted file mode 120000
index 40fb43e..0000000
--- a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64 b/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64
deleted file mode 120000
index 33544c3..0000000
--- a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64 b/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64
deleted file mode 120000
index d61a361..0000000
--- a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/local/2.5.2/linux_amd64 b/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/local/2.5.2/linux_amd64
deleted file mode 120000
index 9e2ab54..0000000
--- a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/local/2.5.2/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/local/2.5.2/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64 b/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64
deleted file mode 120000
index fe28aef..0000000
--- a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64
\ No newline at end of file
diff --git a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64 b/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64
deleted file mode 120000
index 494ac1e..0000000
--- a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64
+++ /dev/null
@@ -1 +0,0 @@
-/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64
\ No newline at end of file
diff --git a/variables.tf b/variables.tf
index 3ac4317..7f2c896 100644
--- a/variables.tf
+++ b/variables.tf
@@ -8,6 +8,35 @@ variable "ecs_cluster_name" {
   }
 }
 
+variable "github_token" {
+  description = <<-EOT
+    GitHub Personal Access Token for authentication.
+    
+    This token is used for:
+    - GitHub provider authentication (the provider reads GITHUB_TOKEN directly, so export that too)
+    - Lambda function to refresh runner registration tokens
+    
+    Required Scopes:
+    - admin:org (for managing runner registration tokens)
+    - repo (for accessing repositories)
+    
+    SECURITY WARNING:
+    - NEVER commit this token to version control
+    - Set this variable via environment: export TF_VAR_github_token="your-token"
+    - The token will be stored in Lambda environment variables
+    
+    Note: This is a simpler but less secure alternative to GitHub App authentication.
+    Consider using GitHub App for production environments.
+  EOT
+  type        = string
+  sensitive   = true
+
+  validation {
+    condition     = length(var.github_token) >= 20 # inclusive bound so the check matches "at least 20" in the message
+    error_message = "GitHub token must be at least 20 characters"
+  }
+}
+
 variable "repo_org" {
   description = "The GitHub organization"
   type        = string
@@ -118,11 +147,14 @@ variable "github_app_id" {
     
     Note: Different organizations may have different GitHub App IDs.
     Set this value in workspace-specific .tfvars files.
+    
+    If not provided, GITHUB_TOKEN environment variable will be used instead.
   EOT
   type        = string
+  default     = null
 
   validation {
-    condition     = can(regex("^[0-9]+$", var.github_app_id))
+    condition     = var.github_app_id == null || can(regex("^[0-9]+$", var.github_app_id))
     error_message = "GitHub App ID must be a numeric string (e.g., '123456')"
   }
 }
@@ -142,11 +174,14 @@ variable "github_app_installation_id" {
     
     Note: This value is organization-specific.
     Set this value in workspace-specific .tfvars files.
+    
+    If not provided, GITHUB_TOKEN environment variable will be used instead.
   EOT
   type        = string
+  default     = null
 
   validation {
-    condition     = can(regex("^[0-9]+$", var.github_app_installation_id))
+    condition     = var.github_app_installation_id == null || can(regex("^[0-9]+$", var.github_app_installation_id))
     error_message = "GitHub App Installation ID must be a numeric string (e.g., '12345678')"
   }
 }
@@ -173,11 +208,25 @@ variable "github_app_pem_file" {
     - Or in .tfvars: github_app_pem_file = "/path/to/private-key.pem"
     
     The PEM file should be accessible from where Terraform runs.
+    
+    If not provided, GITHUB_TOKEN environment variable will be used instead.
   EOT
   type        = string
+  default     = null
 
   validation {
-    condition     = can(regex("\\.pem$", var.github_app_pem_file))
+    condition     = var.github_app_pem_file == null || can(regex("\\.pem$", var.github_app_pem_file))
     error_message = "GitHub App PEM file path must end with .pem"
   }
 }
+
+# Monitoring configuration: recipient of alarm notifications (required, no default)
+variable "alert_email" {
+  description = "Email address to receive CloudWatch alarm notifications for runner and Lambda failures"
+  type        = string
+
+  validation {
+    condition     = can(regex("^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$", var.alert_email)) # shape check only: local@domain.tld with a TLD of 2+ letters
+    error_message = "Must be a valid email address"
+  }
+}