From f3e80d9f4db8aa65334c16da2fc27a869eeba63e Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 16 Jan 2026 13:32:41 -0500 Subject: [PATCH] adding monitoring --- .github/copilot-instructions.md | 131 +++++ .gitignore | 2 + .terraform_commits | 12 + .vscode/settings.json | 11 + DOCUMENTATION_REVIEW_2026-01-15.md | 159 ++++++ MONITORING_DEPLOYMENT_SUMMARY.md | 310 +++++++++++ MONITORING_IMPLEMENTATION_PLAN.md | 363 +++++++++++++ README.md | 31 +- RUNBOOK.md | 500 ++++++++++++++++++ aws_ecs_cluster_capacity_providers.fargate | 42 ++ default.auto.tfvars | 3 + example.auto.tfvars | 90 ---- example.tfvars.template | 111 ++++ lambda/requirements_pat.txt | 4 + lambda/token_refresh_pat.py | 191 +++++++ lambda_token_refresh.tf | 53 +- lambda_token_refresh.tf.tmp | 185 +++++++ monitoring.tf | 227 ++++++++ providers.tf | 18 +- terraform_data_dirs/csvd/environment | 1 - terraform_data_dirs/csvd/modules/ecr-clone | 1 - .../csvd/modules/github-runner | 1 - terraform_data_dirs/csvd/modules/modules.json | 1 - .../hashicorp/aws/5.70.0/linux_amd64 | 1 - .../hashicorp/dns/3.4.2/linux_amd64 | 1 - .../hashicorp/dns/3.4.3/linux_amd64 | 1 - .../hashicorp/github/6.3.1/linux_amd64 | 1 - .../hashicorp/github/6.6.0/linux_amd64 | 1 - .../hashicorp/null/3.2.3/linux_amd64 | 1 - .../hashicorp/null/3.2.4/linux_amd64 | 1 - .../hashicorp/random/3.6.3/linux_amd64 | 1 - .../hashicorp/random/3.7.2/linux_amd64 | 1 - .../integrations/github/5.45.0/linux_amd64 | 1 - .../integrations/github/6.6.0/linux_amd64 | 1 - .../sct-engineering/modules/github-runner | 1 - .../sct-engineering/modules/modules.json | 1 - .../hashicorp/aws/5.70.0/linux_amd64 | 1 - .../hashicorp/dns/3.4.2/linux_amd64 | 1 - .../hashicorp/github/6.3.1/linux_amd64 | 1 - .../hashicorp/local/2.5.2/linux_amd64 | 1 - .../hashicorp/null/3.2.3/linux_amd64 | 1 - .../hashicorp/random/3.6.3/linux_amd64 | 1 - variables.tf | 55 +- 43 files changed, 2375 insertions(+), 146 deletions(-) create mode 100644 .github/copilot-instructions.md create mode 100644 .vscode/settings.json create mode 100644 DOCUMENTATION_REVIEW_2026-01-15.md create mode 100644 MONITORING_DEPLOYMENT_SUMMARY.md create mode 100644 MONITORING_IMPLEMENTATION_PLAN.md create mode 100644 RUNBOOK.md create mode 100644 aws_ecs_cluster_capacity_providers.fargate delete mode 100644 example.auto.tfvars create mode 100644 example.tfvars.template create mode 100644 lambda/requirements_pat.txt create mode 100644 lambda/token_refresh_pat.py create mode 100644 lambda_token_refresh.tf.tmp create mode 100644 monitoring.tf delete mode 100644 terraform_data_dirs/csvd/environment delete mode 160000 terraform_data_dirs/csvd/modules/ecr-clone delete mode 160000 terraform_data_dirs/csvd/modules/github-runner delete mode 100644 terraform_data_dirs/csvd/modules/modules.json delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/5.45.0/linux_amd64 delete mode 120000 terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/6.6.0/linux_amd64 delete mode 160000 terraform_data_dirs/sct-engineering/modules/github-runner delete mode 100644 terraform_data_dirs/sct-engineering/modules/modules.json delete mode 120000 terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64 delete mode 120000 terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64 delete mode 120000 terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64 delete mode 120000 terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/local/2.5.2/linux_amd64 delete mode 120000 terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64 delete mode 120000 terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64 diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000..b176fdf --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,131 @@ +# GitHub Copilot Instructions for ghe-runner Repository + +## General Guidelines + +### Terraform Commands +- **ALWAYS use the `tf` alias instead of `terraform` command** +- The `tf` alias performs important behind-the-scenes operations required for this environment +- Examples: + - ✅ `tf plan` (correct) + - ✅ `tf apply` (correct) + - ❌ `terraform plan` (incorrect) + - ❌ `terraform apply` (incorrect) + +### Terminal Commands +- When running terminal commands, always use the `run_in_terminal` tool +- Set `isBackground=false` for commands that need output +- Set `isBackground=true` for long-running processes (servers, watches) + +### AWS Authentication +- AWS credentials may expire during sessions +- User will refresh credentials manually using `awscreds` command +- Do not attempt to source aws credentials automatically + +### GitHub Authentication +- This project uses **token-only authentication** (GITHUB_TOKEN environment variable) +- GitHub App authentication is optional (variables have default = null) +- Never require GitHub App variables unless explicitly requested + +## Project-Specific Context + +### Infrastructure +- **Region**: us-gov-west-1 (AWS GovCloud) +- **ECS Cluster**: ecs-ghe-runners-us-gov-west-1 +- **GitHub Enterprise**: github.e.it.census.gov +- **Organization**: SCT-Engineering +- **Proxy**: proxy.tco.census.gov:3128 (required for outbound traffic) + +### Critical Understanding: Persistent Runners & Token Lifecycle +⚠️ **IMPORTANT**: Runners are **persistent, long-running containers** (not ephemeral): +- Runners run continuously 24/7, handling multiple jobs over their lifetime +- Registration token is used **only during container startup** (one-time registration) +- Lambda refreshes token every 30 min to ensure valid token for ECS task restarts +- **Deadlock risk**: If all runners die AND token expires, ECS cannot auto-recover + - Running tasks don't need token refresh (already registered) + - Failed tasks being restarted by ECS need valid token from Secrets Manager + - This is why monitoring and quick response are essential + +### File Conventions +- Main configuration: `default.auto.tfvars` +- Example template: `example.tfvars.template` (do NOT rename to `.auto.tfvars`) +- Monitoring: `monitoring.tf` +- Emergency procedures: `RUNBOOK.md` + +### Terraform Modules +- Primary module: `HappyPathway/github-runner/ecs` +- Optional ECR clone: `HappyPathway/ecr-clone/aws` +- Module outputs: Check `outputs.tf` before referencing module attributes + +## Code Editing Guidelines + +### When Making File Changes +1. Always read sufficient context before editing (5+ lines before/after) +2. Use `replace_string_in_file` with exact matches including whitespace +3. Never use placeholder comments like `...existing code...` in edits +4. Verify changes with `tf plan` after modifications + +### When Implementing Features +1. Create a todo list for multi-step work +2. Mark items in-progress before starting +3. Mark items completed immediately after finishing +4. Update the list as new tasks are discovered + +## Monitoring & Alerting + +### Alert Configuration +- Alert email: david.j.arnold.jr@census.gov +- SNS topic: github-runner-critical-alerts +- Critical alarms: runners < 50% capacity, all runners down +- Dashboard: CloudWatch dashboard for visibility + +### Emergency Response +- Refer to `RUNBOOK.md` for incident procedures +- Three critical scenarios documented: + 1. Lambda token refresh failing + 2. Runners at 50% capacity + 3. All runners down (EMERGENCY) + +## Testing & Validation + +### Before Committing +1. Run `tf plan` to validate configuration +2. Check for errors with `get_errors` tool if available +3. Verify outputs are as expected +4. Review changes in context of overall system + +### After Deployment +1. Verify SNS email subscription confirmation +2. Check CloudWatch alarms are configured +3. Test dashboard accessibility +4. Document any lessons learned + +## Common Issues & Solutions + +### "Invalid AWS Region" Error +- Ensure `providers.tf` has `region = "us-gov-west-1"` + +### "Unsupported attribute" on Module Outputs +- Check `outputs.tf` for available module outputs +- Use `var.repo_org` for service name, not `module.github-runner.service_name` + +### Image Pull Failures +- Enable ECR clone: `enable_ecr_clone = true` +- Verify image version exists in source registry + +### Token Expiration Risk +- Monitor Lambda execution via CloudWatch Logs +- Check token age in Secrets Manager +- Manual refresh available via Lambda invoke + +## Resources + +- [Monitoring Plan](./MONITORING_IMPLEMENTATION_PLAN.md) +- [Emergency Runbook](./RUNBOOK.md) +- [GitHub App Setup](./GITHUB_APP_SETUP.md) +- [AWS Permissions](./AWS_PERMISSIONS.md) +- [Security Review](./SECURITY_REVIEW.md) + +--- + +**Last Updated**: January 15, 2026 +**Maintainer**: CSVD Team diff --git a/.gitignore b/.gitignore index 2516318..b28d3f7 100644 --- a/.gitignore +++ b/.gitignore @@ -54,3 +54,5 @@ aws-image-pipeline/aws-image-pipeline automation-repos/automation-repos ghe-runners/ghe-runners docker-image-pipeline/docker-image-pipeline + +terraform_data_dirs \ No newline at end of file diff --git a/.terraform_commits b/.terraform_commits index a239210..7f3ad12 100644 --- a/.terraform_commits +++ b/.terraform_commits @@ -88,5 +88,17 @@ "commit_message": "Add GitHub Actions Runner Setup Guide to README.md", "author": "Your Name", "timestamp": "2025-10-31T13:13:21.490997" + }, + { + "commit_hash": "fa186792281de61333a09ed8477d865d96cb3ae8", + "commit_message": "feat(lambda): Implement GitHub Actions runner token refresh Lambda function\n\n- Added `token_refresh.py` to handle the token refresh logic.\n- Integrated AWS Secrets Manager for storing the GitHub registration token.\n- Utilized GitHub App authentication for secure API access.\n- Scheduled Lambda function to run every 30 minutes using CloudWatch Events.\n- Created necessary IAM roles and policies for Lambda execution.\n\nchore(lambda): Add requirements for token refresh Lambda\n\n- Added `requirements.txt` with dependencies: PyJWT and cryptography.\n\nfeat(terraform): Configure Lambda function and CloudWatch Events\n\n- Created Terraform configuration for the Lambda function and its dependencies.\n- Set up CloudWatch Event Rule to trigger the Lambda function every 30 minutes.\n- Configured IAM roles and policies for Lambda execution and Secrets Manager access.\n\ndocs(scripts): Add monitoring tools for GitHub Runner ECS services\n\n- Created monitoring scripts to track ECS service health and CloudWatch logs.\n- Added README with usage instructions and troubleshooting tips.\n- Implemented a continuous monitoring script using rich for better output formatting.\n\nchore(scripts): Add requirements for monitoring scripts\n\n- Added `requirements.txt` for monitoring scripts with dependencies: boto3, botocore, and rich.\n\nfix(scripts): Update monitoring script to use Terraform outputs\n\n- Modified `monitor_runners.py` to fetch necessary configuration from Terraform outputs.\n- Improved error handling and logging for better visibility.\n\nfeat(varfiles): Add configuration files for Terraform modules\n\n- Created JSON and TFVars files for managing Terraform workspace and GitHub organization settings.", + "author": "Your Name", + "timestamp": "2026-01-12T14:58:24.831561" + }, + { + "commit_hash": "fa186792281de61333a09ed8477d865d96cb3ae8", + "commit_message": "feat(lambda): Implement GitHub Actions runner token refresh Lambda function\n\n- Added `token_refresh.py` to handle the token refresh logic.\n- Integrated AWS Secrets Manager for storing the GitHub registration token.\n- Utilized GitHub App authentication for secure API access.\n- Scheduled Lambda function to run every 30 minutes using CloudWatch Events.\n- Created necessary IAM roles and policies for Lambda execution.\n\nchore(lambda): Add requirements for token refresh Lambda\n\n- Added `requirements.txt` with dependencies: PyJWT and cryptography.\n\nfeat(terraform): Configure Lambda function and CloudWatch Events\n\n- Created Terraform configuration for the Lambda function and its dependencies.\n- Set up CloudWatch Event Rule to trigger the Lambda function every 30 minutes.\n- Configured IAM roles and policies for Lambda execution and Secrets Manager access.\n\ndocs(scripts): Add monitoring tools for GitHub Runner ECS services\n\n- Created monitoring scripts to track ECS service health and CloudWatch logs.\n- Added README with usage instructions and troubleshooting tips.\n- Implemented a continuous monitoring script using rich for better output formatting.\n\nchore(scripts): Add requirements for monitoring scripts\n\n- Added `requirements.txt` for monitoring scripts with dependencies: boto3, botocore, and rich.\n\nfix(scripts): Update monitoring script to use Terraform outputs\n\n- Modified `monitor_runners.py` to fetch necessary configuration from Terraform outputs.\n- Improved error handling and logging for better visibility.\n\nfeat(varfiles): Add configuration files for Terraform modules\n\n- Created JSON and TFVars files for managing Terraform workspace and GitHub organization settings.", + "author": "Your Name", + "timestamp": "2026-01-15T17:53:12.576503" } ] \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..0aeacfb --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,11 @@ +{ + "github.copilot.chat.mcpServers": { + "terraform": { + "command": "/home/a/arnol377/.local/bin/terraform-mcp-server", + "args": [], + "env": { + "TF_WORKSPACE_DIR": "/home/a/arnol377/git/ghe-runner" + } + } + } +} diff --git a/DOCUMENTATION_REVIEW_2026-01-15.md b/DOCUMENTATION_REVIEW_2026-01-15.md new file mode 100644 index 0000000..f65415d --- /dev/null +++ b/DOCUMENTATION_REVIEW_2026-01-15.md @@ -0,0 +1,159 @@ +# Documentation Review - January 15, 2026 + +## Summary of Updates + +Updated documentation to accurately reflect the **persistent, long-running runner architecture** rather than describing them as ephemeral/dynamic containers. + +## Key Architectural Clarifications + +### Runner Model +- ✅ **CORRECT**: Runners are persistent, long-running ECS Fargate containers +- ✅ **CORRECT**: Runners stay active 24/7, polling GitHub for jobs +- ✅ **CORRECT**: Same runner handles multiple workflow jobs over its lifetime +- ✅ **CORRECT**: Runners only restart on: crash, manual stop, service deployment +- ❌ **INCORRECT** (Previous): Runners spin up dynamically per job + +### Token Lifecycle Understanding +- ✅ **CORRECT**: Registration token is ONLY used during container startup +- ✅ **CORRECT**: Running runners don't need token refresh (already registered) +- ✅ **CORRECT**: Lambda token refresh is insurance for ECS task restarts +- ✅ **CORRECT**: Deadlock occurs when: all runners down + token expired +- ❌ **INCORRECT** (Previous): Implied tokens are needed continuously + +### ECS Auto-Recovery Behavior +- ✅ **CORRECT**: If a task dies, ECS automatically starts a replacement +- ✅ **CORRECT**: Replacement task needs valid token from Secrets Manager +- ✅ **CORRECT**: Lambda ensures fresh token available for automatic recovery +- ✅ **CORRECT**: Without valid token, ECS enters crash loop + +## Files Updated + +### 1. README.md +**Changes:** +- Added "Runner Model" note emphasizing persistent containers +- Updated "Key Features" to include "Persistent Runners" and "Automated Token Refresh" +- Rewrote "Architecture" section with "Runner Lifecycle Model" +- Added detailed explanation of startup → active → job execution → restart cycle +- Updated architecture diagram to show "Persistent Runner" and lifecycle states +- Added Lambda Token Refresh component to diagram + +**Key Additions:** +``` +Runner Lifecycle Model: +1. Startup: Reads token, registers with GitHub +2. Active State: Stays online, polls for jobs +3. Job Execution: Executes jobs, returns to polling +4. Restart: Only on failure, manual stop, or update +5. Auto-Recovery: ECS restarts tasks (requires valid token) +``` + +### 2. RUNBOOK.md +**Changes:** +- Renamed section from "Token Lifecycle Dependency" to "Persistent Runners & Token Lifecycle" +- Added "Runner Architecture" subsection explaining 24/7 operation +- Clarified "Token Lifecycle & Deadlock Risk" with focus on startup-only token use +- Added "Why Lambda Token Refresh Matters" section +- Updated Scenario 1 impact assessment to clarify running vs. new runners +- Updated Scenario 2 impact assessment to explain reduced capacity implications +- Expanded Scenario 3 deadlock warning with detailed explanation +- Added "Task Crash Loop" to common root causes table + +**Key Additions:** +``` +Running runners: Already registered, don't need token refresh +Token refresh purpose: Ensures valid token for ECS task restarts +Deadlock scenario: ECS tries to restart → token expired → tasks fail → retry loop +``` + +### 3. .github/copilot-instructions.md +**Changes:** +- Updated "Critical Understanding" section title and content +- Clarified that runners are "persistent, long-running containers (not ephemeral)" +- Explained registration token is "only during container startup (one-time registration)" +- Specified Lambda refreshes for "ECS task restarts" not continuous runner operation +- Detailed deadlock risk with distinction between running vs. restarting tasks + +**Key Additions:** +``` +Runners run continuously 24/7, handling multiple jobs +Registration token used only during container startup +Running tasks don't need token refresh (already registered) +Failed tasks being restarted by ECS need valid token +``` + +### 4. lambda_token_refresh.tf +**Changes:** +- Expanded header comment from 2 lines to 12 lines +- Added "IMPORTANT" note about persistent runners +- Explained token lifecycle in detail +- Clarified purpose as "insurance for ECS automatic task recovery" +- Added critical scenario explanation + +**Key Additions:** +``` +IMPORTANT: Runners are persistent, long-running containers +Registration token ONLY needed during container startup +Token refresh purpose: Insurance for ECS automatic task recovery +Critical for: Preventing deadlock when all runners down + token expires +``` + +### 5. lambda/token_refresh_pat.py +**Changes:** +- Expanded docstring from 7 lines to 17 lines +- Added "CRITICAL CONTEXT" section +- Detailed persistent runner architecture +- Explained deadlock scenario step-by-step +- Added schedule and authentication details + +**Key Additions:** +``` +CRITICAL CONTEXT: +- Runners are persistent, long-running ECS containers (not ephemeral) +- Registration tokens ONLY used during container startup +- Running runners don't need token refresh +- Purpose: Prevent deadlock scenario [detailed explanation] +``` + +### 6. monitoring.tf +**Changes:** +- Expanded header comment from 7 lines to 13 lines +- Added "RUNNER MODEL" section +- Clarified monitoring tracks container health, not job execution +- Updated monitoring area descriptions + +**Key Additions:** +``` +RUNNER MODEL: Persistent, long-running containers (not ephemeral) +- Runners stay online 24/7, handling multiple jobs +- Only restart on: task failure, manual stop, service deployment +- Monitoring tracks runner CONTAINER health, not individual job execution +``` + +## Documentation Consistency + +All documentation now consistently reflects: + +1. **Runner Persistence**: Emphasized that runners are NOT ephemeral +2. **Token Usage**: Clear that tokens are only for startup, not continuous operation +3. **Lambda Purpose**: Reframed as "insurance" for ECS auto-recovery +4. **Deadlock Risk**: Detailed explanation with precise conditions +5. **ECS Behavior**: Clarified automatic task replacement mechanism +6. **Monitoring Context**: Metrics track container health, not job execution + +## Benefits of These Updates + +1. **Operational Understanding**: Clearer picture of how the system actually works +2. **Troubleshooting**: Better context for investigating runner issues +3. **Cost Implications**: Understanding that runners run 24/7 (not per-job) +4. **Monitoring Interpretation**: Metrics represent container state, not workflow state +5. **Emergency Response**: More accurate mental model for incident response + +## No Configuration Changes + +These updates are **documentation-only**. No infrastructure, code logic, or configuration was modified. The system operates exactly as before - we've simply corrected the documentation to match reality. + +--- + +**Review Date**: January 15, 2026 +**Reviewer**: GitHub Copilot (with user guidance) +**Status**: Complete ✅ diff --git a/MONITORING_DEPLOYMENT_SUMMARY.md b/MONITORING_DEPLOYMENT_SUMMARY.md new file mode 100644 index 0000000..a78b599 --- /dev/null +++ b/MONITORING_DEPLOYMENT_SUMMARY.md @@ -0,0 +1,310 @@ +# Monitoring Implementation - Deployment Summary + +**Date**: January 15, 2026 +**Status**: Ready for Deployment +**Estimated Implementation Time**: 6-8 hours (actual: ~2 hours for code) + +## Overview + +All monitoring infrastructure has been implemented and validated via `terraform plan`. The implementation includes critical alerting for GitHub Actions runners on ECS Fargate with a focus on preventing the token lifecycle deadlock. + +## Implemented Resources + +### 1. SNS Topic and Email Subscription +- **Resource**: `aws_sns_topic.github_runner_critical_alerts` +- **Name**: `github-runner-critical-alerts-229685449397-us-gov-west-1` +- **Email**: david.j.arnold.jr@census.gov +- **Purpose**: Central notification channel for all critical alerts + +### 2. Runner Availability Alarms + +#### Critical Capacity Alarm +- **Resource**: `aws_cloudwatch_metric_alarm.runners_critical` +- **Name**: `github-runners-critical-capacity-cvsd-dev-ew` +- **Threshold**: 1 runner (50% of desired count of 1) +- **Evaluation**: 2 periods of 5 minutes (10 minutes total) +- **Trigger**: When running tasks < 50% capacity +- **Actions**: Send notification to SNS topic + +#### Emergency All-Down Alarm +- **Resource**: `aws_cloudwatch_metric_alarm.runners_emergency` +- **Name**: `github-runners-emergency-all-down-cvsd-dev-ew` +- **Threshold**: 0 runners +- **Evaluation**: 1 period of 1 minute (immediate) +- **Trigger**: When all runners are down +- **Actions**: Send notification to SNS topic +- **Critical**: This is the deadlock scenario - immediate attention required + +### 3. CloudWatch Dashboard +- **Resource**: `aws_cloudwatch_dashboard.github_runners` +- **Name**: `github-runners-cvsd-dev-ew` +- **URL**: https://console.aws.amazon.com/cloudwatch/home?region=us-gov-west-1#dashboards:name=github-runners-cvsd-dev-ew + +#### Dashboard Widgets: +1. **Runner Count Trend** (12x6) + - Shows average, maximum, and minimum running task count + - Includes annotations for desired count and critical threshold + - 5-minute periods + +2. **Resource Utilization** (12x6) + - CPU utilization (vCPU) on left axis + - Memory utilization (MB) on right axis + - Helps identify resource constraints + +3. **Alarm Status** (8x4) + - Visual status of both critical alarms + - Quick at-a-glance health check + +4. **Recent Error Events** (16x4) + - Log query showing recent errors/failures + - Last 20 events sorted by timestamp + +### 4. Emergency Runbook +- **File**: `RUNBOOK.md` +- **Scenarios Documented**: + 1. Lambda Token Refresh Failing + 2. Runners at 50% Capacity + 3. All Runners Down (Emergency) + +Each scenario includes: +- Detection methods +- Impact assessment +- Investigation steps with AWS CLI commands +- Common root causes and resolutions +- Post-resolution checklist + +## Deployment Steps + +### Prerequisites +✅ AWS credentials configured and valid +✅ GITHUB_TOKEN environment variable set +✅ Terraform initialized (`tf init`) +✅ Configuration validated (`tf plan` successful) + +### Deploy Monitoring + +```bash +# 1. Review the plan one more time +cd /home/a/arnol377/git/ghe-runner +tf plan + +# 2. Apply the changes +tf apply + +# 3. Confirm the resources to be created (6 resources) +# - aws_sns_topic.github_runner_critical_alerts +# - aws_sns_topic_subscription.alert_email +# - aws_cloudwatch_metric_alarm.runners_critical +# - aws_cloudwatch_metric_alarm.runners_emergency +# - aws_cloudwatch_dashboard.github_runners +# - (1 secret version replacement) + +# Type 'yes' when prompted +``` + +### Post-Deployment Verification + +1. **Confirm Email Subscription** + ```bash + # You will receive an email from AWS Notifications + # Subject: "AWS Notification - Subscription Confirmation" + # Click the "Confirm subscription" link in the email + ``` + +2. **Verify SNS Topic** + ```bash + aws sns list-subscriptions-by-topic \ + --topic-arn $(terraform output -raw sns_topic_arn) \ + --region us-gov-west-1 + + # Should show Status: "Confirmed" after you click the email link + ``` + +3. **Check Alarms Status** + ```bash + aws cloudwatch describe-alarms \ + --alarm-names \ + "github-runners-critical-capacity-cvsd-dev-ew" \ + "github-runners-emergency-all-down-cvsd-dev-ew" \ + --region us-gov-west-1 + ``` + +4. **View Dashboard** + ```bash + # Get the dashboard URL + terraform output dashboard_url + + # Open in browser (you may need to be on VPN/internal network) + ``` + +5. **Test Alarm (Optional)** + ```bash + # Manually trigger the emergency alarm by setting state + aws cloudwatch set-alarm-state \ + --alarm-name "github-runners-emergency-all-down-cvsd-dev-ew" \ + --state-value ALARM \ + --state-reason "Testing alert notification" \ + --region us-gov-west-1 + + # You should receive an email within 1-2 minutes + # Reset to OK after testing + aws cloudwatch set-alarm-state \ + --alarm-name "github-runners-emergency-all-down-cvsd-dev-ew" \ + --state-value OK \ + --state-reason "Test complete" \ + --region us-gov-west-1 + ``` + +## Files Modified/Created + +### New Files +- ✅ `monitoring.tf` - All monitoring infrastructure +- ✅ `RUNBOOK.md` - Emergency response procedures +- ✅ `MONITORING_DEPLOYMENT_SUMMARY.md` - This file + +### Modified Files +- ✅ `providers.tf` - Added region to AWS provider +- ✅ `variables.tf` - Made GitHub App variables optional (token-only auth) +- ✅ `default.auto.tfvars` - Contains alert_email configuration +- ✅ `example.tfvars.template` - Renamed from example.auto.tfvars + +### Existing Files (Not Modified) +- `lambda_token_refresh.tf.tmp` - Lambda token refresh (currently disabled) +- `main.tf` - Main module configuration +- `ecs_cluster.tf` - ECS cluster setup + +## Monitoring Coverage + +### What's Monitored ✅ +- Runner availability (count of running tasks) +- Critical capacity threshold (50%) +- Emergency all-down scenario (0 runners) +- Resource utilization (CPU and memory) +- Recent error events in logs + +### What's NOT Monitored (Future Enhancements) +- ❌ Lambda token refresh failures (Lambda not currently deployed) +- ❌ Task failure rate over time +- ❌ Workflow queue depth +- ❌ Average task startup time +- ❌ Network connectivity issues +- ❌ ECR image pull latency + +## Cost Estimate + +**Monthly Costs** (approximate, us-gov-west-1 pricing): +- SNS topic: $0.00 (under free tier for email) +- CloudWatch Alarms (2): $0.20 per alarm = $0.40/month +- CloudWatch Dashboard (1): $3.00/month +- Log Insights queries: ~$0.50/month (occasional use) + +**Total Estimated Cost**: ~$4/month + +## Known Limitations + +1. **ECS Container Insights Required** + - The alarms rely on `ECS/ContainerInsights` metrics + - Verify Container Insights is enabled for the cluster: + ```bash + aws ecs describe-clusters \ + --clusters ecs-ghe-runners-us-gov-west-1 \ + --region us-gov-west-1 \ + --include SETTINGS + ``` + - If not enabled, alarms may not have data + +2. **Lambda Token Refresh** + - Lambda function exists in `lambda_token_refresh.tf.tmp` but is not deployed + - To enable Lambda monitoring: + - Rename `lambda_token_refresh.tf.tmp` to `lambda_token_refresh.tf` + - Update Lambda alarm to include SNS actions + - Deploy with `terraform apply` + +3. **Alarm Sensitivity** + - With desired_count=1, critical alarm triggers at <1 runner (i.e., 0 runners) + - This means critical and emergency alarms will trigger simultaneously + - Consider increasing desired_count to 2+ for better gradation + +4. **Email Delays** + - SNS email notifications can take 1-5 minutes to arrive + - For faster alerting, consider integrating with PagerDuty or Slack + +## Next Steps + +### Immediate (Deploy Today) +1. ✅ Review this deployment summary +2. ⏳ Run `terraform apply` to deploy monitoring +3. ⏳ Confirm SNS email subscription +4. ⏳ Verify alarms are created and healthy +5. ⏳ Bookmark CloudWatch dashboard URL + +### Short Term (This Week) +1. Test alarm notifications (manual trigger) +2. Review runbook with team +3. Add runbook to on-call documentation +4. Consider enabling Lambda token refresh monitoring + +### Medium Term (Next Sprint) +1. Increase desired_count to 2+ for better alarm gradation +2. Enable ECS Container Insights if not already enabled +3. Create CloudWatch Logs metric filters for specific error patterns +4. Integrate with PagerDuty or other alerting platform + +### Long Term (Next Quarter) +1. Implement automated remediation for common issues +2. Add workflow queue depth monitoring (requires GitHub API integration) +3. Create custom CloudWatch metrics for workflow execution times +4. Develop capacity planning dashboard based on historical data + +## Rollback Plan + +If issues arise after deployment: + +```bash +# Remove all monitoring resources +terraform destroy \ + -target=aws_sns_topic.github_runner_critical_alerts \ + -target=aws_sns_topic_subscription.alert_email \ + -target=aws_cloudwatch_metric_alarm.runners_critical \ + -target=aws_cloudwatch_metric_alarm.runners_emergency \ + -target=aws_cloudwatch_dashboard.github_runners + +# Or remove just the alarms if they're causing issues +terraform destroy \ + -target=aws_cloudwatch_metric_alarm.runners_critical \ + -target=aws_cloudwatch_metric_alarm.runners_emergency +``` + +## Support and Questions + +- **Technical Questions**: Refer to `RUNBOOK.md` for operational procedures +- **Terraform Issues**: Check `terraform plan` output, review AWS provider version +- **AWS CloudWatch Help**: https://docs.aws.amazon.com/cloudwatch/ +- **ECS Monitoring**: https://docs.aws.amazon.com/AmazonECS/latest/developerguide/cloudwatch-metrics.html + +## Success Criteria + +The monitoring implementation is successful when: +- ✅ All 6 resources created successfully +- ✅ SNS email subscription confirmed +- ✅ Both alarms are in "OK" state (runners healthy) +- ✅ CloudWatch dashboard is accessible and showing data +- ✅ Team members can access and understand the runbook +- ✅ Test alarm notification received successfully + +--- + +**Implementation Status**: ✅ READY FOR DEPLOYMENT + +**Terraform Plan**: ✅ VALIDATED (6 resources to add) + +**Documentation**: ✅ COMPLETE +- Monitoring infrastructure code +- Emergency runbook +- Deployment summary + +**Next Action**: Run `terraform apply` to deploy monitoring infrastructure + +--- + +*For questions or issues during deployment, refer to RUNBOOK.md or contact the CSVD team lead.* diff --git a/MONITORING_IMPLEMENTATION_PLAN.md b/MONITORING_IMPLEMENTATION_PLAN.md new file mode 100644 index 0000000..b872655 --- /dev/null +++ b/MONITORING_IMPLEMENTATION_PLAN.md @@ -0,0 +1,363 @@ +# GitHub Runner Monitoring Implementation Plan (ESSENTIAL ONLY) + +## Epic: Critical Monitoring for ECS GitHub Runners + +**Epic Summary**: Implement essential monitoring for GitHub Actions ECS Fargate runners focused on preventing token deadlock scenarios and providing actionable alerts. + +**Business Value**: Prevent critical outages caused by token expiration when all runners are down. Get notified before problems become incidents. + +**Priority**: Critical +**Target Release**: Q1 2026 (Week 1) +**Estimated Effort**: 6-8 hours (Single week implementation) + +--- + +## Critical Stories (MUST HAVE) + +### Story 1: Lambda Token Refresh Monitoring (CRITICAL - 2 hours) + +**Story**: As a DevOps engineer, I need to know immediately when the Lambda token refresh fails so I can intervene before token expires and causes deadlock. + +**Acceptance Criteria**: +- [ ] SNS topic created for critical alerts +- [ ] Email notifications configured +- [ ] Existing Lambda error alarm connected to SNS +- [ ] Alarm fires on Lambda failures (2+ consecutive errors) +- [ ] Test email received and verified + +**Technical Tasks**: +- Create SNS topic: `github_runner_critical_alerts` +- Add email subscription to SNS topic +- Update existing `aws_cloudwatch_metric_alarm.lambda_errors` with SNS action +- Test alarm by simulating Lambda failure + +**Estimated Effort**: 2 hours + +**Definition of Done**: +- SNS topic created and email verified +- Alarm fires and email received during test +- Simple documentation added to README + +--- + +### Story 2: Runner Availability Alerts (CRITICAL - 2 hours) + +**Story**: As a DevOps engineer, I need to be alerted when runner count drops so I can investigate before all runners die and token refresh stops working. + +**Acceptance Criteria**: +- [ ] CRITICAL alarm fires when running count < 50% of desired +- [ ] EMERGENCY alarm fires when running count = 0 +- [ ] Both alarms send to SNS topic +- [ ] Alarms tested with threshold simulation + +**Technical Tasks**: +- Create `monitoring_alerts.tf` file +- Create alarm: `runners_critical` (threshold: `var.desired_count * 0.5`) +- Create alarm: `runners_emergency` (threshold: 0) +- Connect both alarms to existing SNS topic from Story 1 +- Reference existing `var.desired_count` variable (no new variables needed!) + +**Estimated Effort**: 2 hours + +**Definition of Done**: +- Both alarms created +- Alarms tested (manually adjust desired count to trigger) +- Email notifications received + +--- + +### Story 3: Basic Monitoring Dashboard (NICE TO HAVE - 2-3 hours) + +**Story**: As a DevOps engineer, I want a simple dashboard showing runner health and Lambda status so I can quickly check system state. + +**Acceptance Criteria**: +- [ ] Dashboard shows runner count over last 24 hours +- [ ] Dashboard shows Lambda invocations and errors +- [ ] Dashboard shows alarm states +- [ ] Dashboard URL added to outputs + +**Technical Tasks**: +- Create `monitoring_dashboard.tf` file +- Create CloudWatch Dashboard with 4 widgets: + 1. ECS RunningTaskCount (line graph) + 2. Lambda Invocations + Errors (line graph) + 3. Alarm status (number widget) + 4. Recent ECS events (log widget) +- Add dashboard URL to outputs.tf + +**Estimated Effort**: 2-3 hours + +**Definition of Done**: +- Dashboard accessible and displays data +- Dashboard URL documented + +--- + +### Story 4: Emergency Runbook (CRITICAL - 1-2 hours) + +**Story**: As a DevOps engineer, I need clear instructions for what to do when I get an alert so I can resolve issues quickly. + +**Acceptance Criteria**: +- [ ] Runbook created with 3 key scenarios +- [ ] Each scenario has step-by-step instructions +- [ ] Contact escalation documented +- [ ] Runbook linked in alarm descriptions + +**Technical Tasks**: +- Create `MONITORING_RUNBOOK.md` with: + - **Scenario 1**: "Lambda failing - runners still up" (fix Lambda) + - **Scenario 2**: "Runners at 50% or below" (investigate task failures) + - **Scenario 3**: "All runners down" (emergency token regeneration steps) +- Add diagnostic commands for each scenario +- Add escalation contact info +- Update alarm descriptions to reference runbook sections + +**Estimated Effort**: 1-2 hours + +**Definition of Done**: +- Runbook completed and reviewed +- Alarm descriptions updated with runbook links + +--- + +## Technical Architecture (Simplified) + +### Infrastructure Components + +``` +New Terraform Files (ESSENTIAL ONLY): +├── monitoring_alerts.tf (Stories 1-2: ~100 lines) +│ ├── aws_sns_topic.critical_alerts +│ ├── aws_sns_topic_subscription.email +│ ├── aws_cloudwatch_metric_alarm.runners_critical +│ └── aws_cloudwatch_metric_alarm.runners_emergency +│ +└── monitoring_dashboard.tf (Story 3: ~80 lines) + └── aws_cloudwatch_dashboard.runner_health + +Updated Terraform Files: +├── lambda_token_refresh.tf (Story 1: 2 line change) +│ └── Add alarm_actions to existing lambda_errors alarm +│ +├── outputs.tf (Story 3: ~10 lines) +│ ├── output.monitoring_dashboard_url +│ └── output.sns_topic_arn +│ +└── variables.tf (Story 1: ~5 lines) + └── variable.alert_email (only new variable needed) + + Note: Uses existing var.desired_count for alarm thresholds! + +New Documentation: +└── MONITORING_RUNBOOK.md (Story 4: Simple procedures) +``` + +### CloudWatch Alarms (3 Total - ESSENTIAL) + +**CRITICAL Alarms**: +1. `lambda_errors` - Lambda token refresh failures (ALREADY EXISTS - just add SNS) +2. `runners_critical` - <50% runners available (NEW) +3. `runners_emergency` - Zero runners (NEW) + +--- + +## Implementation Sequence (STREAMLINED) + +### Day 1 (3-4 hours) +- **Morning**: Story 1 - Lambda alert SNS setup (2 hours) + - Create SNS topic + - Add email subscription + - Update Lambda alarm + - Test notification +- **Afternoon**: Story 2 - Runner availability alarms (2 hours) + - Create runner count alarms + - Test alarms + - Verify emails + +### Day 2 (2-3 hours) +- **Morning**: Story 3 - Basic dashboard (2-3 hours) + - Create simple 4-widget dashboard + - Test and validate + - Document URL + +### Day 3 (1-2 hours) +- **Morning**: Story 4 - Runbook (1-2 hours) + - Write emergency procedures + - Document 3 scenarios + - Add to repository + +--- + +## Risk Assessment (Simplified) + +| Risk | Probability | Impact | Mitigation | +|------|------------|--------|------------| +| Token expires before alert received | Medium | Critical | Lambda alarm fires immediately on failure | +| False alarm on runner count | Low | Low | 50% threshold gives buffer; test thresholds | +| Email delivery delay | Low | Medium | Use confirmed email; consider SMS backup later | + +--- + +## Success Metrics (Essential) + +**Key Goals**: +- ✅ Get notified within 5 minutes when Lambda fails +- ✅ Get notified when runners drop to critical levels +- ✅ Have clear procedures to follow when alerts fire +- ✅ Zero token deadlock incidents + +**Cost Target**: <$2/month (3 alarms × $0.10 = $0.30 + SNS negligible) + +--- + +## Configuration + +1. **Alert Email**: ✅ **CONFIRMED** + - **Email**: david.j.arnold.jr@census.gov + +2. **Implementation Approach**: + - [ ] Single PR with all changes, OR + - [ ] Multiple PRs (one per story for easier review) + +**Note**: Runner thresholds will automatically use your existing `var.desired_count` variable! +- Critical alarm = `var.desired_count * 0.5` (50% threshold) +- Emergency alarm = `0` (no runners) + +--- + +## Acceptance Criteria for Epic Completion + +### MUST HAVE (Week 1) +- [ ] SNS topic created and email verified +- [ ] Lambda alarm sends notifications +- [ ] Runner count alarms created and tested +- [ ] Basic dashboard accessible +- [ ] Runbook document created + +### VALIDATION +- [ ] Simulate Lambda failure → receive email +- [ ] Manually scale runners down → receive email at thresholds +- [ ] Dashboard displays current state +- [ ] Team knows where to find runbook + +--- + +## What We're NOT Doing (Defer to Later) + +❌ **Token age monitoring** (complex, Lambda alarm is sufficient) +❌ **Log metric filters** (nice to have, not critical) +❌ **CloudTrail integration** (adds complexity) +❌ **Enhanced monitoring script** (existing script works) +❌ **Emergency automation scripts** (manual procedures in runbook are sufficient) +❌ **Predictive/anomaly detection** (future enhancement) +❌ **Multiple SNS topics per severity** (single topic is fine) +❌ **Saved Log Insights queries** (can add later) +❌ **CPU/Memory alarms** (not critical for token deadlock prevention) + +--- + +## File Changes Summary + +**New Files (2)**: +``` +monitoring_alerts.tf (~100 lines - 3 alarms + SNS) +monitoring_dashboard.tf (~80 lines - simple dashboard) +MONITORING_RUNBOOK.md (~50 lines - procedures) +``` + +**Modified Files (2)**: +``` +lambda_token_refresh.tf (Add 1 line: alarm_actions) +outputs.tf (Add 2 outputs: dashboard URL, SNS ARN) +variables.tf (Add 1 variable: alert_email only!) +``` + +**Note**: Reuses existing `var.desired_count` for alarm thresholds - no new runner variables needed! + +**Total New Code**: ~230 lines of Terraform + 1 runbook doc + +--- + +## Implementation Checklist + +### Pre-Implementation +- [x] Confirm alert email address: **david.j.arnold.jr@census.gov** +- [ ] Review plan with team +- [ ] Choose implementation approach (single PR vs multiple PRs) + +**Note**: No need to configure runner count thresholds - will use existing `var.desired_count` automatically! + +### Story 1: Lambda Alerts (2 hours) +- [ ] Create `monitoring_alerts.tf` +- [ ] Add SNS topic resource +- [ ] Add SNS email subscription +- [ ] Update `lambda_token_refresh.tf` with alarm action +- [ ] Add variables to `variables.tf` +- [ ] `terraform plan` and review changes +- [ ] `terraform apply` +- [ ] Confirm email subscription in inbox +- [ ] Test: Trigger alarm manually (set threshold to 0) +- [ ] Verify email received + +### Story 2: Runner Alerts (2 hours) +- [ ] Add runner alarms to `monitoring_alerts.tf` +- [ ] Use `var.desired_count * 0.5` for critical threshold +- [ ] Use `0` for emergency threshold +- [ ] `terraform plan` and review +- [ ] `terraform apply` +- [ ] Test: Scale runners down to trigger critical alarm +- [ ] Verify email received +- [ ] Test: Scale to 0 to trigger emergency alarm +- [ ] Verify email received +- [ ] Scale back to normal + +### Story 3: Dashboard (2-3 hours) +- [ ] Create `monitoring_dashboard.tf` +- [ ] Add 4 basic widgets (runner count, Lambda, alarms, logs) +- [ ] Add outputs to `outputs.tf` +- [ ] `terraform plan` and review +- [ ] `terraform apply` +- [ ] Access dashboard URL from outputs +- [ ] Verify all widgets display data +- [ ] Bookmark dashboard URL + +### Story 4: Runbook (1-2 hours) +- [ ] Create `MONITORING_RUNBOOK.md` +- [ ] Document Scenario 1: Lambda failing +- [ ] Document Scenario 2: Runners at 50% +- [ ] Document Scenario 3: All runners down +- [ ] Add diagnostic commands for each scenario +- [ ] Add escalation contacts +- [ ] Update alarm descriptions with runbook reference +- [ ] Commit and push documentation + +### Final Validation +- [ ] All alarms in OK state +- [ ] Dashboard accessible +- [ ] Email alerts working +- [ ] Runbook reviewed by team +- [ ] Update main README.md with monitoring section + +--- + +## Notes + +- **Focus**: Prevent token deadlock via early Lambda failure detection +- **Simplicity**: Minimal alarms, single email destination +- **Quick Win**: Can be done in 1 week by 1 person +- **Extensible**: Easy to add more sophisticated monitoring later + +--- + +## Related Documentation + +- [lambda_token_refresh.tf](./lambda_token_refresh.tf) - Existing Lambda alarm to be enhanced +- [README.md](./README.md) - Will add monitoring section after implementation +- [MONITORING_RUNBOOK.md](./MONITORING_RUNBOOK.md) - To be created in Story 4 + +--- + +**Document Version**: 2.0 (Streamlined) +**Last Updated**: January 9, 2026 +**Implementation Timeline**: 3 days (6-8 hours total) +**Status**: Ready for Implementation diff --git a/README.md b/README.md index 9dd912a..05d96ad 100644 --- a/README.md +++ b/README.md @@ -6,34 +6,55 @@ Infrastructure as Code (Terraform) for deploying self-hosted GitHub Actions runn This repository manages the deployment of **organization-level self-hosted GitHub Actions runners** using AWS ECS Fargate. Runners are deployed per AWS account and automatically register with your GitHub Enterprise organization, providing secure, scalable, and cost-effective CI/CD execution environments. +**Runner Model**: Runners are **persistent, long-running containers** that stay active 24/7, continuously polling GitHub for jobs. They are not ephemeral - the same runner handles multiple workflow jobs over its lifetime. + ### Key Features -- **Serverless Architecture**: ECS Fargate eliminates server management overhead +- **Persistent Runners**: Long-running containers that stay online and handle multiple jobs +- **Serverless Infrastructure**: ECS Fargate eliminates server management overhead - **Account-Based Isolation**: Each AWS account has its own dedicated runners - **Automatic IAM Authentication**: ECS Task Roles provide seamless AWS access - **Multi-Label Support**: Runners tagged with account ID, name, region, and more - **Proxy-Enabled**: Pre-configured for enterprise proxy environments - **CloudWatch Integration**: Centralized logging and monitoring +- **Automated Token Refresh**: Lambda keeps registration tokens fresh for task restarts - **Scalable**: Adjust runner count based on workload demands ## Architecture +### Runner Lifecycle Model + +Runners are **persistent, long-running containers** that operate continuously: + +1. **Startup**: Container starts, reads registration token from Secrets Manager, registers with GitHub +2. **Active State**: Runner stays online indefinitely, polling GitHub for workflow jobs +3. **Job Execution**: When a job arrives, runner executes it and returns to polling +4. **Restart**: Only stops when task fails, is manually terminated, or service is updated +5. **Auto-Recovery**: If a task dies, ECS automatically starts a replacement (requires valid token) + ``` GitHub Enterprise (github.e.it.census.gov) │ - │ (OAuth App Authentication) + │ (Token Authentication) ▼ ECS Cluster (per account/region) ecs-ghe-runners-{region} │ - ├── Fargate Task (Runner 1) + ├── Fargate Task (Persistent Runner 1) │ ├── Container: github-runner:{version} + │ ├── Lifecycle: Long-running (24/7) + │ ├── Registration: One-time at startup │ ├── IAM Task Role (AWS Auth) │ ├── Labels: Account ID, Name, Region │ └── Logs → CloudWatch │ - ├── Fargate Task (Runner 2) - └── Fargate Task (Runner N) + ├── Fargate Task (Persistent Runner 2) + └── Fargate Task (Persistent Runner N) + + Lambda Token Refresh (Every 30 min) + │ + └──> AWS Secrets Manager + (Keeps registration token fresh for task restarts) ``` **Network Architecture:** diff --git a/RUNBOOK.md b/RUNBOOK.md new file mode 100644 index 0000000..b319930 --- /dev/null +++ b/RUNBOOK.md @@ -0,0 +1,500 @@ +# GitHub Actions Runner Emergency Runbook + +**Last Updated**: January 15, 2026 +**Owner**: CSVD Team +**Alert Email**: david.j.arnold.jr@census.gov + +## Purpose + +This runbook provides step-by-step procedures for responding to critical GitHub Actions runner incidents. GitHub runners are essential for CI/CD workflows - when they're down, no workflows can execute. + +## Critical Understanding: Persistent Runners & Token Lifecycle + +### Runner Architecture +Our GitHub Actions runners are **persistent, long-running ECS Fargate containers** that: +- Run continuously 24/7, not per-job +- Register **once** at startup and maintain connection +- Handle multiple workflow jobs over their lifetime +- Only restart when: task fails, manual termination, or service update + +### Token Lifecycle & Deadlock Risk +⚠️ **CRITICAL**: Registration tokens are only needed during **runner startup**: +- **Running runners**: Already registered, don't need token refresh +- **Token refresh purpose**: Ensures valid token available when ECS restarts failed tasks +- **Deadlock scenario**: If all runners die AND token expires, ECS cannot auto-recover + - ECS tries to start replacement tasks + - New tasks fail registration (expired token) + - Token refresh workflow can't run (no runners available) + - **Manual intervention required** + +### Why Lambda Token Refresh Matters +- Lambda refreshes registration token every 30 minutes (tokens expire in 1 hour) +- This ensures **whenever a task restarts** (crash, deployment, scale-up), a valid token exists +- Running runners are unaffected by token refresh - they're already registered +- This is **insurance for automated ECS task recovery** + +--- + +## Scenario 1: Lambda Token Refresh Failing + +### Detection +- **CloudWatch Alarm**: `github-runner-token-refresh-errors` +- **Symptoms**: Lambda function experiencing errors or timeouts +- **Impact**: If token expires and all runners go down, we cannot recover automatically + +### Impact Assessment +- **Severity**: CRITICAL +- **Time to Impact**: Tokens expire after 1 hour from last refresh +- **Affected Systems**: + - Running runners are NOT affected (already registered) + - New task starts will fail (scale-up, ECS auto-recovery after crash) + - Service deployments will fail (tasks can't register) + - If all runners die during token expiration, deadlock occurs + +### Investigation Steps + +1. **Check Lambda Function Logs** + ```bash + aws logs tail /aws/lambda/github-runner-token-refresh- --follow --region us-gov-west-1 + ``` + +2. **Verify Lambda Execution** + ```bash + aws lambda invoke --function-name github-runner-token-refresh- \ + --region us-gov-west-1 \ + --log-type Tail \ + /tmp/lambda-output.json + ``` + +3. **Check GitHub API Connectivity** + ```bash + curl -v https://github.e.it.census.gov/api/v3/ + ``` + +4. **Verify Secrets Manager Access** + ```bash + aws secretsmanager get-secret-value \ + --secret-id /github-runners/sct-engineering--/SCT-Engineering-* \ + --region us-gov-west-1 + ``` + +### Common Root Causes + +| Cause | Detection | Resolution | +|-------|-----------|------------| +| **GitHub API Rate Limiting** | HTTP 429 errors in logs | Wait for rate limit reset (check X-RateLimit-Reset header) | +| **Network Connectivity** | Connection timeout errors | Verify VPC endpoints, security groups, proxy configuration | +| **IAM Permission Issues** | Access Denied errors | Review Lambda execution role permissions | +| **GitHub Token Invalid** | HTTP 401/403 errors | Manually refresh GITHUB_TOKEN environment variable | +| **Secrets Manager Issues** | SecretNotFound errors | Verify secret exists and Lambda has access | + +### Resolution Steps + +#### If GitHub Token is Invalid: +```bash +# 1. Generate new token from GitHub +# Navigate to: https://github.e.it.census.gov/settings/tokens + +# 2. Update environment variable (where Terraform runs) +export GITHUB_TOKEN="your-new-token" + +# 3. Trigger Lambda manually to refresh registration token +aws lambda invoke --function-name github-runner-token-refresh- \ + --region us-gov-west-1 \ + /tmp/lambda-output.json + +# 4. Verify token was updated in Secrets Manager +aws secretsmanager get-secret-value \ + --secret-id \ + --region us-gov-west-1 +``` + +#### If Network Connectivity Issue: +```bash +# 1. Check security group rules +aws ec2 describe-security-groups \ + --group-ids \ + --region us-gov-west-1 + +# 2. Verify VPC endpoint connectivity (if applicable) +aws ec2 describe-vpc-endpoints --region us-gov-west-1 + +# 3. Test from Lambda VPC context if needed +``` + +#### If IAM Permission Issue: +```bash +# 1. Review Lambda execution role +aws iam get-role-policy \ + --role-name github-runner-token-refresh--role \ + --policy-name token-refresh-policy + +# 2. Add missing permissions via Terraform (if needed) +# Edit lambda_token_refresh.tf.tmp and apply +``` + +### Post-Resolution +- [ ] Verify Lambda is executing successfully (check CloudWatch Logs) +- [ ] Confirm alarm has cleared +- [ ] Check that runners can still start successfully +- [ ] Document root cause in incident tracking system + +--- + +## Scenario 2: Runners at 50% Capacity + +### Detection +- **CloudWatch Alarm**: `github-runners-critical-capacity` +- **Symptoms**: Running task count below 50% of desired count +- **Impact**: Reduced workflow capacity, potential build queue buildup + +### Impact Assessment +- **Severity**: HIGH +- **Time to Impact**: Immediate (reduced capacity for workflow execution) +- **Affected Systems**: + - Reduced CI/CD pipeline throughput + - Some workflows may queue waiting for available runners + - **Critical**: Fewer runners = higher risk if remaining runners fail + +### Investigation Steps + +1. **Check Current Runner Status** + ```bash + aws ecs describe-services \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --services SCT-Engineering \ + --region us-gov-west-1 + ``` + +2. **Check Task Status and Failures** + ```bash + # Get recent stopped tasks + aws ecs list-tasks \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --service-name SCT-Engineering \ + --desired-status STOPPED \ + --region us-gov-west-1 \ + --max-items 10 + + # Describe a stopped task to see reason + aws ecs describe-tasks \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --tasks \ + --region us-gov-west-1 + ``` + +3. **Check Container Logs** + ```bash + aws logs tail /ecs-ghe-runners/-- \ + --follow \ + --region us-gov-west-1 \ + --filter-pattern "error" + ``` + +4. **Verify ECS Service Health** + ```bash + aws ecs describe-services \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --services SCT-Engineering \ + --region us-gov-west-1 \ + --query 'services[0].events[:10]' + ``` + +### Common Root Causes + +| Cause | Detection | Resolution | +|-------|-----------|------------| +| **Image Pull Failures** | `CannotPullContainerError` | Verify ECR image exists, check IAM permissions, enable ECR clone | +| **Resource Constraints** | `OutOfMemoryError` or CPU throttling | Increase task CPU/memory in terraform | +| **Network Issues** | Tasks fail to start | Check VPC, subnets, security groups, NAT gateway | +| **Registration Token Expired** | Tasks start but fail to register with GitHub | Check Lambda token refresh, verify token in Secrets Manager | +| **ECS Service Issues** | Deployment failures, task placement errors | Check service events, task definition validity | +| **Task Crash Loop** | Tasks repeatedly starting and stopping | Check container logs for application errors, verify GitHub connectivity | + +### Resolution Steps + +#### If Image Pull Failures: +```bash +# 1. Verify ECR image exists +aws ecr describe-images \ + --repository-name github-runners/github-runner \ + --region us-gov-west-1 + +# 2. If image doesn't exist, enable ECR cloning in Terraform +# Edit default.auto.tfvars: +# enable_ecr_clone = true +# ecr_clone_images = ["github-runner"] + +# 3. Apply Terraform +terraform apply -target=module.ecr-clone +``` + +#### If Resource Constraints: +```bash +# 1. Edit default.auto.tfvars +# task_cpu = 4096 # Increase from 2048 +# task_memory = 8192 # Increase from 4096 + +# 2. Apply Terraform +terraform apply +``` + +#### If Registration Token Issue: +```bash +# 1. Check token in Secrets Manager +aws secretsmanager get-secret-value \ + --secret-id \ + --region us-gov-west-1 + +# 2. Manually trigger token refresh +aws lambda invoke --function-name github-runner-token-refresh- \ + --region us-gov-west-1 \ + /tmp/output.json + +# 3. Force service update to pick up new token +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --service SCT-Engineering \ + --force-new-deployment \ + --region us-gov-west-1 +``` + +### Post-Resolution +- [ ] Verify runner count has returned to desired count +- [ ] Check alarm has cleared +- [ ] Monitor for 30 minutes to ensure stability +- [ ] Review and address any similar issues in logs + +--- + +## Scenario 3: All Runners Down (EMERGENCY) + +### Detection +- **CloudWatch Alarm**: `github-runners-emergency-all-down` +- **Symptoms**: Zero running tasks, all workflows blocked +- **Impact**: Complete CI/CD outage + +### Impact Assessment +- **Severity**: EMERGENCY +- **Time to Impact**: IMMEDIATE +- **Affected Systems**: All GitHub workflows, entire CI/CD pipeline + +⚠️ **CRITICAL WARNING**: Deadlock Scenario Risk +- **Normal operation**: ECS automatically restarts failed tasks +- **Deadlock occurs when**: All runners down + registration token expired +- **Why deadlock happens**: + - ECS tries to start replacement tasks + - Tasks need valid registration token from Secrets Manager + - If token expired, new tasks fail to register with GitHub + - Tasks crash, ECS retries indefinitely with same expired token + - Manual intervention required to break the loop +- **Time window**: Token expires 1 hour after last Lambda refresh +- **Prevention**: Lambda refreshes every 30 min (50% safety margin) + +### Immediate Response (First 5 Minutes) + +1. **Assess Token Expiration Risk** + ```bash + # Check when token was last refreshed + aws secretsmanager describe-secret \ + --secret-id \ + --region us-gov-west-1 \ + --query 'LastChangedDate' + + # If token is >50 minutes old, we have ~10 minutes before deadlock + ``` + +2. **Quick Status Check** + ```bash + # Check service status + aws ecs describe-services \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --services SCT-Engineering \ + --region us-gov-west-1 \ + --query 'services[0].[runningCount,desiredCount,deployments[0].status]' + ``` + +3. **Check Recent Task Failures** + ```bash + # Get most recent task failure + aws ecs describe-tasks \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --tasks $(aws ecs list-tasks --cluster ecs-ghe-runners-us-gov-west-1 \ + --service-name SCT-Engineering --desired-status STOPPED \ + --region us-gov-west-1 --max-items 1 --query 'taskArns[0]' --output text) \ + --region us-gov-west-1 + ``` + +### Investigation & Resolution + +#### Phase 1: Identify Root Cause +```bash +# Check ECS service events (last 20) +aws ecs describe-services \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --services SCT-Engineering \ + --region us-gov-west-1 \ + --query 'services[0].events[:20]' + +# Check CloudWatch Logs for errors +aws logs tail /ecs-ghe-runners/-- \ + --since 1h \ + --region us-gov-west-1 \ + --filter-pattern "ERROR" +``` + +#### Phase 2: Emergency Token Refresh +If token is at risk of expiring: +```bash +# 1. Ensure GITHUB_TOKEN is valid +echo $GITHUB_TOKEN # Should be set in environment + +# 2. Force Lambda token refresh +aws lambda invoke \ + --function-name github-runner-token-refresh- \ + --region us-gov-west-1 \ + --log-type Tail \ + /tmp/lambda-output.json && cat /tmp/lambda-output.json + +# 3. Verify new token in Secrets Manager +aws secretsmanager get-secret-value \ + --secret-id \ + --region us-gov-west-1 \ + --query 'SecretString' \ + --output text | jq -r '.token' | wc -c # Should be ~255 chars +``` + +#### Phase 3: Resolve Service Issues + +**If Image Pull Failure:** +```bash +# Quick fix: Use known good image version +# Edit default.auto.tfvars: image_version = "1.67.0" +terraform apply -auto-approve +``` + +**If Resource Issues:** +```bash +# Check ECS cluster capacity +aws ecs describe-clusters \ + --clusters ecs-ghe-runners-us-gov-west-1 \ + --region us-gov-west-1 \ + --include ATTACHMENTS + +# Verify Fargate capacity providers +aws ecs describe-capacity-providers \ + --region us-gov-west-1 +``` + +**If Network Issues:** +```bash +# Verify security group allows outbound HTTPS +aws ec2 describe-security-groups \ + --group-ids \ + --region us-gov-west-1 + +# Check subnet has NAT gateway or internet access +aws ec2 describe-subnets \ + --subnet-ids \ + --region us-gov-west-1 +``` + +**If Service Definition Issues:** +```bash +# Force new deployment with current configuration +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --service SCT-Engineering \ + --force-new-deployment \ + --region us-gov-west-1 + +# Monitor deployment +watch -n 5 'aws ecs describe-services \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --services SCT-Engineering \ + --region us-gov-west-1 \ + --query "services[0].[runningCount,desiredCount]" \ + --output table' +``` + +### Escalation Criteria + +Escalate to senior engineering if: +- All resolution attempts fail after 30 minutes +- Token is <5 minutes from expiration and runners won't start +- Infrastructure issues beyond runner service (AWS outage, VPC issues) +- Security or compliance concerns + +### Post-Incident Requirements + +After service is restored: +- [ ] **Document timeline** in incident tracking system +- [ ] **Identify root cause** and contributing factors +- [ ] **Update monitoring** if gaps were identified +- [ ] **Create prevention tasks** (Jira tickets, backlog items) +- [ ] **Conduct blameless postmortem** within 48 hours +- [ ] **Update this runbook** with lessons learned + +--- + +## Quick Reference Commands + +### Check Runner Status +```bash +aws ecs describe-services \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --services SCT-Engineering \ + --region us-gov-west-1 \ + --query 'services[0].[runningCount,desiredCount,deployments[0].status]' \ + --output table +``` + +### View Recent Logs +```bash +aws logs tail /ecs-ghe-runners/-- \ + --follow --region us-gov-west-1 +``` + +### Force Token Refresh +```bash +aws lambda invoke \ + --function-name github-runner-token-refresh- \ + --region us-gov-west-1 \ + /tmp/output.json && cat /tmp/output.json +``` + +### Force Service Redeployment +```bash +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --service SCT-Engineering \ + --force-new-deployment \ + --region us-gov-west-1 +``` + +### View CloudWatch Dashboard +```bash +# Get dashboard URL from Terraform output +terraform output dashboard_url +``` + +--- + +## Contacts + +- **Primary On-Call**: Check PagerDuty/on-call schedule +- **CSVD Team Lead**: [Name/Contact] +- **AWS Support**: Enterprise support portal +- **GitHub Enterprise Support**: github.e.it.census.gov support + +## Related Documentation + +- [Monitoring Implementation Plan](./MONITORING_IMPLEMENTATION_PLAN.md) +- [GitHub App Setup](./GITHUB_APP_SETUP.md) +- [AWS Permissions](./AWS_PERMISSIONS.md) +- [Security Review](./SECURITY_REVIEW.md) + +--- + +**Version History** +- 2026-01-15: Initial version created with monitoring implementation diff --git a/aws_ecs_cluster_capacity_providers.fargate b/aws_ecs_cluster_capacity_providers.fargate new file mode 100644 index 0000000..b72a076 --- /dev/null +++ b/aws_ecs_cluster_capacity_providers.fargate @@ -0,0 +1,42 @@ +{ + "version": 4, + "terraform_version": "1.9.1", + "serial": 1, + "lineage": "a5f87bd9-a052-6497-cea8-6beed3ccdc9d", + "outputs": {}, + "resources": [ + { + "mode": "managed", + "type": "aws_ecs_cluster_capacity_providers", + "name": "fargate", + "provider": "provider[\"registry.terraform.io/hashicorp/aws\"]", + "instances": [ + { + "schema_version": 0, + "attributes": { + "capacity_providers": [ + "FARGATE" + ], + "cluster_name": "ecs-ghe-runners-us-gov-west-1", + "default_capacity_provider_strategy": [ + { + "base": 1, + "capacity_provider": "FARGATE", + "weight": 100 + } + ], + "id": "ecs-ghe-runners-us-gov-west-1" + }, + "sensitive_attributes": [], + "private": "bnVsbA==", + "dependencies": [ + "aws_ecs_cluster.github-runner", + "data.aws_ecs_cluster.github-runner", + "data.aws_region.current" + ] + } + ] + } + ], + "check_results": null +} diff --git a/default.auto.tfvars b/default.auto.tfvars index 05e0979..818c65c 100644 --- a/default.auto.tfvars +++ b/default.auto.tfvars @@ -27,3 +27,6 @@ certs = { aws_account = "csvd-dev-ew" repo_org = "SCT-Engineering" + +# Monitoring Configuration +alert_email = "david.j.arnold.jr@census.gov" diff --git a/example.auto.tfvars b/example.auto.tfvars deleted file mode 100644 index 013d383..0000000 --- a/example.auto.tfvars +++ /dev/null @@ -1,90 +0,0 @@ -# Example Terraform Variables Configuration -# Copy this file to a workspace-specific .tfvars file and customize -# Example: csvd-229685449397-us-gov-east-1.auto.tfvars - -# ============================================================================= -# GitHub App Authentication (Required) -# ============================================================================= -# See GITHUB_APP_SETUP.md for setup instructions -# These values are organization-specific and must be configured per workspace - -github_app_id = "123456" # Your GitHub App ID -github_app_installation_id = "12345678" # Installation ID for your org -github_app_pem_file = "~/.github-apps/runner-mgmt.pem" # Path to private key - -# ============================================================================= -# GitHub Configuration (Required) -# ============================================================================= - -repo_org = "CSVD" # GitHub organization name -server_url = "https://github.e.it.census.gov" # GitHub Enterprise URL - -# ============================================================================= -# AWS Configuration (Required) -# ============================================================================= - -aws_account = "csvd-dev-ew" # AWS account identifier -region = "us-gov-east-1" # AWS region - -# Network Configuration -vpc_id = "vpc-0abc123def456789" # VPC ID for runner deployment -subnets = [ # Private subnet IDs - "subnet-0abc123", - "subnet-0def456" -] -security_groups = ["sg-0xyz789abc"] # Security group IDs - -# ============================================================================= -# Runner Configuration (Required) -# ============================================================================= - -image_name = "github-runner" # Container image name -image_version = "2.311.0" # GitHub Actions runner version -desired_count = 3 # Number of concurrent runners - -# ============================================================================= -# Task Configuration (Optional) -# ============================================================================= - -task_cpu = 1024 # Task CPU (1 vCPU = 1024) -task_memory = 2048 # Task memory in MB - -# ============================================================================= -# Labels Configuration (Optional) -# ============================================================================= -# Additional labels for runner identification in workflows -# Default labels are automatically added: account ID, account name, region - -labels = [ - "ecs", - "fargate", - "self-hosted" -] - -# ============================================================================= -# Network Configuration (Optional) -# ============================================================================= - -assign_public_ip = false # Assign public IP to tasks -proxy_enabled = true # Enable corporate proxy -proxy_url = "proxy.tco.census.gov:3128" # Proxy URL - -# VPC Endpoints (reduces NAT Gateway costs) -create_vpc_endpoint = false # Create VPC endpoints for AWS services - -# ============================================================================= -# Monitoring Configuration (Optional) -# ============================================================================= - -log_retention_days = 7 # CloudWatch log retention - -# ============================================================================= -# Tags (Optional) -# ============================================================================= - -tags = { - Environment = "development" - ManagedBy = "Terraform" - Project = "GitHub Actions Runners" - Owner = "DevOps Team" -} diff --git a/example.tfvars.template b/example.tfvars.template new file mode 100644 index 0000000..013b614 --- /dev/null +++ b/example.tfvars.template @@ -0,0 +1,111 @@ +# Example Terraform Variables Configuration +# Populated from running service: arn:aws-us-gov:ecs:us-gov-west-1:229685449397:service/ecs-ghe-runners-us-gov-west-1/SCT-Engineering +# Last updated: 2026-01-12 + +# ============================================================================= +# GitHub App Authentication (Required) +# ============================================================================= +# See GITHUB_APP_SETUP.md for setup instructions +# These values are organization-specific and must be configured per workspace +# NOTE: Configure these with your actual GitHub App credentials + +github_app_id = "YOUR_GITHUB_APP_ID" # Your GitHub App ID +github_app_installation_id = "YOUR_INSTALLATION_ID" # Installation ID for your org +github_app_pem_file = "~/.github-apps/runner-mgmt.pem" # Path to private key + +# ============================================================================= +# GitHub Configuration (Required) +# ============================================================================= + +repo_org = "SCT-Engineering" # GitHub organization name (from running service) +server_url = "https://github.e.it.census.gov" # GitHub Enterprise URL (from running service) + +# ============================================================================= +# AWS Configuration (Required) +# ============================================================================= + +aws_account = "cvsd-dev-ew" # AWS account identifier (from running service) +region = "us-gov-west-1" # AWS region (from running service) + +# Network Configuration (from running service) +vpc_id = "vpc-00576a396ec570b94" # VPC ID for runner deployment +subnets = [ # Private subnet IDs + "subnet-0b1992a84536c581b" +] +security_groups = ["sg-0641c697588b9aa6b"] # Security group IDs + +# ============================================================================= +# Runner Configuration (Required) +# ============================================================================= + +image_name = "github-runner" # Container image name (from running service) +image_version = "1.69.0" # GitHub Actions runner version (from running service) +desired_count = 1 # Number of concurrent runners (from running service) + +# ============================================================================= +# Task Configuration (from running service) +# ============================================================================= + +task_cpu = 2048 # Task CPU (2 vCPUs) - from running service +task_memory = 4096 # Task memory in MB (4 GB) - from running service + +# ============================================================================= +# Labels Configuration (from running service) +# ============================================================================= +# Labels extracted from running service: +# cvsd-dev-ew, sct-engineering, 229685449397-us-gov-west-1, 229685449397, +# us-gov-west-1, ecs-github-runner, ubuntu-latest + +labels = [ + "ecs", + "fargate", + "self-hosted" +] + +# ============================================================================= +# Network Configuration (from running service) +# ============================================================================= + +assign_public_ip = false # Assign public IP to tasks (DISABLED in service) +proxy_enabled = true # Enable corporate proxy +proxy_url = "proxy.tco.census.gov:3128" # Proxy URL (from running service) + +# VPC Endpoints (reduces NAT Gateway costs) +create_vpc_endpoint = false # VPC endpoints not used in running service + +# ============================================================================= +# Monitoring Configuration +# ============================================================================= + +log_retention_days = 7 # CloudWatch log retention +alert_email = "david.j.arnold.jr@census.gov" # Email for monitoring alerts + +# ============================================================================= +# ECS Cluster Configuration +# ============================================================================= + +ecs_cluster_name = "ecs-ghe-runners" # Cluster name (derived from service) +create_ecs_cluster = true # Create ECS cluster if it doesn't exist + +# ============================================================================= +# Certificates Configuration (from running service) +# ============================================================================= + +certs = { + bucket = "csvd-dev-ew-github-actions" # S3 bucket for certificates + key = "katello-server-ca.pem" # Certificate file key +} + +# ============================================================================= +# Tags (Optional) +# ============================================================================= + +tags = { + Environment = "development" + ManagedBy = "Terraform" + Project = "GitHub Actions Runners" + Owner = "DevOps Team" + Account = "229685449397" + Region = "us-gov-west-1" +} + diff --git a/lambda/requirements_pat.txt b/lambda/requirements_pat.txt new file mode 100644 index 0000000..6de9a68 --- /dev/null +++ b/lambda/requirements_pat.txt @@ -0,0 +1,4 @@ +# Python dependencies for GitHub Runner Token Refresh Lambda +# PAT-based authentication (simplified) + +urllib3>=2.0.0 diff --git a/lambda/token_refresh_pat.py b/lambda/token_refresh_pat.py new file mode 100644 index 0000000..4b15a1c --- /dev/null +++ b/lambda/token_refresh_pat.py @@ -0,0 +1,191 @@ +""" +Lambda function to refresh GitHub Actions runner registration tokens. + +CRITICAL CONTEXT: +- Runners are persistent, long-running ECS containers (not ephemeral) +- Registration tokens are ONLY used during container startup +- Running runners don't need token refresh (already registered) +- This Lambda ensures valid tokens are available for ECS task restarts + +Purpose: Prevent deadlock scenario where: +1. All runner containers die (crash, deployment failure, etc.) +2. Registration token expires (1 hour lifetime) +3. ECS tries to start replacement tasks but they fail (expired token) +4. Manual intervention required to break the loop + +Schedule: Runs every 30 minutes via EventBridge (50% safety margin) +Authentication: Uses Personal Access Token (PAT) from environment variable +""" + +import boto3 +import json +import os +import urllib3 +from typing import Dict, Any + +# Initialize AWS clients +secrets_manager = boto3.client('secretsmanager') +http = urllib3.PoolManager() + + +def get_github_registration_token(github_url: str, org: str, access_token: str) -> Dict[str, str]: + """ + Retrieve a fresh GitHub Actions registration token from the GitHub API. + + Args: + github_url: Base GitHub Enterprise URL + org: GitHub organization name + access_token: GitHub Personal Access Token + + Returns: + Dict with 'token' and 'expires_at' keys + + Raises: + Exception: If GitHub API request fails + """ + api_url = f"{github_url}/api/v3/orgs/{org}/actions/runners/registration-token" + + headers = { + 'Authorization': f'token {access_token}', + 'Accept': 'application/vnd.github.v3+json', + 'User-Agent': 'AWS-Lambda-GitHub-Runner-Token-Refresh' + } + + print(f"Requesting registration token from: {api_url}") + + response = http.request( + 'POST', + api_url, + headers=headers + ) + + if response.status == 201: + data = json.loads(response.data.decode('utf-8')) + token = data.get('token') + expires_at = data.get('expires_at') + + print(f"Successfully retrieved registration token (expires: {expires_at})") + return { + 'token': token, + 'expires_at': expires_at + } + else: + error_msg = f"GitHub API request failed with status {response.status}: {response.data.decode('utf-8')}" + print(error_msg) + raise Exception(error_msg) + + +def update_secrets_manager(secret_name: str, token_data: Dict[str, str]) -> None: + """ + Update the GitHub registration token in AWS Secrets Manager. + + Args: + secret_name: Name/ARN of the secret in Secrets Manager + token_data: Dict with token and expiration info + + Raises: + Exception: If Secrets Manager update fails + """ + try: + print(f"Updating secret: {secret_name}") + + # Store as JSON with token and metadata + secret_value = json.dumps(token_data) + + secrets_manager.put_secret_value( + SecretId=secret_name, + SecretString=secret_value + ) + + print(f"Successfully updated secret: {secret_name}") + + except Exception as e: + error_msg = f"Failed to update Secrets Manager: {str(e)}" + print(error_msg) + raise Exception(error_msg) + + +def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]: + """ + Lambda handler function triggered by EventBridge. + + Environment Variables Required: + GITHUB_TOKEN: GitHub Personal Access Token with admin:org scope + GITHUB_ORG: GitHub organization name + GITHUB_URL: GitHub Enterprise base URL + SECRET_NAME: Name/ARN of the secret in Secrets Manager + + Args: + event: EventBridge event (not used) + context: Lambda context object + + Returns: + Response dict with status code and message + """ + print("=== GitHub Runner Token Refresh Lambda ===") + print(f"Request ID: {context.request_id}") + print(f"Function: {context.function_name}") + + # Get environment variables + github_token = os.environ.get('GITHUB_TOKEN') + github_org = os.environ.get('GITHUB_ORG') + github_url = os.environ.get('GITHUB_URL') + secret_name = os.environ.get('SECRET_NAME') + + # Validate environment variables + required_vars = { + 'GITHUB_TOKEN': github_token, + 'GITHUB_ORG': github_org, + 'GITHUB_URL': github_url, + 'SECRET_NAME': secret_name + } + + missing = [var for var, val in required_vars.items() if not val] + if missing: + error_msg = f"Missing required environment variables: {', '.join(missing)}" + print(error_msg) + return { + 'statusCode': 500, + 'body': json.dumps({'error': error_msg}) + } + + # All required environment variables are present + assert github_token is not None + assert github_org is not None + assert github_url is not None + assert secret_name is not None + + print(f"GitHub Org: {github_org}") + print(f"GitHub URL: {github_url}") + print(f"Secret Name: {secret_name}") + print(f"GitHub Token: {'*' * (len(github_token) - 4)}{github_token[-4:] if len(github_token) > 4 else '****'}") + + try: + # Get fresh registration token from GitHub using PAT + print("Requesting runner registration token...") + token_data = get_github_registration_token(github_url, github_org, github_token) + + # Update Secrets Manager with new token + update_secrets_manager(secret_name, token_data) + + success_msg = "Token refreshed successfully" + print(f"=== {success_msg} ===") + + return { + 'statusCode': 200, + 'body': json.dumps({ + 'message': success_msg, + 'secret_name': secret_name, + 'github_org': github_org, + 'expires_at': token_data.get('expires_at') + }) + } + + except Exception as e: + error_msg = f"Token refresh failed: {str(e)}" + print(f"=== ERROR: {error_msg} ===") + + return { + 'statusCode': 500, + 'body': json.dumps({'error': error_msg}) + } diff --git a/lambda_token_refresh.tf b/lambda_token_refresh.tf index ad8b466..76afb3f 100644 --- a/lambda_token_refresh.tf +++ b/lambda_token_refresh.tf @@ -1,5 +1,16 @@ # Lambda function to automatically refresh GitHub Actions registration tokens -# This prevents token expiration issues by refreshing the token every 30 minutes +# +# IMPORTANT: Runners are persistent, long-running containers. The registration token +# is ONLY needed during container startup. This Lambda ensures that whenever ECS +# needs to restart a task (crash, deployment, scale-up), a valid token is available. +# +# Token Lifecycle: +# - Running runners: Already registered, don't need token refresh +# - Token refresh purpose: Insurance for ECS automatic task recovery +# - Refresh interval: Every 30 minutes (tokens expire in 1 hour) +# - Critical for: Preventing deadlock when all runners down + token expires +# +# Authentication: Uses Personal Access Token (GITHUB_TOKEN environment variable) locals { lambda_function_name = "github-runner-token-refresh-${var.aws_account}" @@ -8,8 +19,8 @@ locals { # Install Python dependencies locally for Lambda packaging resource "null_resource" "lambda_dependencies" { triggers = { - requirements = filemd5("${path.module}/lambda/requirements.txt") - source_code = filemd5("${path.module}/lambda/token_refresh.py") + requirements = filemd5("${path.module}/lambda/requirements_pat.txt") + source_code = filemd5("${path.module}/lambda/token_refresh_pat.py") } provisioner "local-exec" { @@ -17,8 +28,8 @@ resource "null_resource" "lambda_dependencies" { cd ${path.module}/lambda rm -rf package mkdir -p package - pip3 install --target package -r requirements.txt --platform manylinux2014_x86_64 --only-binary=:all: - cp token_refresh.py package/ + pip3 install --target package -r requirements_pat.txt --platform manylinux2014_x86_64 --only-binary=:all: + cp token_refresh_pat.py package/token_refresh.py EOT } } @@ -44,12 +55,10 @@ resource "aws_lambda_function" "token_refresh" { environment { variables = { - GITHUB_APP_ID = var.github_app_id - GITHUB_APP_INSTALLATION_ID = var.github_app_installation_id - GITHUB_APP_PEM_FILE = var.github_app_pem_file - GITHUB_ORG = var.repo_org - GITHUB_URL = var.server_url - SECRET_NAME = aws_secretsmanager_secret.secret.name + GITHUB_TOKEN = var.github_token + GITHUB_ORG = var.repo_org + GITHUB_URL = var.server_url + SECRET_NAME = module.github-runner.secret_name } } @@ -124,7 +133,7 @@ resource "aws_iam_role_policy" "lambda_refresh_policy" { "secretsmanager:GetSecretValue", "secretsmanager:PutSecretValue" ] - Resource = aws_secretsmanager_secret.secret.arn + Resource = module.github-runner.secret_arn }, { Effect = "Allow" @@ -133,7 +142,7 @@ resource "aws_iam_role_policy" "lambda_refresh_policy" { "logs:CreateLogStream", "logs:PutLogEvents" ] - Resource = "arn:aws:logs:*:*:log-group:/aws/lambda/${local.lambda_function_name}:*" + Resource = "arn:${data.aws_partition.current.partition}:logs:*:*:log-group:/aws/lambda/${local.lambda_function_name}:*" } ] }) @@ -150,7 +159,7 @@ resource "aws_cloudwatch_log_group" "lambda_logs" { } } -# CloudWatch Alarm for Lambda failures +# CloudWatch Alarm for Lambda failures - connected to SNS resource "aws_cloudwatch_metric_alarm" "lambda_errors" { alarm_name = "${local.lambda_function_name}-errors" comparison_operator = "GreaterThanThreshold" @@ -163,6 +172,9 @@ resource "aws_cloudwatch_metric_alarm" "lambda_errors" { alarm_description = "Alert when Lambda token refresh fails" treat_missing_data = "notBreaching" + alarm_actions = [aws_sns_topic.github_runner_critical_alerts.arn] + ok_actions = [aws_sns_topic.github_runner_critical_alerts.arn] + dimensions = { FunctionName = aws_lambda_function.token_refresh.function_name } @@ -173,13 +185,26 @@ resource "aws_cloudwatch_metric_alarm" "lambda_errors" { } } +# Data source for AWS partition (for ARN construction) +data "aws_partition" "current" {} + # Output Lambda function details output "lambda_token_refresh_function_name" { description = "Name of the Lambda function that refreshes GitHub tokens" value = aws_lambda_function.token_refresh.function_name } +output "lambda_token_refresh_function_arn" { + description = "ARN of the Lambda function that refreshes GitHub tokens" + value = aws_lambda_function.token_refresh.arn +} + output "lambda_token_refresh_schedule" { description = "Schedule for automatic token refresh" value = aws_cloudwatch_event_rule.token_refresh_schedule.schedule_expression } + +output "lambda_token_refresh_log_group" { + description = "CloudWatch log group for Lambda function" + value = aws_cloudwatch_log_group.lambda_logs.name +} diff --git a/lambda_token_refresh.tf.tmp b/lambda_token_refresh.tf.tmp new file mode 100644 index 0000000..ad8b466 --- /dev/null +++ b/lambda_token_refresh.tf.tmp @@ -0,0 +1,185 @@ +# Lambda function to automatically refresh GitHub Actions registration tokens +# This prevents token expiration issues by refreshing the token every 30 minutes + +locals { + lambda_function_name = "github-runner-token-refresh-${var.aws_account}" +} + +# Install Python dependencies locally for Lambda packaging +resource "null_resource" "lambda_dependencies" { + triggers = { + requirements = filemd5("${path.module}/lambda/requirements.txt") + source_code = filemd5("${path.module}/lambda/token_refresh.py") + } + + provisioner "local-exec" { + command = <<-EOT + cd ${path.module}/lambda + rm -rf package + mkdir -p package + pip3 install --target package -r requirements.txt --platform manylinux2014_x86_64 --only-binary=:all: + cp token_refresh.py package/ + EOT + } +} + +# Create ZIP file for Lambda deployment with dependencies +data "archive_file" "token_refresh_lambda" { + type = "zip" + source_dir = "${path.module}/lambda/package" + output_path = "${path.module}/lambda/token_refresh.zip" + + depends_on = [null_resource.lambda_dependencies] +} + +# Lambda function +resource "aws_lambda_function" "token_refresh" { + filename = data.archive_file.token_refresh_lambda.output_path + function_name = local.lambda_function_name + role = aws_iam_role.lambda_refresh_role.arn + handler = "token_refresh.lambda_handler" + source_code_hash = data.archive_file.token_refresh_lambda.output_base64sha256 + runtime = "python3.11" + timeout = 60 + + environment { + variables = { + GITHUB_APP_ID = var.github_app_id + GITHUB_APP_INSTALLATION_ID = var.github_app_installation_id + GITHUB_APP_PEM_FILE = var.github_app_pem_file + GITHUB_ORG = var.repo_org + GITHUB_URL = var.server_url + SECRET_NAME = aws_secretsmanager_secret.secret.name + } + } + + tags = { + Name = local.lambda_function_name + Environment = var.aws_account + Purpose = "GitHub Runner Token Refresh" + } +} + +# CloudWatch Event Rule - trigger every 30 minutes +resource "aws_cloudwatch_event_rule" "token_refresh_schedule" { + name = "${local.lambda_function_name}-schedule" + description = "Refresh GitHub runner registration token every 30 minutes" + schedule_expression = "rate(30 minutes)" + + tags = { + Name = "${local.lambda_function_name}-schedule" + Environment = var.aws_account + } +} + +# CloudWatch Event Target +resource "aws_cloudwatch_event_target" "token_refresh_target" { + rule = aws_cloudwatch_event_rule.token_refresh_schedule.name + target_id = "RefreshTokenLambda" + arn = aws_lambda_function.token_refresh.arn +} + +# Allow EventBridge to invoke Lambda +resource "aws_lambda_permission" "allow_eventbridge" { + statement_id = "AllowExecutionFromEventBridge" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.token_refresh.function_name + principal = "events.amazonaws.com" + source_arn = aws_cloudwatch_event_rule.token_refresh_schedule.arn +} + +# IAM Role for Lambda +resource "aws_iam_role" "lambda_refresh_role" { + name = "${local.lambda_function_name}-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "lambda.amazonaws.com" + } + }] + }) + + tags = { + Name = "${local.lambda_function_name}-role" + Environment = var.aws_account + } +} + +# IAM Policy for Lambda +resource "aws_iam_role_policy" "lambda_refresh_policy" { + name = "token-refresh-policy" + role = aws_iam_role.lambda_refresh_role.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "secretsmanager:UpdateSecret", + "secretsmanager:GetSecretValue", + "secretsmanager:PutSecretValue" + ] + Resource = aws_secretsmanager_secret.secret.arn + }, + { + Effect = "Allow" + Action = [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents" + ] + Resource = "arn:aws:logs:*:*:log-group:/aws/lambda/${local.lambda_function_name}:*" + } + ] + }) +} + +# CloudWatch Log Group for Lambda +resource "aws_cloudwatch_log_group" "lambda_logs" { + name = "/aws/lambda/${local.lambda_function_name}" + retention_in_days = 7 + + tags = { + Name = "${local.lambda_function_name}-logs" + Environment = var.aws_account + } +} + +# CloudWatch Alarm for Lambda failures +resource "aws_cloudwatch_metric_alarm" "lambda_errors" { + alarm_name = "${local.lambda_function_name}-errors" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = 2 + metric_name = "Errors" + namespace = "AWS/Lambda" + period = 300 + statistic = "Sum" + threshold = 1 + alarm_description = "Alert when Lambda token refresh fails" + treat_missing_data = "notBreaching" + + dimensions = { + FunctionName = aws_lambda_function.token_refresh.function_name + } + + tags = { + Name = "${local.lambda_function_name}-errors" + Environment = var.aws_account + } +} + +# Output Lambda function details +output "lambda_token_refresh_function_name" { + description = "Name of the Lambda function that refreshes GitHub tokens" + value = aws_lambda_function.token_refresh.function_name +} + +output "lambda_token_refresh_schedule" { + description = "Schedule for automatic token refresh" + value = aws_cloudwatch_event_rule.token_refresh_schedule.schedule_expression +} diff --git a/monitoring.tf b/monitoring.tf new file mode 100644 index 0000000..c2fff76 --- /dev/null +++ b/monitoring.tf @@ -0,0 +1,227 @@ +# GitHub Runner Monitoring Infrastructure +# +# This file implements critical monitoring for GitHub Actions runners on ECS Fargate +# +# RUNNER MODEL: Persistent, long-running containers (not ephemeral) +# - Runners stay online 24/7, handling multiple jobs +# - Only restart on: task failure, manual stop, service deployment +# - Monitoring tracks runner CONTAINER health, not individual job execution +# +# Key monitoring areas: +# 1. SNS alerting for critical events +# 2. Lambda token refresh monitoring +# 3. Runner availability alarms (container count, not job count) +# 4. CloudWatch dashboard for visibility + +# ============================================================================== +# SNS Topic and Subscriptions for Critical Alerts +# ============================================================================== + +resource "aws_sns_topic" "github_runner_critical_alerts" { + name = "github-runner-critical-alerts-${data.aws_caller_identity.current.account_id}-${data.aws_region.current.name}" + display_name = "GitHub Runner Critical Alerts" + + tags = { + Name = "github-runner-critical-alerts" + Environment = var.aws_account + Purpose = "Critical alerting for GitHub Actions runners" + } +} + +resource "aws_sns_topic_subscription" "alert_email" { + topic_arn = aws_sns_topic.github_runner_critical_alerts.arn + protocol = "email" + endpoint = var.alert_email +} + +# ============================================================================== +# Runner Availability Alarms +# ============================================================================== + +# Critical alarm: Runners below 50% capacity +resource "aws_cloudwatch_metric_alarm" "runners_critical" { + alarm_name = "github-runners-critical-capacity-${var.aws_account}" + comparison_operator = "LessThanThreshold" + evaluation_periods = 2 + metric_name = "RunningTasksCount" + namespace = "ECS/ContainerInsights" + period = 300 + statistic = "Average" + threshold = ceil(var.desired_count * 0.5) + alarm_description = "CRITICAL: GitHub runners below 50% capacity. Current capacity may not handle workload." + treat_missing_data = "breaching" + + alarm_actions = [aws_sns_topic.github_runner_critical_alerts.arn] + ok_actions = [aws_sns_topic.github_runner_critical_alerts.arn] + + dimensions = { + ClusterName = var.create_ecs_cluster ? aws_ecs_cluster.github-runner[0].name : data.aws_ecs_cluster.github-runner[0].cluster_name + ServiceName = var.repo_org + } + + tags = { + Name = "github-runners-critical-capacity" + Environment = var.aws_account + Severity = "critical" + } +} + +# Emergency alarm: All runners down +resource "aws_cloudwatch_metric_alarm" "runners_emergency" { + alarm_name = "github-runners-emergency-all-down-${var.aws_account}" + comparison_operator = "LessThanOrEqualToThreshold" + evaluation_periods = 1 + metric_name = "RunningTasksCount" + namespace = "ECS/ContainerInsights" + period = 60 + statistic = "Maximum" + threshold = 0 + alarm_description = "EMERGENCY: All GitHub runners are down! Workflows cannot execute. Token refresh may fail." + treat_missing_data = "breaching" + + alarm_actions = [aws_sns_topic.github_runner_critical_alerts.arn] + ok_actions = [aws_sns_topic.github_runner_critical_alerts.arn] + + dimensions = { + ClusterName = var.create_ecs_cluster ? aws_ecs_cluster.github-runner[0].name : data.aws_ecs_cluster.github-runner[0].cluster_name + ServiceName = var.repo_org + } + + tags = { + Name = "github-runners-emergency-all-down" + Environment = var.aws_account + Severity = "emergency" + } +} + +# ============================================================================== +# CloudWatch Dashboard +# ============================================================================== + +resource "aws_cloudwatch_dashboard" "github_runners" { + dashboard_name = "github-runners-${var.aws_account}" + + dashboard_body = jsonencode({ + widgets = [ + # Runner Count Widget + { + type = "metric" + properties = { + metrics = [ + ["ECS/ContainerInsights", "RunningTasksCount", "ClusterName", var.create_ecs_cluster ? aws_ecs_cluster.github-runner[0].name : data.aws_ecs_cluster.github-runner[0].cluster_name, "ServiceName", var.repo_org, { stat = "Average" }], + ["...", { stat = "Maximum" }], + ["...", { stat = "Minimum" }] + ] + title = "GitHub Runner Count (Running Tasks)" + region = data.aws_region.current.name + yAxis = { left = { min = 0 } } + period = 300 + annotations = { + horizontal = [ + { + label = "Desired Count" + value = var.desired_count + }, + { + label = "Critical Threshold (50%)" + value = ceil(var.desired_count * 0.5) + } + ] + } + } + width = 12 + height = 6 + x = 0 + y = 0 + }, + + # ECS Service CPU and Memory + { + type = "metric" + properties = { + metrics = [ + ["ECS/ContainerInsights", "CpuUtilized", "ClusterName", var.create_ecs_cluster ? aws_ecs_cluster.github-runner[0].name : data.aws_ecs_cluster.github-runner[0].cluster_name, "ServiceName", var.repo_org], + [".", "MemoryUtilized", ".", ".", ".", ".", { yAxis = "right" }] + ] + title = "Runner Resource Utilization" + region = data.aws_region.current.name + yAxis = { + left = { label = "CPU (vCPU)", min = 0 } + right = { label = "Memory (MB)", min = 0 } + } + period = 300 + } + width = 12 + height = 6 + x = 12 + y = 0 + }, + + # Alarm Status + { + type = "alarm" + properties = { + title = "Critical Alarms Status" + alarms = [ + aws_cloudwatch_metric_alarm.runners_critical.arn, + aws_cloudwatch_metric_alarm.runners_emergency.arn + ] + } + width = 8 + height = 4 + x = 0 + y = 6 + }, + + # Recent Events Log + { + type = "log" + properties = { + query = <<-EOQ + SOURCE '/ecs-ghe-runners/${var.repo_org}-${data.aws_caller_identity.current.account_id}-${data.aws_region.current.name}' + | fields @timestamp, @message + | filter @message like /error|fail|exception/i + | sort @timestamp desc + | limit 20 + EOQ + region = data.aws_region.current.name + title = "Recent Error Events" + } + width = 16 + height = 4 + x = 8 + y = 6 + } + ] + }) +} + +# ============================================================================== +# Outputs +# ============================================================================== + +output "sns_topic_arn" { + description = "ARN of the SNS topic for critical alerts" + value = aws_sns_topic.github_runner_critical_alerts.arn +} + +output "sns_topic_name" { + description = "Name of the SNS topic for critical alerts" + value = aws_sns_topic.github_runner_critical_alerts.name +} + +output "dashboard_name" { + description = "Name of the CloudWatch dashboard" + value = aws_cloudwatch_dashboard.github_runners.dashboard_name +} + +output "dashboard_url" { + description = "URL to access the CloudWatch dashboard" + value = "https://console.aws.amazon.com/cloudwatch/home?region=${data.aws_region.current.name}#dashboards:name=${aws_cloudwatch_dashboard.github_runners.dashboard_name}" +} + +output "alert_email" { + description = "Email address receiving alerts" + value = var.alert_email + sensitive = true +} diff --git a/providers.tf b/providers.tf index 694938d..023b890 100644 --- a/providers.tf +++ b/providers.tf @@ -11,20 +11,16 @@ terraform { } } -# Generate GitHub App token for authentication -data "github_app_token" "app" { - app_id = var.github_app_id - installation_id = var.github_app_installation_id - pem_file = var.github_app_pem_file -} - +# GitHub provider will use GITHUB_TOKEN environment variable provider "github" { - organization = var.repo_org - base_url = var.base_url - token = data.github_app_token.app.token + owner = var.repo_org + base_url = var.base_url + # token is automatically read from GITHUB_TOKEN env var } provider "aws" { + region = "us-gov-west-1" + default_tags { tags = { finops_project_name = "csvd_github_actions" @@ -33,4 +29,4 @@ provider "aws" { organization = "census:ocio:csvd" } } -}x \ No newline at end of file +} \ No newline at end of file diff --git a/terraform_data_dirs/csvd/environment b/terraform_data_dirs/csvd/environment deleted file mode 100644 index 58bcd92..0000000 --- a/terraform_data_dirs/csvd/environment +++ /dev/null @@ -1 +0,0 @@ -csvd \ No newline at end of file diff --git a/terraform_data_dirs/csvd/modules/ecr-clone b/terraform_data_dirs/csvd/modules/ecr-clone deleted file mode 160000 index 8fa1857..0000000 --- a/terraform_data_dirs/csvd/modules/ecr-clone +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 8fa1857eb18dcd1a79243743cbecca95b5b06b68 diff --git a/terraform_data_dirs/csvd/modules/github-runner b/terraform_data_dirs/csvd/modules/github-runner deleted file mode 160000 index 88edaff..0000000 --- a/terraform_data_dirs/csvd/modules/github-runner +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 88edaff4267e5d8e43f42e22124154613e79477b diff --git a/terraform_data_dirs/csvd/modules/modules.json b/terraform_data_dirs/csvd/modules/modules.json deleted file mode 100644 index d52bb4c..0000000 --- a/terraform_data_dirs/csvd/modules/modules.json +++ /dev/null @@ -1 +0,0 @@ -{"Modules":[{"Key":"","Source":"","Dir":"."},{"Key":"ecr-clone","Source":"registry.terraform.io/HappyPathway/ecr-clone/aws","Version":"0.0.30","Dir":"/data/terraform/workspaces/arnol377/git/ghe-runner/terraform_data_dirs/csvd/modules/ecr-clone"},{"Key":"github-runner","Source":"registry.terraform.io/HappyPathway/github-runner/ecs","Version":"0.0.92","Dir":"/data/terraform/workspaces/arnol377/git/ghe-runner/terraform_data_dirs/csvd/modules/github-runner"}]} \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64 deleted file mode 120000 index 40fb43e..0000000 --- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64 deleted file mode 120000 index 33544c3..0000000 --- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64 deleted file mode 120000 index a6fbdd6..0000000 --- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64 deleted file mode 120000 index d61a361..0000000 --- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64 deleted file mode 120000 index 095d815..0000000 --- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64 deleted file mode 120000 index fe28aef..0000000 --- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64 deleted file mode 120000 index 75282e6..0000000 --- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64 deleted file mode 120000 index 494ac1e..0000000 --- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64 deleted file mode 120000 index f8eee1f..0000000 --- a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/5.45.0/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/5.45.0/linux_amd64 deleted file mode 120000 index 15c0b66..0000000 --- a/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/5.45.0/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/integrations/github/5.45.0/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/6.6.0/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/6.6.0/linux_amd64 deleted file mode 120000 index 26dfde5..0000000 --- a/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/6.6.0/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/integrations/github/6.6.0/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/sct-engineering/modules/github-runner b/terraform_data_dirs/sct-engineering/modules/github-runner deleted file mode 160000 index 88edaff..0000000 --- a/terraform_data_dirs/sct-engineering/modules/github-runner +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 88edaff4267e5d8e43f42e22124154613e79477b diff --git a/terraform_data_dirs/sct-engineering/modules/modules.json b/terraform_data_dirs/sct-engineering/modules/modules.json deleted file mode 100644 index 31f0156..0000000 --- a/terraform_data_dirs/sct-engineering/modules/modules.json +++ /dev/null @@ -1 +0,0 @@ -{"Modules":[{"Key":"","Source":"","Dir":"."},{"Key":"github-runner","Source":"registry.terraform.io/HappyPathway/github-runner/ecs","Version":"0.0.92","Dir":"/data/terraform/workspaces/arnol377/git/ghe-runner/terraform_data_dirs/sct-engineering/modules/github-runner"}]} \ No newline at end of file diff --git a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64 b/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64 deleted file mode 120000 index 40fb43e..0000000 --- a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/aws/5.70.0/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64 b/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64 deleted file mode 120000 index 33544c3..0000000 --- a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/dns/3.4.2/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64 b/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64 deleted file mode 120000 index d61a361..0000000 --- a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/github/6.3.1/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/local/2.5.2/linux_amd64 b/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/local/2.5.2/linux_amd64 deleted file mode 120000 index 9e2ab54..0000000 --- a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/local/2.5.2/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/local/2.5.2/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64 b/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64 deleted file mode 120000 index fe28aef..0000000 --- a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/null/3.2.3/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64 b/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64 deleted file mode 120000 index 494ac1e..0000000 --- a/terraform_data_dirs/sct-engineering/providers/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64 +++ /dev/null @@ -1 +0,0 @@ -/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/random/3.6.3/linux_amd64 \ No newline at end of file diff --git a/variables.tf b/variables.tf index 3ac4317..7f2c896 100644 --- a/variables.tf +++ b/variables.tf @@ -8,6 +8,35 @@ variable "ecs_cluster_name" { } } +variable "github_token" { + description = <<-EOT + GitHub Personal Access Token for authentication. + + This token is used for: + - GitHub provider authentication (set via GITHUB_TOKEN env var) + - Lambda function to refresh runner registration tokens + + Required Scopes: + - admin:org (for managing runner registration tokens) + - repo (for accessing repositories) + + SECURITY WARNING: + - NEVER commit this token to version control + - Set via environment variable: export GITHUB_TOKEN="your-token" + - The token will be stored in Lambda environment variables + + Note: This is a simpler but less secure alternative to GitHub App authentication. + Consider using GitHub App for production environments. + EOT + type = string + sensitive = true + + validation { + condition = length(var.github_token) > 20 + error_message = "GitHub token must be at least 20 characters" + } +} + variable "repo_org" { description = "The GitHub organization" type = string @@ -118,11 +147,14 @@ variable "github_app_id" { Note: Different organizations may have different GitHub App IDs. Set this value in workspace-specific .tfvars files. + + If not provided, GITHUB_TOKEN environment variable will be used instead. EOT type = string + default = null validation { - condition = can(regex("^[0-9]+$", var.github_app_id)) + condition = var.github_app_id == null || can(regex("^[0-9]+$", var.github_app_id)) error_message = "GitHub App ID must be a numeric string (e.g., '123456')" } } @@ -142,11 +174,14 @@ variable "github_app_installation_id" { Note: This value is organization-specific. Set this value in workspace-specific .tfvars files. + + If not provided, GITHUB_TOKEN environment variable will be used instead. EOT type = string + default = null validation { - condition = can(regex("^[0-9]+$", var.github_app_installation_id)) + condition = var.github_app_installation_id == null || can(regex("^[0-9]+$", var.github_app_installation_id)) error_message = "GitHub App Installation ID must be a numeric string (e.g., '12345678')" } } @@ -173,11 +208,25 @@ variable "github_app_pem_file" { - Or in .tfvars: github_app_pem_file = "/path/to/private-key.pem" The PEM file should be accessible from where Terraform runs. + + If not provided, GITHUB_TOKEN environment variable will be used instead. EOT type = string + default = null validation { - condition = can(regex("\\.pem$", var.github_app_pem_file)) + condition = var.github_app_pem_file == null || can(regex("\\.pem$", var.github_app_pem_file)) error_message = "GitHub App PEM file path must end with .pem" } } + +# Monitoring Configuration +variable "alert_email" { + description = "Email address to receive CloudWatch alarm notifications for runner and Lambda failures" + type = string + + validation { + condition = can(regex("^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$", var.alert_email)) + error_message = "Must be a valid email address" + } +}