diff --git a/README.md b/README.md index 05d96ad..9b34c32 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Infrastructure as Code (Terraform) for deploying self-hosted GitHub Actions runn This repository manages the deployment of **organization-level self-hosted GitHub Actions runners** using AWS ECS Fargate. Runners are deployed per AWS account and automatically register with your GitHub Enterprise organization, providing secure, scalable, and cost-effective CI/CD execution environments. -**Runner Model**: Runners are **persistent, long-running containers** that stay active 24/7, continuously polling GitHub for jobs. They are not ephemeral - the same runner handles multiple workflow jobs over its lifetime. +**Runner Model**: Runners are **persistent, long-running containers** that stay active 24/7, continuously polling GitHub for jobs. They are not ephemeral — the same runner handles multiple workflow jobs over its lifetime. ### Key Features @@ -16,8 +16,9 @@ This repository manages the deployment of **organization-level self-hosted GitHu - **Automatic IAM Authentication**: ECS Task Roles provide seamless AWS access - **Multi-Label Support**: Runners tagged with account ID, name, region, and more - **Proxy-Enabled**: Pre-configured for enterprise proxy environments -- **CloudWatch Integration**: Centralized logging and monitoring -- **Automated Token Refresh**: Lambda keeps registration tokens fresh for task restarts +- **CloudWatch Integration**: Centralized logging, monitoring, dashboard, and SNS alerting +- **Automated Token Refresh**: Lambda (optional) keeps registration tokens fresh for task restarts +- **ECR Mirroring**: Mirrors public runner image into private ECR by default (`enable_ecr_clone = true`) - **Scalable**: Adjust runner count based on workload demands ## Architecture @@ -34,27 +35,33 @@ Runners are **persistent, long-running containers** that operate continuously: ``` GitHub Enterprise (github.e.it.census.gov) - │ - │ (Token 
Authentication) - ▼ + | + | (Token Authentication via GITHUB_TOKEN) + v ECS Cluster (per account/region) - ecs-ghe-runners-{region} - │ - ├── Fargate Task (Persistent Runner 1) - │ ├── Container: github-runner:{version} - │ ├── Lifecycle: Long-running (24/7) - │ ├── Registration: One-time at startup - │ ├── IAM Task Role (AWS Auth) - │ ├── Labels: Account ID, Name, Region - │ └── Logs → CloudWatch - │ - ├── Fargate Task (Persistent Runner 2) - └── Fargate Task (Persistent Runner N) - - Lambda Token Refresh (Every 30 min) - │ - └──> AWS Secrets Manager - (Keeps registration token fresh for task restarts) + {ecs_cluster_name}-{region} e.g. ecs-ghe-runners-us-gov-west-1 + | + +-- Fargate Task (Persistent Runner 1) + | +-- Image: {account_id}.dkr.ecr.{region}.amazonaws.com/github-runners/github-runner:{version} + | +-- Lifecycle: Long-running (24/7) + | +-- Registration: One-time at startup (token from Secrets Manager) + | +-- IAM Task Role (AWS Auth for workflows) + | +-- Labels: Account ID, Name, Region, self-hosted, etc. 
+ | +-- Logs -> CloudWatch + | + +-- Fargate Task (Persistent Runner 2) + +-- Fargate Task (Persistent Runner N) + + Lambda Token Refresh (every 30 min, when enable_lambda_token_refresh = true) + | + +---> AWS Secrets Manager + (Keeps registration token fresh for ECS task restarts) + + Monitoring (always deployed) + +-- SNS Topic: github-runner-critical-alerts-{account_id}-{region} + +-- CloudWatch Alarm: runners < 50% capacity (critical) + +-- CloudWatch Alarm: all runners down (emergency) + +-- CloudWatch Dashboard: github-runners-{aws_account} ``` **Network Architecture:** @@ -72,41 +79,15 @@ You need an AWS account with appropriate IAM permissions to deploy the infrastru **Terraform Deployment Permissions:** - **IAM**: Create and manage IAM roles, policies, and policy attachments - - `iam:CreateRole`, `iam:CreatePolicy`, `iam:AttachRolePolicy`, `iam:PutRolePolicy` - - `iam:GetRole`, `iam:GetPolicy`, `iam:ListAttachedRolePolicies` - - `iam:DeleteRole`, `iam:DeletePolicy`, `iam:DetachRolePolicy`, `iam:DeleteRolePolicy` - - `iam:TagRole`, `iam:UntagRole` - **ECS**: Create and manage ECS clusters, services, and task definitions - - `ecs:CreateCluster`, `ecs:CreateService`, `ecs:RegisterTaskDefinition` - - `ecs:DescribeCluster`, `ecs:DescribeServices`, `ecs:DescribeTaskDefinition` - - `ecs:UpdateService`, `ecs:DeleteService`, `ecs:DeregisterTaskDefinition` - - `ecs:PutClusterCapacityProviders` - - `ecs:TagResource`, `ecs:UntagResource` -- **CloudWatch Logs**: Create and manage log groups - - `logs:CreateLogGroup`, `logs:DescribeLogGroups`, `logs:DeleteLogGroup` - - `logs:PutRetentionPolicy`, `logs:TagLogGroup` +- **CloudWatch Logs**: Create and manage log groups and dashboards - **Secrets Manager**: Create and manage secrets for GitHub tokens - - `secretsmanager:CreateSecret`, `secretsmanager:DescribeSecret` - - `secretsmanager:PutSecretValue`, `secretsmanager:GetSecretValue` - - `secretsmanager:DeleteSecret`, `secretsmanager:TagResource` - **EC2 (VPC)**: Query VPC 
resources and optionally create VPC endpoints - - `ec2:DescribeVpcs`, `ec2:DescribeSubnets`, `ec2:DescribeSecurityGroups` - - `ec2:CreateVpcEndpoint`, `ec2:DescribeVpcEndpoints`, `ec2:DeleteVpcEndpoint` (if using VPC endpoints) - - `ec2:ModifyVpcEndpoint` -- **ECR**: Pull container images (if using ECR mirroring) - - `ecr:GetAuthorizationToken`, `ecr:BatchCheckLayerAvailability` - - `ecr:GetDownloadUrlForLayer`, `ecr:BatchGetImage` - - `ecr:CreateRepository`, `ecr:PutImage` (if enabling ECR clone) -- **Lambda**: Deploy token refresh Lambda function (optional, for automated token refresh) - - `lambda:CreateFunction`, `lambda:GetFunction`, `lambda:UpdateFunctionCode` - - `lambda:DeleteFunction`, `lambda:AddPermission`, `lambda:RemovePermission` - - `lambda:TagResource` -- **EventBridge**: Create scheduled events for Lambda (optional) - - `events:PutRule`, `events:DescribeRule`, `events:DeleteRule` - - `events:PutTargets`, `events:RemoveTargets` +- **ECR**: Pull and push container images (ECR mirroring is enabled by default) +- **SNS**: Create topics and email subscriptions for alerting +- **Lambda**: Deploy token refresh Lambda function (when `enable_lambda_token_refresh = true`) +- **EventBridge**: Create scheduled events for Lambda token refresh - **S3/DynamoDB**: Access Terraform state backend - - `s3:GetObject`, `s3:PutObject`, `s3:ListBucket` (for state bucket) - - `dynamodb:GetItem`, `dynamodb:PutItem`, `dynamodb:DeleteItem` (for state locking) See **[AWS_PERMISSIONS.md](./AWS_PERMISSIONS.md)** for detailed permission policies and example IAM policy documents. 
@@ -120,15 +101,21 @@ See **[AWS_PERMISSIONS.md](./AWS_PERMISSIONS.md)** for detailed permission polic ### GitHub Requirements - GitHub Enterprise organization admin access -- **GitHub App** configured for runner authentication - - See **[GITHUB_APP_SETUP.md](./GITHUB_APP_SETUP.md)** for complete setup instructions - - Required: App ID, Installation ID, and Private Key (PEM file) - - Permissions: Repository Administration (R/W), Actions (R/W), Self-hosted Runners (R/W) +- **GitHub Personal Access Token (PAT)** — the primary authentication method + - Set via environment variable: `export GITHUB_TOKEN="your-token"` + - Required scopes: `admin:org`, `repo` + - Used by the Terraform GitHub provider and, when enabled, the Lambda token refresh function + - **Never commit this token to version control or store it in `.tfvars` files** +- (Optional) **GitHub App** authentication — see **[GITHUB_APP_SETUP.md](./GITHUB_APP_SETUP.md)** + - Variables `github_app_id`, `github_app_installation_id`, and `github_app_pem_file` all default to `null` ### Tools Required - Terraform >= 1.0 + - **Local shell**: use the [`tf` script](https://github.e.it.census.gov/gist/arnol377/21b70dd6790d2680a119a9f86369eced). It wraps `terraform` and automatically sets `TF_DATA_DIR`, `TF_CLI_ARGS_*` var-file injection, and loads workspace-specific JSON env files — all things the plain `terraform` binary does not do on its own. + - **CodeBuild**: uses native `terraform` commands. The buildspec replicates the `tf` script's behavior explicitly via environment variables (see [Automation (CodeBuild)](#automation-codebuild)). - AWS CLI configured -- Access to organization's Terraform state backend +- Python 3 + pip (required for Lambda package build when `enable_lambda_token_refresh = true`) +- Access to the organization's Terraform state backend ## Quick Start @@ -139,144 +126,283 @@ git clone git@github.e.it.census.gov:CSVD/ghe-runners.git cd ghe-runner ``` -### 2. Configure Variables +### 2. 
Export Required Environment Variables -Create or update your workspace-specific `.tfvars` file (e.g., `csvd-229685449397-us-gov-east-1.auto.tfvars`): +```bash +# Required: GitHub PAT used by Terraform GitHub provider and Lambda token refresh +export GITHUB_TOKEN="ghp_your_personal_access_token" -```hcl -# GitHub App Authentication (Required - see GITHUB_APP_SETUP.md) -github_app_id = "123456" # GitHub App ID -github_app_installation_id = "12345678" # Installation ID for your org -github_app_pem_file = "~/.github-apps/runner.pem" # Path to private key - -# GitHub Configuration -repo_org = "CSVD" # Your GitHub organization -server_url = "https://github.e.it.census.gov" # GitHub Enterprise URL - -# AWS Configuration -aws_account = "csvd-dev-ew" # Account identifier -vpc_id = "vpc-0abc123def456" # VPC ID -subnets = ["subnet-abc123", "subnet-def456"] # Private subnet IDs -security_groups = ["sg-xyz789"] # Security group IDs - -# Runner Configuration -image_name = "github-runner" # Container image name -image_version = "2.311.0" # Runner version -desired_count = 3 # Number of runners - -# Optional: VPC Endpoints -create_vpc_endpoint = false # Set to true if needed +# AWS credentials (refresh via awscreds if expired) +awscreds ``` -**Important:** GitHub App credentials are organization-specific. See [GITHUB_APP_SETUP.md](./GITHUB_APP_SETUP.md) for setup instructions. +### 3. 
Configure Variables + +Edit `default.auto.tfvars` or create a workspace-specific `.auto.tfvars` file: + +```hcl +# GitHub Configuration (token is set via GITHUB_TOKEN env var — not here) +repo_org = "SCT-Engineering" # Your GitHub organization +server_url = "https://github.e.it.census.gov" # GitHub Enterprise URL + +# ECS Cluster +ecs_cluster_name = "ecs-ghe-runners" # Base name; region is appended automatically +create_ecs_cluster = true # Set false to use an existing cluster + +# AWS +aws_account = "csvd-dev-ew" # Account identifier (also used as runner label) +namespace = "csvd" # Namespace for resource naming +vpc_id = "vpc-0abc123def456" # VPC ID +subnets = ["subnet-abc123", "subnet-def456"] # Private subnet IDs +security_groups = ["sg-xyz789"] # Security group IDs + +# Runner image (mirrored from public.ecr.aws/h1g9x7n8/ into private ECR) +image_name = "github-runner" +image_version = "1.67.0" +enable_ecr_clone = true # default: true — mirrors image to private ECR at apply time + +# Runner count +desired_count = 1 -### 3. Initialize Terraform +# Monitoring (required) +alert_email = "your.email@census.gov" # Receives CloudWatch alarm SNS notifications + +# Optional: automated token refresh Lambda +enable_lambda_token_refresh = false # Set true to prevent token-expiry deadlocks +``` + +### 4. Initialize Terraform ```bash -terraform init +tf init ``` -### 4. Create or Select Workspace +### 5. Create or Select Workspace ```bash # Create new workspace -terraform workspace new 229685449397-us-gov-east-1 +tf workspace new csvd-dev-ew # Or select existing workspace -terraform workspace select 229685449397-us-gov-east-1 +tf workspace select default ``` -### 5. Deploy Runners +### 6. Deploy Runners ```bash # Review planned changes -terraform plan +tf plan # Apply configuration -terraform apply +tf apply ``` -### 6. 
Verify Deployment +> **Important:** Always use the [`tf` script](https://github.e.it.census.gov/gist/arnol377/21b70dd6790d2680a119a9f86369eced) instead of `terraform` directly in your local shell. It sets `TF_DATA_DIR`, injects the workspace-specific var-file via `TF_CLI_ARGS_*`, and loads env vars from the JSON file — plain `terraform` will not do this automatically. + +### 7. Confirm SNS Email Subscription + +After the first deployment, check your inbox for an AWS SNS subscription confirmation email and click **Confirm subscription** to activate alerting. + +### 8. Verify Deployment **Check ECS:** ```bash -aws ecs list-tasks --cluster ecs-ghe-runners-us-gov-east-1 +aws ecs list-tasks --cluster ecs-ghe-runners-us-gov-west-1 ``` **Check GitHub:** -1. Navigate to `https://github.e.it.census.gov/organizations/CSVD/settings/actions/runners` +1. Navigate to `https://github.e.it.census.gov/organizations/SCT-Engineering/settings/actions/runners` 2. Verify runners appear with status "Idle" 3. Confirm runner labels match configuration +## Automation (CodeBuild) + +The `codebuild/` directory contains a Terraform module that deploys a scheduled CodeBuild project (`ghe-runner-daily-{workspace}`) that runs `tf apply` daily. The buildspec downloads the [`tf` script](https://github.e.it.census.gov/gist/arnol377/21b70dd6790d2680a119a9f86369eced) from the team gist during the `install` phase and uses it throughout, so all workspace-aware behavior (`TF_DATA_DIR`, var-file injection, JSON env loading) is handled automatically — no manual replication needed. 
+ +### How Workspace Selection Works in CodeBuild + +The `tf` script is downloaded once at install time and used natively: + +| Responsibility | How it's handled | +|----------------|------------------| +| `TF_DATA_DIR`, `TF_CLI_ARGS_*`, JSON env vars | Downloaded [`tf` script](https://github.e.it.census.gov/gist/arnol377/21b70dd6790d2680a119a9f86369eced) handles everything automatically | +| Target workspace | `TF_WORKSPACE` env var injected by CodeBuild at build time | +| Backend config | `pre_build` selects `backend-configs/${TF_WORKSPACE}.tf` if present | +| GitHub PAT | `GITHUB_TOKEN` (from Secrets Manager) exported as `TF_VAR_github_token` | + +The `tf` binary resolver falls back to `terraform` in `PATH`, which is the binary installed earlier in `install` — no extra configuration required. + +### CodeBuild Job Phases + +| Phase | What happens | +|-------|--------------| +| `install` | Downloads Terraform `${TF_VERSION}`; downloads `tf` script from gist; installs `requests` | +| `pre_build` | Exports `TF_VAR_github_token`; runs `tf init` (with optional backend-config); runs `tf workspace select` | +| `build` | `tf apply -auto-approve`; invokes Lambda token refresh (so token is fresh before containers start); forces new ECS deployment | +| `post_build` | Logs completion timestamp and active workspace | + +### Deploying the CodeBuild Module + +The CodeBuild infrastructure lives in `codebuild/` and is managed separately from the runner infrastructure. 
+ +**Step 1: Add workspace files** (if targeting a new workspace) + +``` +varfiles/{workspace}.tfvars # required — same vars as for local tf apply +varfiles/{workspace}.json # optional — extra env vars loaded by tf script +backend-configs/{workspace}.tf # optional — workspace-specific backend config +``` + +**Step 2: Deploy the CodeBuild project** + +```bash +cd codebuild/ +tf init +tf workspace select -or-create default # CodeBuild itself uses the local tf alias +tf apply -var tf_workspace=default \ + -var github_token_secret_arn=arn:aws-us-gov:secretsmanager:... +``` + +**Step 3: Verify** + +```bash +# Trigger a manual build to test +aws codebuild start-build \ + --project-name ghe-runner-daily-default \ + --environment-variables-override name=TF_WORKSPACE,value=default,type=PLAINTEXT +``` + +### Targeting a Different Workspace Without Redeploying + +You can override `TF_WORKSPACE` at build invocation time without changing the CodeBuild project's default: + +```bash +aws codebuild start-build \ + --project-name ghe-runner-daily-default \ + --environment-variables-override name=TF_WORKSPACE,value=sct-engineering,type=PLAINTEXT +``` + +This works because the buildspec reads `$TF_WORKSPACE` at runtime — `terraform workspace select -or-create "$TF_WORKSPACE"` will create the workspace if it does not yet exist. + +### GitHub PAT in CodeBuild + +The PAT is stored in AWS Secrets Manager and injected as `GITHUB_TOKEN` by CodeBuild at build start. The buildspec then maps it to `TF_VAR_github_token` so the Terraform `github_token` variable is populated without ever writing the token to disk or a var-file: + +```yaml +- export TF_VAR_github_token="$GITHUB_TOKEN" +``` + +The Terraform GitHub provider reads `GITHUB_TOKEN` automatically; the Lambda variable (`var.github_token`) is satisfied by `TF_VAR_github_token`. 
+ ## Configuration Reference ### Required Variables | Variable | Description | Example | |----------|-------------|---------| -| `github_app_id` | GitHub App ID (see GITHUB_APP_SETUP.md) | `"123456"` | -| `github_app_installation_id` | App Installation ID for organization | `"12345678"` | -| `github_app_pem_file` | Path to GitHub App private key | `"~/.github-apps/key.pem"` | -| `repo_org` | GitHub organization name | `"CSVD"` | -| `aws_account` | AWS account identifier | `"csvd-dev-ew"` | -| `server_url` | GitHub Enterprise Server URL | `"https://github.e.it.census.gov"` | -| `vpc_id` | VPC ID for runner deployment | `"vpc-0abc123"` | -| `subnets` | List of subnet IDs (private subnets) | `["subnet-abc", "subnet-def"]` | -| `security_groups` | Security group IDs for runners | `["sg-xyz789"]` | +| `ecs_cluster_name` | Base ECS cluster name (region is appended automatically) | `"ecs-ghe-runners"` | +| `repo_org` | GitHub organization name | `"SCT-Engineering"` | +| `namespace` | Namespace for resource naming | `"csvd"` | +| `aws_account` | AWS account identifier (also used as a runner label) | `"csvd-dev-ew"` | +| `vpc_id` | VPC ID for runner and Lambda deployment | `"vpc-0abc123"` | | `image_name` | GitHub runner container image name | `"github-runner"` | -| `image_version` | Container image version tag | `"2.311.0"` | +| `image_version` | Container image version tag | `"1.67.0"` | +| `alert_email` | Email for CloudWatch alarm notifications | `"team@census.gov"` | + +**Authentication — set via environment variable, not in `.tfvars`:** + +| Variable | Description | How to Set | +|----------|-------------|------------| +| `github_token` | GitHub PAT for Terraform provider and Lambda token refresh | `export GITHUB_TOKEN="ghp_..."` | ### Optional Variables | Variable | Description | Default | |----------|-------------|---------| -| `desired_count` | Number of runner instances | `2` | -| `create_vpc_endpoint` | Create VPC endpoints for AWS services | `false` | -| 
`create_ecs_cluster` | Create new ECS cluster | `false` | -| `assign_public_ip` | Assign public IP to runners | `false` | -| `log_retention_days` | CloudWatch log retention period | `90` | - -### Environment Variables (Set in Task Definition) +| `subnets` | List of private subnet IDs for runners and Lambda | `[]` | +| `security_groups` | Security group IDs for runners and Lambda | `[]` | +| `desired_count` | Number of runner instances | `3` | +| `create_ecs_cluster` | Create a new ECS cluster (false = use existing) | `false` | +| `assign_public_ip` | Assign public IP to runner tasks | `false` | +| `create_vpc_endpoint` | Create Interface VPC endpoints for AWS services | `false` | +| `enable_ecr_clone` | Mirror runner image from public ECR to private ECR | `true` | +| `enable_lambda_token_refresh` | Deploy Lambda for automatic 30-min token refresh | `false` | +| `server_url` | GitHub Enterprise URL for runner registration | `""` | +| `base_url` | GitHub base URL for Terraform provider | `"https://github.e.it.census.gov"` | +| `cluster_size` | ECS cluster capacity setting | `1` | +| `certs` | S3 `{ bucket, key }` for CA certificate to inject into runners | `null` | +| `github_app_id` | GitHub App ID (optional alternative to PAT) | `null` | +| `github_app_installation_id` | GitHub App Installation ID | `null` | +| `github_app_pem_file` | Path to GitHub App private key `.pem` | `null` | + +### Environment Variables (Set in ECS Task Definition) ```hcl HTTP_PROXY = "http://proxy.tco.census.gov:3128" HTTPS_PROXY = "http://proxy.tco.census.gov:3128" -NO_PROXY = "169.254.170.2,.census.gov,169.254.169.254,148.129.0.0/16,10.0.0.0/8,172.16.0/12,.eks.amazonaws.com,.s3.amazonaws.com,.amazonaws.com" +NO_PROXY = "169.254.170.2,.census.gov,169.254.169.254,148.129.0.0/16,10.0.0.0/8,172.16.0.0/12,.eks.amazonaws.com,.s3.amazonaws.com,.amazonaws.com,.gcr.io,.pkg.dev" ``` -## Runner Labels +> **Note:** `NO_PROXY` is built dynamically at plan time.
GitHub Enterprise IP addresses are resolved via DNS (`data.dns_a_record_set.github`) and included automatically. + +## Runner Image -Each runner is automatically configured with multiple labels for flexible workflow targeting: +By default (`enable_ecr_clone = true`), the runner image is mirrored from the public ECR registry into your account's private ECR during `tf apply` using the `HappyPathway/ecr-clone/aws` Terraform module: -| Label Type | Format | Example | Usage | -|------------|--------|---------|-------| -| Account ID | `{account_id}` | `229685449397` | `runs-on: ["229685449397"]` | -| Account Name | `{account_name}` | `csvd-dev-ew` | `runs-on: ["csvd-dev-ew"]` | -| Account-Region | `{account_id}-{region}` | `229685449397-us-gov-east-1` | `runs-on: ["229685449397-us-gov-east-1"]` | -| Region | `{region}` | `us-gov-east-1` | `runs-on: ["us-gov-east-1"]` | -| Organization | `{org_name}` | `CSVD` | `runs-on: ["CSVD"]` | -| Runner Type | Fixed | `ecs-github-runner` | `runs-on: ["ecs-github-runner"]` | -| Compatibility | Fixed | `ubuntu-latest` | `runs-on: ["ubuntu-latest"]` | +| Setting | Value | +|---------|-------| +| Source registry | `public.ecr.aws` | +| Source image | `h1g9x7n8/github-runner:{version}` | +| Private ECR repository | `github-runners/github-runner` | +| Image URI (ECR clone enabled) | `{account_id}.dkr.ecr.{region}.amazonaws.com/github-runners/github-runner:{version}` | +| Image URI (ECR clone disabled) | `public.ecr.aws/h1g9x7n8/github-runner:{version}` | + +When you update `image_version`, the new image is automatically mirrored to private ECR during the next `tf apply`. 
+ +## Runner Labels + +Each runner is automatically configured with the following labels: + +| Label | Source | Example | +|-------|--------|---------| +| `self-hosted` | Fixed | `self-hosted` | +| `ecs` | Fixed | `ecs` | +| `github-runner` | Fixed | `github-runner` | +| `{aws_account}` | `var.aws_account` | `csvd-dev-ew` | +| `{org_name_lowercase}` | `lower(var.repo_org)` | `sct-engineering` | +| `{account_id}` | AWS caller identity | `229685449397` | +| `{account_id}-{region}` | AWS caller identity + region | `229685449397-us-gov-west-1` | +| `{region}` | AWS region | `us-gov-west-1` | +| `ecs-github-runner` | Fixed | `ecs-github-runner` | +| `ubuntu-latest` | Fixed (compatibility) | `ubuntu-latest` | **Most Common Usage:** ```yaml jobs: build: - runs-on: ["229685449397"] # Target specific AWS account by ID + runs-on: ["self-hosted", "229685449397"] # Target specific AWS account by ID ``` +## Runner Group + +The runner group is automatically created in GitHub and named after the AWS **account ID** (e.g., `229685449397`). 
Group configuration: + +- `allows_public_repositories = true` +- `restricted_to_workflows = false` +- `visibility = "all"` + ## Network Configuration ### Proxy Settings -Runners are pre-configured to use the corporate proxy for internet access: +Runners are pre-configured to route traffic through the corporate proxy: - **HTTP/HTTPS Proxy**: `http://proxy.tco.census.gov:3128` -- **No Proxy**: Internal Census Bureau networks and AWS services +- **No Proxy**: Internal Census Bureau networks, AWS services, and GitHub Enterprise IPs (resolved dynamically at plan time) ### VPC Endpoints (Optional) -When `create_vpc_endpoint = true`, the following endpoints are created: +When `create_vpc_endpoint = true`, the following Interface endpoints are created in the runner VPC: - **ECR API**: `com.amazonaws.{region}.ecr.api` - **ECR Docker**: `com.amazonaws.{region}.ecr.dkr` @@ -285,774 +411,312 @@ When `create_vpc_endpoint = true`, the following endpoints are created: **Benefits:** - Reduced data transfer costs -- Improved security (no internet egress required) -- Faster access to AWS services +- Improved security (no internet egress required for AWS services) +- Faster access to AWS services from runner tasks ## IAM Permissions -The runner infrastructure uses two distinct IAM roles with different purposes: +The runner infrastructure uses two distinct IAM roles: ### ECS Task Role -The **task role** is assumed by the running container and provides the GitHub Actions runner with permissions to interact with AWS services during workflow execution. This is what your workflows use to deploy infrastructure, access S3, etc. - -**Purpose:** Grants permissions for GitHub Actions workflows to manage AWS resources - -**Role Name Pattern:** `{hostname}-task-role` (e.g., `CSVD-task-role`) +Assumed by the running container — grants GitHub Actions workflows AWS permissions. 
-**Permissions Provided:** -This role is configured via the `github_runner_permissions_arn` variable, which by default points to `${var.repo_org}-admin` policy. +**Role Name Pattern:** `{repo_org}-task-role` (e.g., `SCT-Engineering-task-role`) -**Default Policy** (`iam_policy/admin.json`): +**Default Policy** (`iam_policy/admin.json`) — full admin, configured via `github_runner_permissions_arn`: ```json { - "Statement": [ - { - "Action": "*", - "Effect": "Allow", - "Resource": "*" - } - ], + "Statement": [{ "Action": "*", "Effect": "Allow", "Resource": "*" }], "Version": "2012-10-17" } ``` -**Additional Permissions Automatically Attached:** -- **Secrets Manager**: Read GitHub registration token - ```json - { - "Effect": "Allow", - "Action": "secretsmanager:GetSecretValue", - "Resource": "arn:aws:secretsmanager:*:*:secret:/github-runners/{namespace}/*" - } - ``` -- **S3**: Access certificates (if `certs` variable is configured) - ```json - { - "Effect": "Allow", - "Action": ["s3:GetObject", "s3:ListBucket"], - "Resource": "arn:aws:s3:::{certs_bucket}/*" - } - ``` +**Automatically Attached Permissions:** +- **Secrets Manager**: Read GitHub registration token from `/github-runners/{namespace}/*` +- **S3**: Access CA certificate (when `certs` variable is configured) **Customization:** -To use a custom policy instead of the default admin policy, create your own IAM policy and reference it in your `.tfvars`: +To use a least-privilege policy instead of the default admin policy: ```hcl github_runner_permissions_arn = "arn:aws:iam::{account_id}:policy/custom-runner-policy" ``` -**Recommended Permissions for Terraform Workflows:** -- S3: Read/write Terraform state (`s3:GetObject`, `s3:PutObject`, `s3:ListBucket`) -- DynamoDB: State locking (`dynamodb:GetItem`, `dynamodb:PutItem`, `dynamodb:DeleteItem`) -- Service-specific permissions based on what your Terraform code manages (EC2, RDS, VPC, etc.) 
- -**Security Best Practice:** -Replace the wildcard admin policy with least-privilege permissions based on your specific workflow requirements. See **[AWS_PERMISSIONS.md](./AWS_PERMISSIONS.md)** for example policies. - -**Important Security Note:** -Infrastructure-as-Code workflows require broad AWS permissions to manage diverse resource types. The admin policy is intentionally permissive to support Terraform's dynamic provisioning needs. Security is enforced through: -- Repository access controls (who can push code) -- Pull request reviews (all changes peer-reviewed) -- Branch protection rules (required approvals) -- CloudTrail audit logging (full attribution) -- Account isolation (separate runners per account) - -This is the industry-standard approach used by Terraform Cloud, Spacelift, and other IaC platforms. For security team discussions, see **[Addressing Security Concerns](./AWS_PERMISSIONS.md#addressing-security-concerns)** in the permissions documentation. +**Security Note:** The admin policy is intentionally permissive to support Terraform's dynamic provisioning. Security is enforced through repository access controls, pull request reviews, branch protection rules, and CloudTrail audit logging. See **[AWS_PERMISSIONS.md](./AWS_PERMISSIONS.md)** and **[SECURITY_REVIEW.md](./SECURITY_REVIEW.md)**. ### ECS Execution Role -The **execution role** is used by the ECS service itself to set up and manage the container lifecycle. This role is never directly used by your workflows. - -**Purpose:** Allows ECS to pull images, write logs, and retrieve secrets needed to start the container - -**Role Name Pattern:** `{hostname}-task-execution-role` (e.g., `CSVD-task-execution-role`) +Used by ECS to manage the container lifecycle (image pull, log write, secret retrieval). 
-**Managed Policy Automatically Attached:** -- `arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy` - -This AWS-managed policy provides: -```json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "ecr:GetAuthorizationToken", - "ecr:BatchCheckLayerAvailability", - "ecr:GetDownloadUrlForLayer", - "ecr:BatchGetImage", - "logs:CreateLogStream", - "logs:PutLogEvents" - ], - "Resource": "*" - } - ] -} -``` +**Role Name Pattern:** `{repo_org}-task-execution-role` -**When to Modify:** -You typically don't need to modify this role unless: -- Using private ECR repositories in a different account (add cross-account ECR permissions) -- Pulling images from additional registries -- Need to access secrets from Secrets Manager for environment variables (already included by the module) +**Managed Policy:** `arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy` ## Monitoring & Logging -### CloudWatch Logs - -**Log Group:** `/ecs-ghe-runners/{workspace}-{account_id}-{region}` +All monitoring resources are deployed automatically as part of this Terraform configuration. No additional setup is needed beyond confirming the initial SNS email subscription. 
-**Log Streams:** One per ECS task (runner instance) +### CloudWatch Log Group -**Retention:** 90 days (configurable via `log_retention_days`) +| Property | Value | +|----------|-------| +| Log group pattern | `/ecs-ghe-runners/{workspace}-{account_id}-{region}` | +| Retention | 90 days | -**View Logs:** ```bash -# List log streams -aws logs describe-log-streams \ - --log-group-name /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 - -# Tail logs -aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 --follow +# Tail runner logs +aws logs tail /ecs-ghe-runners/default-229685449397-us-gov-west-1 --follow ``` -### ECS Metrics - -Available in CloudWatch: -- CPU utilization -- Memory utilization -- Task count -- Running task count - -### GitHub Actions UI - -Monitor runner status: -1. Navigate to: `https://github.e.it.census.gov/organizations/{org}/settings/actions/runners` -2. View runner status (Idle, Active, Offline) -3. Check runner labels and version - -## Troubleshooting - -### Runner Not Appearing in GitHub - -**Symptoms:** ECS tasks running, but runners not visible in GitHub organization settings - -**Troubleshooting Steps:** -1. Check CloudWatch logs for authentication errors -2. Verify OAuth App credentials in Secrets Manager -3. Confirm OAuth App has organization-level access -4. Check security group allows outbound HTTPS to GitHub - -**Common Causes:** -- Invalid or expired OAuth App credentials -- OAuth App not installed at organization level -- Network connectivity issues (proxy, security groups) - -### Runner Shows as Offline - -**Symptoms:** Runner appears in GitHub but status is "Offline" - -**Troubleshooting Steps:** -1. Check ECS task status in AWS Console -2. Verify task is in RUNNING state -3. Review CloudWatch logs for errors -4. 
Check proxy connectivity - -**Common Causes:** -- ECS task crashed or stopped -- Network connectivity lost -- GitHub token expired or revoked - -### Authentication Failures in Workflows - -**Symptoms:** Workflows fail with AWS permission errors - -**Troubleshooting Steps:** -1. Verify ECS Task Role has required permissions -2. Check IAM policy attached to task role -3. Review CloudTrail for access denied events -4. Confirm runner is in correct AWS account - -**Common Causes:** -- Insufficient IAM permissions on task role -- Wrong AWS account (runner in different account) -- Cross-account role assumption issues - -### High Memory/CPU Usage - -**Symptoms:** Runners consuming excessive resources +### SNS Alerting -**Troubleshooting Steps:** -1. Review CloudWatch metrics for task -2. Check workflow jobs for resource-intensive operations -3. Consider increasing task size or runner count +| Resource | Name Pattern | +|----------|-------------| +| SNS topic | `github-runner-critical-alerts-{account_id}-{region}` | +| Email subscription | `var.alert_email` | -**Solutions:** -- Increase `desired_count` for more parallel capacity -- Optimize workflow jobs (caching, parallelization) -- Scale task CPU/memory in task definition +> After first `tf apply`, **confirm the SNS subscription** from the AWS email sent to `alert_email`. -### Workflow Jobs Stuck in Queue +### CloudWatch Alarms -**Symptoms:** Jobs pending with "Waiting for a runner to pick up this job" +| Alarm | Trigger | Severity | +|-------|---------|----------| +| `github-runners-critical-capacity-{aws_account}` | `RunningTaskCount` (ECS/ContainerInsights) < 50% of `desired_count` for 10 min | Critical | +| `github-runners-emergency-all-down-{aws_account}` | `RunningTaskCount` = 0 for 1 min | Emergency | +| `{lambda_function_name}-errors` | Lambda error count > 1 (only when Lambda enabled) | Warning | -**Troubleshooting Steps:** -1. Verify runners are online and idle -2. 
Check runner labels match workflow `runs-on` -3. Confirm sufficient runner capacity +Both runner alarms also send **OK** notifications upon recovery. -**Solutions:** -- Increase `desired_count` for more runners -- Verify `runs-on` label matches deployed runners -- Check for long-running jobs blocking runners - -## Scaling Runners +### CloudWatch Dashboard -### Manual Scaling +A dashboard named `github-runners-{aws_account}` is created automatically and includes: -Update `desired_count` in `default.auto.tfvars`: - -```hcl -desired_count = 5 # Increase from 3 to 5 -``` +- Runner count over time (Average / Maximum / Minimum, with desired-count and 50% threshold annotations) +- ECS CPU and Memory utilization +- Critical alarm status panel +- Recent error events (CloudWatch Logs Insights query) -Apply changes: -```bash -terraform apply +**Access the dashboard:** ``` - -### Considerations - -- **Startup Time**: Fargate tasks take 1-2 minutes to start -- **Cost**: Each runner incurs Fargate compute costs -- **Capacity Planning**: Monitor workflow queue times and runner utilization -- **Regional Limits**: Check AWS service quotas for ECS tasks - -## Operations - -### Refreshing Runners - -Runners may need to be refreshed when they stop responding or fail to come back online. 
Common causes include: -- Expired GitHub OAuth token -- ECS tasks stuck in a failed state -- Network connectivity issues -- GitHub API issues -- Orphaned runner registrations in GitHub - -#### Standard Refresh Procedure (Terraform Taint) - -This is the **primary method** for refreshing runners and resolving most issues: - -**Step 1: Clean Up GitHub Runner Group and Runners** - -Before tainting in Terraform, manually remove stale runners from GitHub: - -```bash -# Navigate to GitHub Organization Settings -# URL: https://github.e.it.census.gov/organizations/CSVD/settings/actions/runner-groups +https://console.aws.amazon.com/cloudwatch/home?region=us-gov-west-1#dashboards:name=github-runners-{aws_account} ``` -1. **Remove Offline/Stale Runners:** - - Go to: **Organization Settings** → **Actions** → **Runners** - - Find runners with labels matching your account (e.g., `229685449397`) - - For each offline or problematic runner: - - Click the runner name - - Click **Remove runner** - - Confirm removal +## Automated Token Refresh (Lambda) -2. **Delete the Runner Group (if needed):** - - Go to: **Organization Settings** → **Actions** → **Runner groups** - - Find the runner group (typically named with account ID, e.g., `229685449397`) - - Click on the runner group - - Click **Delete runner group** - - Confirm deletion +The Lambda token refresh is **deployed and controlled by `enable_lambda_token_refresh`** (default: `false`). Enable it for production deployments to prevent runner recovery deadlocks. - **Note:** The runner group will be automatically recreated by Terraform during the next apply. +### Critical: Token Expiry Deadlock Risk -**Step 2: Taint Terraform Resources** +⚠️ GitHub registration tokens expire after **~1 hour**. If **all** runners go down AND the token in Secrets Manager has expired, ECS cannot auto-recover — new tasks need a valid token to register. The Lambda prevents this by refreshing the token every 30 minutes. 
-Once GitHub is cleaned up, taint the resources to force recreation: +### How It Works -```bash -# Taint the runner group (this will recreate it) -terraform taint 'module.github_runner.github_actions_runner_group.runner_group' - -# Taint the ECS service (this will recreate all runners) -terraform taint 'module.ecs_service.aws_ecs_service.github_runner' - -# Review what will be recreated -terraform plan -``` +1. EventBridge triggers the Lambda every 30 minutes +2. Lambda calls the GitHub API (`api/v3/orgs/{org}/actions/runners/registration-token`) using `GITHUB_TOKEN` +3. Fresh registration token is written to Secrets Manager at the path ECS tasks use on startup +4. Whenever a task is restarted for any reason, a valid token is available -**Step 3: Apply Changes** +### Enable Lambda Token Refresh -```bash -# Apply the changes - this will: -# 1. Destroy old ECS service and tasks -# 2. Recreate runner group in GitHub -# 3. Create new ECS service with fresh tasks -# 4. Register new runners automatically -terraform apply +```hcl +# In default.auto.tfvars +enable_lambda_token_refresh = true ``` -**Step 4: Verify New Runners** - ```bash -# Check ECS tasks are running -aws ecs list-tasks --cluster ecs-ghe-runners-us-gov-east-1 - -# Monitor CloudWatch logs for successful registration -aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 --follow - -# Verify in GitHub UI -# https://github.e.it.census.gov/organizations/CSVD/settings/actions/runners -# New runners should appear as "Idle" +tf apply ``` -**Why Manual GitHub Cleanup is Required:** +### Lambda Reference -- Terraform doesn't always properly deregister runners when they're in a failed state -- Orphaned runner registrations prevent new runners from registering with the same labels -- GitHub API rate limiting can cause Terraform to skip cleanup -- Runner groups with active (but offline) runners cannot be deleted automatically -- Expired GitHub OAuth tokens prevent automatic deregistration - 
-**Important:** Do NOT use `aws ecs update-service --force-new-deployment` as a workaround. This will fail if the GitHub OAuth token has expired, which is the most common cause of runner failures. Always use the Terraform taint method above after updating the token in Secrets Manager. +| Property | Value | +|----------|-------| +| Function name | `github-runner-token-refresh-{aws_account}` | +| Runtime | Python 3.11 | +| Timeout | 60 seconds | +| Schedule | Every 30 minutes (EventBridge) | +| VPC | Deployed in the same `subnets` + `security_groups` as runners | +| Authentication | `GITHUB_TOKEN` env var (value of `var.github_token`, set at deploy time) | +| Log group | `/aws/lambda/github-runner-token-refresh-{aws_account}` (7-day retention) | +| Secret path prefix | `/github-runners/{lower(repo_org)}-{account_id}-{region}` | -#### Restart Individual Tasks (For Testing Only) +> **Important:** The Lambda runs inside the VPC. It requires outbound internet access through the corporate proxy to reach the GitHub API. Ensure the subnets and security groups allow egress to `github.e.it.census.gov:443` via the proxy. -To restart a specific problematic runner: +### Invoke Lambda Manually ```bash -# List running tasks -aws ecs list-tasks --cluster ecs-ghe-runners-us-gov-east-1 - -# Stop a specific task (ECS will automatically start a replacement) -aws ecs stop-task \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --task arn:aws:ecs:us-gov-east-1:123456789012:task/ecs-ghe-runners-us-gov-east-1/abc123def456 -``` - -**Note:** Stopping a task will cause ECS to automatically start a new one if the service's desired count is maintained. - -### Handling Expired GitHub Tokens - -GitHub Actions registration tokens have a **limited lifetime (typically 1 hour)**. When tokens expire, runners cannot register and will fail to come online. - -#### Understanding the Token Architecture - -The current implementation: -1. 
Terraform retrieves a registration token via `data.github_actions_organization_registration_token` -2. Token is stored in AWS Secrets Manager at `/github-runners/{namespace}/{hostname}-{random_pet_id}` -3. ECS tasks read the token from Secrets Manager on startup -4. **Problem**: Registration tokens expire after ~1 hour, but the secret in Secrets Manager is not automatically updated - -#### Solution: Automated Token Refresh - -To prevent token expiration issues, implement automated token refresh using one of these approaches: - -##### Option 1: Lambda Function with EventBridge (Recommended) - -Create a Lambda function that periodically refreshes the token: - -```python -# lambda_function.py -import boto3 -import os -import requests - -def lambda_handler(event, context): - github_token = os.environ['GITHUB_TOKEN'] # GitHub PAT with admin:org - org = os.environ['GITHUB_ORG'] - github_url = os.environ['GITHUB_URL'] - secret_name = os.environ['SECRET_NAME'] - - # Get fresh registration token from GitHub API - response = requests.post( - f'{github_url}/api/v3/orgs/{org}/actions/runners/registration-token', - headers={ - 'Authorization': f'token {github_token}', - 'Accept': 'application/vnd.github.v3+json' - } - ) - - if response.status_code == 201: - new_token = response.json()['token'] - - # Update Secrets Manager - sm_client = boto3.client('secretsmanager') - sm_client.update_secret( - SecretId=secret_name, - SecretString=new_token - ) - - return {'statusCode': 200, 'body': 'Token refreshed successfully'} - else: - raise Exception(f'Failed to get token: {response.status_code}') +aws lambda invoke \ + --function-name github-runner-token-refresh-csvd-dev-ew \ + --log-type Tail \ + /tmp/lambda-output.json && cat /tmp/lambda-output.json ``` -**Terraform to deploy Lambda:** - -```hcl -# token_refresh_lambda.tf (add to ghe-runner repo) - -resource "aws_lambda_function" "token_refresh" { - filename = "token_refresh.zip" - function_name = 
"github-runner-token-refresh-${var.aws_account}" - role = aws_iam_role.lambda_refresh_role.arn - handler = "lambda_function.lambda_handler" - runtime = "python3.11" - timeout = 60 - - environment { - variables = { - GITHUB_TOKEN = var.github_token # GitHub PAT from provider - GITHUB_ORG = var.repo_org - GITHUB_URL = var.server_url - SECRET_NAME = aws_secretsmanager_secret.secret.name - } - } -} - -resource "aws_cloudwatch_event_rule" "token_refresh_schedule" { - name = "github-runner-token-refresh-${var.aws_account}" - description = "Refresh GitHub runner token every 30 minutes" - schedule_expression = "rate(30 minutes)" -} - -resource "aws_cloudwatch_event_target" "token_refresh_target" { - rule = aws_cloudwatch_event_rule.token_refresh_schedule.name - target_id = "RefreshTokenLambda" - arn = aws_lambda_function.token_refresh.arn -} - -resource "aws_lambda_permission" "allow_eventbridge" { - statement_id = "AllowExecutionFromEventBridge" - action = "lambda:InvokeFunction" - function_name = aws_lambda_function.token_refresh.function_name - principal = "events.amazonaws.com" - source_arn = aws_cloudwatch_event_rule.token_refresh_schedule.arn -} - -resource "aws_iam_role" "lambda_refresh_role" { - name = "github-runner-token-refresh-${var.aws_account}" - - assume_role_policy = jsonencode({ - Version = "2012-10-17" - Statement = [{ - Action = "sts:AssumeRole" - Effect = "Allow" - Principal = { - Service = "lambda.amazonaws.com" - } - }] - }) -} - -resource "aws_iam_role_policy" "lambda_refresh_policy" { - name = "token-refresh-policy" - role = aws_iam_role.lambda_refresh_role.id - - policy = jsonencode({ - Version = "2012-10-17" - Statement = [ - { - Effect = "Allow" - Action = [ - "secretsmanager:UpdateSecret", - "secretsmanager:GetSecretValue" - ] - Resource = aws_secretsmanager_secret.secret.arn - }, - { - Effect = "Allow" - Action = [ - "logs:CreateLogGroup", - "logs:CreateLogStream", - "logs:PutLogEvents" - ] - Resource = "arn:aws:logs:*:*:*" - } - ] - }) -} -``` 
- -**Benefits:** -- Token automatically refreshes every 30 minutes -- Runners can be restarted anytime without token concerns -- No manual intervention required -- `force-new-deployment` becomes viable - -##### Option 2: Terraform Lifecycle Ignore Changes - -If implementing automated refresh is not feasible immediately, configure Terraform to ignore token changes: - -```hcl -# In main.tf -resource "aws_secretsmanager_secret_version" "secret" { - secret_id = aws_secretsmanager_secret.secret.id - secret_string = local.token - - lifecycle { - ignore_changes = [secret_string] - } -} -``` - -Then manually refresh tokens via AWS CLI when needed: +### View Lambda Logs ```bash -# Get fresh token from GitHub -GITHUB_TOKEN="your-pat-token" -ORG="CSVD" -GITHUB_URL="https://github.e.it.census.gov" - -NEW_TOKEN=$(curl -X POST \ - -H "Authorization: token $GITHUB_TOKEN" \ - -H "Accept: application/vnd.github.v3+json" \ - "$GITHUB_URL/api/v3/orgs/$ORG/actions/runners/registration-token" \ - | jq -r '.token') - -# Update Secrets Manager -aws secretsmanager update-secret \ - --secret-id /github-runners/csvd/runner-name-xyz123 \ - --secret-string "$NEW_TOKEN" - -# Force new deployment to pick up new token -aws ecs update-service \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --service github-runner-service \ - --force-new-deployment +aws logs tail /aws/lambda/github-runner-token-refresh-csvd-dev-ew --follow ``` -**Benefits:** -- Allows manual token management outside Terraform -- Enables `force-new-deployment` for quick refreshes -- No Lambda infrastructure needed - -**Drawbacks:** -- Requires manual intervention when tokens expire -- Need to remember to refresh tokens periodically - -#### Current Manual Refresh Procedure (Until Automation) - -Until automated token refresh is implemented, use this procedure: - -#### 1. 
Verify Token Expiration +## Troubleshooting -Check CloudWatch logs for authentication errors: +### Runner Not Appearing in GitHub -```bash -aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 \ - --follow \ - --filter-pattern "401\|authentication\|token\|expired" -``` +**Symptoms:** ECS tasks running, but runners not visible in GitHub organization settings -Look for messages like: -- `HTTP 401: Unauthorized` -- `Failed to register runner` -- `Authentication failed` -- `Token expired` +1. Check CloudWatch logs for authentication errors (look for `401`, `failed`, `token`) +2. Verify `GITHUB_TOKEN` is valid and has `admin:org` scope +3. If `enable_lambda_token_refresh = false`, the token in Secrets Manager may be expired (> 1 h) +4. Verify security group allows outbound HTTPS (443) via the proxy to `github.e.it.census.gov` -#### 2. Clean Up GitHub (Runners and Runner Group) +### Runner Shows as Offline -#### 2. Clean Up GitHub (Runners and Runner Group) +**Symptoms:** Runner appears in GitHub but status is "Offline" -Before applying Terraform changes, manually remove stale runners: +1. Check ECS task status — verify task is in `RUNNING` state +2. Review CloudWatch logs for errors +3. Verify proxy connectivity from the VPC -1. **Remove Offline/Stale Runners:** - - Go to: `https://github.e.it.census.gov/organizations/CSVD/settings/actions/runners` - - Find runners with your account label (e.g., `229685449397`) - - Click each runner → **Remove runner** → Confirm +### Authentication Failures in Workflows -2. **Delete the Runner Group:** - - Go to: `https://github.e.it.census.gov/organizations/CSVD/settings/actions/runner-groups` - - Find the runner group (e.g., `229685449397`) - - Click the runner group → **Delete runner group** → Confirm +**Symptoms:** Workflows fail with AWS permission errors -#### 3. Taint and Apply Terraform +1. Verify ECS Task Role has required permissions (`{repo_org}-task-role`) +2. Confirm the correct IAM policy is attached +3. 
Review CloudTrail for `AccessDenied` events +4. Confirm the runner is in the correct AWS account -#### 3. Taint and Apply Terraform +### ECR Pull Failures -Taint the resources to force recreation with a fresh token: +**Symptoms:** Tasks fail with `CannotPullContainerError` ```bash -# Taint the secret to force new token retrieval -terraform taint 'aws_secretsmanager_secret_version.secret' - -# Taint the runner group -terraform taint 'module.github_runner.github_actions_runner_group.runner_group' - -# Taint the ECS service -terraform taint 'module.ecs_service.aws_ecs_service.github_runner' - -# Apply changes - Terraform will: -# 1. Retrieve fresh registration token from GitHub via data source -# 2. Update secret in Secrets Manager automatically -# 3. Recreate runner group in GitHub -# 4. Recreate ECS service and tasks -# 5. New runners will register using the fresh token -terraform apply +# Verify image exists in private ECR (when enable_ecr_clone = true) +aws ecr describe-images \ + --repository-name github-runners/github-runner \ + --image-ids imageTag=1.67.0 ``` -#### 4. Verify Runners Come Online +If `enable_ecr_clone = false`, verify the public ECR image is accessible from the VPC. -Monitor runner registration: +### High Memory/CPU Usage -```bash -# Watch CloudWatch logs -aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 --follow +1. Review the `github-runners-{aws_account}` CloudWatch dashboard +2. Consider increasing `desired_count` for more parallel capacity +3. 
Optimize workflow jobs (caching, parallelization) -# Check ECS task status -aws ecs describe-services \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --services github-runner-service \ - --query 'services[0].{Running:runningCount,Desired:desiredCount,Pending:pendingCount}' -``` +### Workflow Jobs Stuck in Queue -Expected log output on successful registration: -``` -Runner successfully registered -Runner listening for Jobs -Runner connected to GitHub -``` +**Symptoms:** Jobs stay in "Waiting for a runner to pick up this job" state -Check GitHub UI: -1. Navigate to `https://github.e.it.census.gov/organizations/CSVD/settings/actions/runners` -2. Verify runners show status "Idle" (not "Offline") +1. Verify runners are online and idle in GitHub UI +2. Check runner labels match the `runs-on` value in the workflow +3. Increase `desired_count` if all runners are occupied -**Important:** Terraform automatically retrieves and manages the GitHub token via data source. You should never manually update tokens in Terraform variables or AWS Secrets Manager. +## Scaling Runners -### Handling Stuck ECS Tasks +### Via Terraform (Persistent) -If tasks are stuck in `PENDING` or `STOPPED` state and not recovering: +Update `desired_count` in `default.auto.tfvars`: -#### Diagnose the Issue +```hcl +desired_count = 5 +``` ```bash -# Check task status and error messages -aws ecs describe-tasks \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --tasks $(aws ecs list-tasks --cluster ecs-ghe-runners-us-gov-east-1 --query 'taskArns[0]' --output text) \ - --query 'tasks[0].{Status:lastStatus,Reason:stoppedReason,Containers:containers[0].reason}' +tf apply ``` -Common issues: -- **"CannotPullContainerError"**: ECR access issues or invalid image -- **"ResourceInitializationError"**: Network or security group issues -- **"TaskFailedToStart"**: Task role or execution role permission issues +### Via AWS CLI (Temporary) -#### Solutions - -**1. 
ECR Access Issues:** ```bash -# Verify ECR image exists -aws ecr describe-images \ - --repository-name github-runner \ - --image-ids imageTag=2.311.0 +# Scale up +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --service SCT-Engineering \ + --desired-count 5 -# Check task execution role has ECR permissions -aws iam get-role-policy \ - --role-name ecsTaskExecutionRole \ - --policy-name ECRAccessPolicy +# Scale down +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --service SCT-Engineering \ + --desired-count 2 ``` -**2. Network Issues:** -```bash -# Verify security group allows outbound HTTPS -aws ec2 describe-security-groups \ - --group-ids sg-xyz789 \ - --query 'SecurityGroups[0].IpPermissionsEgress' - -# Check subnets have route to NAT Gateway or proxy -aws ec2 describe-route-tables \ - --filters "Name=association.subnet-id,Values=subnet-abc123" -``` +> CLI changes are temporary and will be overwritten by the next `tf apply`. -**3. Force Clean Restart:** +**Warning:** Scaling down terminates runner tasks. Active jobs may be cancelled unless they complete before ECS stops the task. 
-If tasks remain stuck, scale down to zero and back up: +### Emergency Scale to Zero ```bash -# Scale down to 0 +# Stop all runners immediately aws ecs update-service \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --service github-runner-service \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --service SCT-Engineering \ --desired-count 0 -# Wait 30 seconds for tasks to stop -sleep 30 - -# Scale back up to desired count +# Restore aws ecs update-service \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --service github-runner-service \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --service SCT-Engineering \ --desired-count 3 ``` -### Scaling Operations - -#### Scale Up During High Demand +## Operations -```bash -# Increase runner count to 10 -aws ecs update-service \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --service github-runner-service \ - --desired-count 10 - -# Or via Terraform -# Update default.auto.tfvars: -# desired_count = 10 -terraform apply -``` +### Refreshing Runners -#### Scale Down During Low Activity +Use the Terraform taint method when runners stop responding, fail to register, or tokens have expired. -```bash -# Decrease runner count to 2 -aws ecs update-service \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --service github-runner-service \ - --desired-count 2 -``` +**Why Manual GitHub Cleanup is Required First:** +- Orphaned runner registrations prevent new runners from registering with the same labels +- Runner groups that contain offline runners cannot be deleted automatically -**Warning:** Scaling down will terminate runners. Active jobs will be cancelled unless they complete before the grace period expires. +**Step 1: Clean Up GitHub** -#### Emergency Scale to Zero +1. Go to: `https://github.e.it.census.gov/organizations/SCT-Engineering/settings/actions/runners` +2. For each offline/stale runner: click the runner name → **Remove runner** → Confirm +3. 
Go to: `https://github.e.it.census.gov/organizations/SCT-Engineering/settings/actions/runner-groups` +4. Find the runner group named after the AWS account ID → **Delete runner group** → Confirm -In case of issues requiring all runners to be stopped: +**Step 2: Taint Resources and Apply** ```bash -# Stop all runners -aws ecs update-service \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --service github-runner-service \ - --desired-count 0 +# Force recreation of the GitHub runner group +tf taint 'module.github-runner.github_actions_runner_group.runner_group' + +# Force recreation of the ECS service (all runner tasks restarted) +tf taint 'module.github-runner.module.ecs_service.aws_ecs_service.github_runner' + +# Force fresh token retrieval from GitHub +tf taint 'module.github-runner.aws_secretsmanager_secret_version.secret' -# Active workflows will fail -# Runners will deregister from GitHub +# Review and apply +tf plan +tf apply ``` -To restore: +**Step 3: Verify Recovery** + ```bash -aws ecs update-service \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --service github-runner-service \ - --desired-count 3 +# Watch ECS task count recover +watch -n 5 'aws ecs describe-services \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --services SCT-Engineering \ + --query "services[0].{Desired:desiredCount,Running:runningCount,Pending:pendingCount}"' + +# Stream runner registration events +aws logs tail /ecs-ghe-runners/default-229685449397-us-gov-west-1 --follow ``` -### Monitoring Runner Health +**Expected Timeline:** ~10–15 minutes for full recovery. 
-#### Check Runner Status +### Check Runner Status ```bash -# Quick status check aws ecs describe-services \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --services github-runner-service \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --services SCT-Engineering \ --query 'services[0].{Desired:desiredCount,Running:runningCount,Pending:pendingCount,Deployments:deployments[*].{Status:status,Running:runningCount,Desired:desiredCount}}' ``` @@ -1062,228 +726,76 @@ Healthy state: "Desired": 3, "Running": 3, "Pending": 0, - "Deployments": [ - { - "Status": "PRIMARY", - "Running": 3, - "Desired": 3 - } - ] -} -``` - -#### Monitor GitHub Runner Registration - -Create a script to check runner status: - -```bash -#!/bin/bash -# check-runners.sh - -EXPECTED_COUNT=3 -ORG="CSVD" -TOKEN="your-github-token" # Or use gh cli - -# Get runner count from GitHub API -RUNNER_COUNT=$(curl -s \ - -H "Authorization: Bearer $TOKEN" \ - https://github.e.it.census.gov/api/v3/orgs/$ORG/actions/runners \ - | jq '[.runners[] | select(.labels[].name | contains("229685449397")) | select(.status == "online")] | length') - -echo "Expected: $EXPECTED_COUNT" -echo "Online: $RUNNER_COUNT" - -if [ "$RUNNER_COUNT" -lt "$EXPECTED_COUNT" ]; then - echo "WARNING: Runners below expected count!" 
- exit 1 -fi -``` - -#### Set Up CloudWatch Alarms - -Create alarms for runner health monitoring: - -```hcl -resource "aws_cloudwatch_metric_alarm" "runner_count_low" { - alarm_name = "github-runners-count-low-229685449397" - comparison_operator = "LessThanThreshold" - evaluation_periods = 2 - metric_name = "RunningTaskCount" - namespace = "AWS/ECS" - period = 300 - statistic = "Average" - threshold = 2 - alarm_description = "Alert when runner count drops below 2" - - dimensions = { - ServiceName = "github-runner-service" - ClusterName = "ecs-ghe-runners-us-gov-east-1" - } - - alarm_actions = [aws_sns_topic.ops_alerts.arn] -} - -resource "aws_cloudwatch_metric_alarm" "runner_task_failed" { - alarm_name = "github-runners-task-failures-229685449397" - comparison_operator = "GreaterThanThreshold" - evaluation_periods = 1 - metric_name = "TasksStoppedReason" - namespace = "AWS/ECS" - period = 300 - statistic = "Sum" - threshold = 3 - alarm_description = "Alert when multiple runner tasks fail" - - dimensions = { - ServiceName = "github-runner-service" - ClusterName = "ecs-ghe-runners-us-gov-east-1" - } - - alarm_actions = [aws_sns_topic.ops_alerts.arn] + "Deployments": [{ "Status": "PRIMARY", "Running": 3, "Desired": 3 }] } ``` -### Common Operational Tasks - -#### View Recent Runner Activity +### View Recent Runner Activity ```bash -# Check CloudWatch logs for recent job executions -aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 \ +aws logs tail /ecs-ghe-runners/default-229685449397-us-gov-west-1 \ --since 1h \ - --filter-pattern "Job\|Running\|Completed" -``` - -#### Identify Long-Running Jobs - -```bash -# List tasks with start times -aws ecs list-tasks \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --query 'taskArns[]' \ - --output text | xargs -I {} aws ecs describe-tasks \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --tasks {} \ - --query 'tasks[].{TaskArn:taskArn,StartedAt:startedAt,Status:lastStatus}' + --filter-pattern 
"Job|Running|Completed|error|fail" ``` -#### Gracefully Drain a Runner +### Handling Expired Tokens (Without Lambda) -To remove a runner without interrupting active jobs: +If `enable_lambda_token_refresh = false` and the Secrets Manager token has expired: ```bash -# Reduce desired count by 1 -aws ecs update-service \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --service github-runner-service \ - --desired-count 2 - -# ECS will wait for a runner to complete its current job before stopping -# This may take up to 1 hour (default stop timeout) -``` - -### Disaster Recovery - -#### Complete Runner Recovery - -If all runners are offline, unresponsive, or completely broken, follow this complete recovery procedure: - -**Step 1: Manual GitHub Cleanup** - -Before any infrastructure changes, clean up GitHub completely: - -```bash -# 1. Remove all runners from the organization -# URL: https://github.e.it.census.gov/organizations/CSVD/settings/actions/runners - -# For each runner with your account label (e.g., 229685449397): -# - Click the runner -# - Click "Remove runner" -# - Confirm deletion -``` - -```bash -# 2. Delete the runner group -# URL: https://github.e.it.census.gov/organizations/CSVD/settings/actions/runner-groups +# Fetch a fresh token (requires valid GITHUB_TOKEN env var) +NEW_TOKEN=$(curl -s -X POST \ + -H "Authorization: token $GITHUB_TOKEN" \ + -H "Accept: application/vnd.github.v3+json" \ + "https://github.e.it.census.gov/api/v3/orgs/SCT-Engineering/actions/runners/registration-token" \ + | jq -r '.token') -# Find the runner group (e.g., "229685449397") -# - Click the runner group name -# - Click "Delete runner group" -# - Confirm deletion +# Update the secret (find the secret name in AWS Console or from tf outputs) +aws secretsmanager update-secret \ + --secret-id /github-runners/sct-engineering-229685449397-us-gov-west-1/... 
\ + --secret-string "$NEW_TOKEN" ``` -**Step 2: Verify GitHub OAuth App (if authentication issues)** - -If runners are failing due to authentication, verify the OAuth App configuration: - -1. Navigate to: **GitHub Organization Settings** → **Developer settings** → **OAuth Apps** -2. Ensure the OAuth App exists and is active -3. Verify Terraform's GitHub provider is authenticated correctly - -**Note:** Terraform automatically retrieves the GitHub token using a data source. You do NOT need to manually update tokens. - -**Step 3: Terraform Taint and Recreate** - -```bash -# Taint GitHub runner group -terraform taint 'module.github_runner.github_actions_runner_group.runner_group' - -# Taint ECS service -terraform taint 'module.ecs_service.aws_ecs_service.github_runner' +Then follow the taint-and-apply procedure above to restart the ECS service. -# Review planned changes -terraform plan - -# Apply changes to recreate everything -# Terraform will automatically retrieve a fresh token from GitHub -terraform apply -``` - -**Step 4: Monitor Recovery** +### Handling Stuck ECS Tasks ```bash -# Watch ECS tasks start -watch -n 5 'aws ecs describe-services \ - --cluster ecs-ghe-runners-us-gov-east-1 \ - --services github-runner-service \ - --query "services[0].{Desired:desiredCount,Running:runningCount,Pending:pendingCount}"' - -# Monitor runner registration in logs -aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 --follow +# Diagnose task stop reason +aws ecs describe-tasks \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --tasks $(aws ecs list-tasks \ + --cluster ecs-ghe-runners-us-gov-west-1 \ + --query 'taskArns[0]' --output text) \ + --query 'tasks[0].{Status:lastStatus,Reason:stoppedReason,Container:containers[0].reason}' ``` -**Step 5: Verify in GitHub** +Common stop reasons: -Check that runners are online and ready: -1. Go to: `https://github.e.it.census.gov/organizations/CSVD/settings/actions/runners` -2. 
Verify runner count matches desired count (e.g., 3 runners) -3. Verify all runners show status "Idle" (not "Offline") -4. Verify runner labels are correct (account ID, region, etc.) +| Reason | Likely Cause | +|--------|-------------| +| `CannotPullContainerError` | ECR access issue or invalid `image_version` | +| `ResourceInitializationError` | Network or security group misconfiguration | +| `TaskFailedToStart` | Task role or execution role permission issue | -**Expected Timeline:** -- GitHub cleanup: 2-5 minutes -- Terraform apply: 3-5 minutes -- Runner registration: 2-3 minutes -- **Total recovery time: ~10-15 minutes** +## Disaster Recovery -#### Terraform State Recovery +See **[RUNBOOK.md](./RUNBOOK.md)** for complete incident response procedures covering: +1. Lambda token refresh failing +2. Runners at 50% capacity +3. All runners down (EMERGENCY) -If Terraform state is corrupted or lost: +### Terraform State Recovery ```bash -# 1. Re-import ECS service -terraform import module.ecs_service.aws_ecs_service.main \ - ecs-ghe-runners-us-gov-east-1/github-runner-service +# Re-import ECS cluster (when create_ecs_cluster = true) +tf import aws_ecs_cluster.github-runner[0] ecs-ghe-runners-us-gov-west-1 -# 2. Re-import task definition -terraform import module.task_definition.aws_ecs_task_definition.main \ - github-runner-229685449397 +# Verify state +tf plan -# 3. Verify state -terraform plan - -# 4. Apply any drift corrections -terraform apply +# Apply drift corrections +tf apply ``` ## Maintenance @@ -1291,71 +803,68 @@ terraform apply ### Updating Runner Version 1. Update `image_version` in `default.auto.tfvars`: -```hcl -image_version = "2.312.0" # New version -``` - -2. Apply changes: -```bash -terraform apply -``` + ```hcl + image_version = "1.68.0" + ``` +2. Apply: + ```bash + tf apply + ``` -ECS will perform a rolling update, replacing tasks one at a time. +When `enable_ecr_clone = true`, the new image is automatically mirrored to private ECR during apply. 
ECS performs a rolling update, replacing one task at a time. ### Updating Configuration -All configuration changes should be made via Terraform: +All configuration changes must be made via Terraform: 1. Modify variables in `default.auto.tfvars` -2. Run `terraform plan` to preview changes -3. Run `terraform apply` to apply changes +2. `tf plan` — preview changes +3. `tf apply` — apply changes -**Never modify resources directly in AWS Console** - changes will be overwritten by Terraform. +**Never modify resources directly in the AWS Console** — changes will be overwritten by the next `tf apply`. -### Backup and Disaster Recovery +### Terraform State Backend -**State Management:** -- Terraform state is stored in S3 backend -- State locking via DynamoDB -- State versioning enabled - -**Recovery Process:** -1. Clone repository -2. Initialize Terraform with existing state -3. Run `terraform plan` to verify state -4. Run `terraform apply` to recreate resources if needed +| Property | Value | +|----------|-------| +| Backend type | S3 (configured in `backend.tf` and `backend-configs/`) | +| State locking | DynamoDB | +| Workspace isolation | One state file per workspace | ## Security Best Practices -1. **Use Private Subnets**: Deploy runners in private subnets without public IPs -2. **Minimize IAM Permissions**: Grant only necessary permissions to task role -3. **Rotate OAuth App Credentials**: Periodically rotate GitHub OAuth App credentials -4. **Enable VPC Endpoints**: Reduce internet egress and improve security -5. **Monitor Logs**: Regularly review CloudWatch logs for suspicious activity -6. **Update Runner Images**: Keep runner container images up to date -7. **Restrict Security Groups**: Allow only necessary outbound traffic +1. **Use Private Subnets**: Deploy runners without public IPs +2. **Never Commit `GITHUB_TOKEN`**: Always set via environment variable, never in `.tfvars` +3. 
**Minimize IAM Permissions**: Replace the default admin task role policy with least-privilege permissions +4. **Rotate GitHub PAT**: Periodically rotate the PAT and redeploy (`tf apply`) to update Lambda env vars +5. **Enable VPC Endpoints**: Set `create_vpc_endpoint = true` to reduce internet egress +6. **Respond to Alarms**: Act on SNS alerting emails promptly to prevent runner outages +7. **Update Runner Images**: Keep `image_version` current +8. **Restrict Security Groups**: Allow only necessary outbound traffic ## Cost Optimization **Fargate Pricing Factors:** -- CPU and memory allocation -- Task run duration +- CPU and memory allocation per task +- Continuous run duration (runners are 24/7) - Number of concurrent tasks **Optimization Strategies:** -1. Right-size task CPU/memory for workload -2. Scale `desired_count` based on actual usage -3. Use workflow job timeouts to prevent runaway jobs -4. Enable VPC endpoints to reduce data transfer costs -5. Use caching strategies in workflows to reduce execution time +1. Set `desired_count` to match actual workload (start at `1` for low-traffic accounts) +2. Right-size task CPU/memory using CloudWatch utilization metrics from the dashboard +3. Enable VPC endpoints (`create_vpc_endpoint = true`) to reduce data transfer costs +4. Use workflow job timeouts to prevent runaway jobs +5. 
Use caching strategies in workflows to reduce execution time per job ## Related Documentation -- [GitHub Actions Architecture](https://github.e.it.census.gov/CSVD/github-actions/blob/main/GITHUB_ACTIONS_ARCHITECTURE.md) - Overall GitHub Actions ecosystem -- [Composite Actions](https://github.e.it.census.gov/CSVD/github-actions/wiki) - Available reusable actions -- [AWS Permissions Documentation](./AWS_PERMISSIONS.md) - Complete IAM permission requirements and examples -- [Security Review Guide](./SECURITY_REVIEW.md) - Summary for security team reviews and approvals -- [GitHub App Setup](./GITHUB_APP_SETUP.md) - GitHub App authentication configuration +- [Emergency Runbook](./RUNBOOK.md) — Incident response procedures +- [Monitoring Implementation Plan](./MONITORING_IMPLEMENTATION_PLAN.md) — Monitoring design details +- [Monitoring Deployment Summary](./MONITORING_DEPLOYMENT_SUMMARY.md) — Deployed monitoring resources +- [AWS Permissions Documentation](./AWS_PERMISSIONS.md) — Complete IAM requirements and examples +- [Security Review Guide](./SECURITY_REVIEW.md) — Summary for security team reviews and approvals +- [GitHub App Setup](./GITHUB_APP_SETUP.md) — Optional GitHub App authentication configuration +- [GitHub App Migration Guide](./GITHUB_APP_MIGRATION.md) — Migrating from PAT to GitHub App - [AWS ECS Fargate Documentation](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/AWS_Fargate.html) - [GitHub Actions Self-Hosted Runners](https://docs.github.com/en/actions/hosting-your-own-runners) @@ -1365,20 +874,18 @@ For assistance: 1. **Infrastructure Issues**: Contact the infrastructure team 2. **Workflow Issues**: Review workflow-specific documentation 3. **GitHub Issues**: Open an issue in this repository -4. **AWS Issues**: Check CloudWatch logs and ECS console +4. **AWS Issues**: Check CloudWatch logs, alarms, and the ECS console ## Contributing -We welcome contributions! Please follow these steps: - 1. Fork the repository 2. 
Create a feature branch (`git checkout -b feature/improvement`) 3. Make your changes -4. Test thoroughly in a dev environment -5. Submit a pull request with detailed description -6. Ensure Terraform formatting (`terraform fmt`) +4. Test in a dev environment using `tf plan` / `tf apply` +5. Submit a pull request with a detailed description +6. Ensure Terraform formatting (`tf fmt`) 7. Update documentation as needed ## License -Internal use only - Census Bureau CSVD organization. +Internal use only — Census Bureau CSVD organization. diff --git a/codebuild/Makefile b/codebuild/Makefile new file mode 100644 index 0000000..850cfa1 --- /dev/null +++ b/codebuild/Makefile @@ -0,0 +1,329 @@ +# ───────────────────────────────────────────────────────────────────────────── +# Makefile — ghe-runner CodeBuild cronjob +# +# Usage: +# make [TF_WORKSPACE=] [GITHUB_TOKEN_SECRET_ARN=] +# +# TF_WORKSPACE selects which ghe-runner workspace the CodeBuild job will +# deploy (maps to varfiles/{workspace}.tfvars in the repo root). It also +# drives backend-config selection for this Terraform configuration. +# +# Prerequisites: +# - AWS credentials exported in the current shell (or via awscreds) +# - GITHUB_TOKEN environment variable set to a valid PAT +# - terraform >= 1.9 on PATH (or ~/git/tfenv/bin/terraform) +# ───────────────────────────────────────────────────────────────────────────── + +# ── Configurable variables ──────────────────────────────────────────────────── + +## Which ghe-runner workspace the CodeBuild job will target. +## Also used to select the backend-config for this Terraform workspace. +## Defaults to the active workspace of the parent ghe-runner directory so you +## don't have to set it manually — just run `tf workspace select ` in the +## repo root and make will pick it up automatically. +## Override with: make TF_WORKSPACE=csvd +TF_WORKSPACE ?= $(shell cd .. && tf workspace show 2>/dev/null || echo default) + +## AWS region for all operations. 
+AWS_REGION ?= us-gov-west-1 + +## HTTPS clone URL of the ghe-runner repo. Passed to Terraform as a variable +## so CodeBuild knows where to check out from. +SOURCE_LOCATION ?= https://github.e.it.census.gov/CSVD/ghe-runners.git + +## Branch/tag CodeBuild will check out. +SOURCE_VERSION ?= main + +## Name of the Secrets Manager secret that holds the GitHub PAT. +## The secret value must be a JSON object with a "token" key. +SECRET_NAME ?= ghe-runner/github-token + +## ARN of the Secrets Manager secret. Populated automatically by `make secret` +## or can be supplied directly: make apply GITHUB_TOKEN_SECRET_ARN=arn:... +GITHUB_TOKEN_SECRET_ARN ?= + +# ── Internal variables ──────────────────────────────────────────────────────── + +REPO_ROOT := $(shell git -C .. rev-parse --show-toplevel 2>/dev/null || echo ..) +# Use backend-configs/ local to codebuild/ — these have a distinct state key +# (ghe-runner-codebuild) so codebuild state never collides with the parent +# ghe-runner state stored under the same bucket. +BACKEND_CONFIGS := backend-configs +# Mirror the resolution order used by ~/bin/tf: +# 1. TERRAFORM_BINARY env var (explicit override) +# 2. ~/git/tfenv/bin/terraform (project standard) +# 3. 
first `terraform` on PATH (last resort) +TF := $(or \ + $(and $(TERRAFORM_BINARY),$(wildcard $(TERRAFORM_BINARY)),$(TERRAFORM_BINARY)),\ + $(wildcard $(HOME)/git/tfenv/bin/terraform),\ + $(shell command -v terraform 2>/dev/null)) +TF_DATA_DIR := $(REPO_ROOT)/terraform_data_dirs/codebuild-$(TF_WORKSPACE) + +# Pick a backend-config file: prefer workspace-specific, fall back to csvd-dev-ew +_BACKEND_FILE := $(BACKEND_CONFIGS)/$(TF_WORKSPACE).tf +BACKEND_CONFIG := $(shell [ -f "$(_BACKEND_FILE)" ] && echo "$(_BACKEND_FILE)" || echo "$(BACKEND_CONFIGS)/csvd-dev-ew.tf") + +# Build common terraform var flags +_TF_VARS := \ + -var="tf_workspace=$(TF_WORKSPACE)" \ + -var="aws_region=$(AWS_REGION)" \ + -var="source_location=$(SOURCE_LOCATION)" \ + -var="source_version=$(SOURCE_VERSION)" + +# Append secret ARN only when provided — avoids a blank string reaching terraform +ifneq ($(GITHUB_TOKEN_SECRET_ARN),) + _TF_VARS += -var="github_token_secret_arn=$(GITHUB_TOKEN_SECRET_ARN)" +endif + +# Colorized output helpers (degrade gracefully if no tty) +BOLD := $(shell tput bold 2>/dev/null) +RESET := $(shell tput sgr0 2>/dev/null) +CYAN := $(shell tput setaf 6 2>/dev/null) +YELLOW := $(shell tput setaf 3 2>/dev/null) +RED := $(shell tput setaf 1 2>/dev/null) + +# ── Phony declarations ──────────────────────────────────────────────────────── + +.PHONY: help init plan apply destroy validate fmt \ + setup-credentials secret show-outputs trigger logs clean check-env \ + force-unlock break-lock + +# ── Default target ──────────────────────────────────────────────────────────── + +.DEFAULT_GOAL := help + +## help: Show this help message +help: + @echo "" + @echo "$(BOLD)$(CYAN)ghe-runner CodeBuild cronjob$(RESET)" + @echo "" + @echo "$(BOLD)Usage:$(RESET)" + @echo " make [TF_WORKSPACE=] [GITHUB_TOKEN_SECRET_ARN=]" + @echo "" + @echo "$(BOLD)Current settings:$(RESET)" + @echo " TF_WORKSPACE = $(TF_WORKSPACE)" + @echo " AWS_REGION = $(AWS_REGION)" + @echo " SECRET_NAME = $(SECRET_NAME)" + 
@echo " GITHUB_TOKEN_SECRET_ARN= $(if $(GITHUB_TOKEN_SECRET_ARN),$(GITHUB_TOKEN_SECRET_ARN),$(YELLOW)(not set — run: make secret)$(RESET))" + @echo " BACKEND_CONFIG = $(BACKEND_CONFIG)" + @echo " TF_DATA_DIR = $(TF_DATA_DIR)" + @echo "" + @echo "$(BOLD)Targets:$(RESET)" + @grep -E '^## [a-zA-Z_-]+:' $(MAKEFILE_LIST) \ + | sed 's/## / /' \ + | awk -F: '{printf " $(CYAN)%-24s$(RESET) %s\n", $$1, $$2}' + @echo "" + @echo "$(BOLD)Recommended first-time flow:$(RESET)" + @echo " 1. make setup-credentials # register GHE PAT with CodeBuild (once per region)" + @echo " 2. make secret # store PAT in Secrets Manager" + @echo " 3. make init # initialize Terraform" + @echo " 4. make plan # preview changes" + @echo " 5. make apply # deploy" + @echo " 6. make trigger # run a manual build to verify" + @echo "" + +# ── Environment guard ───────────────────────────────────────────────────────── + +## check-env: Verify required tools and environment variables are present +check-env: + @echo "$(BOLD)Checking environment...$(RESET)" + @command -v aws >/dev/null 2>&1 || (echo "$(RED)ERROR: aws CLI not found$(RESET)"; exit 1) + @$(TF) version >/dev/null 2>&1 || (echo "$(RED)ERROR: terraform not found at $(TF)$(RESET)"; exit 1) + @[ -n "$(GITHUB_TOKEN)" ] || \ + (echo "$(RED)ERROR: GITHUB_TOKEN is not set. Export your GitHub PAT before running.$(RESET)"; exit 1) + @aws sts get-caller-identity --region $(AWS_REGION) >/dev/null 2>&1 || \ + (echo "$(RED)ERROR: AWS credentials are not valid or have expired. 
Run: awscreds$(RESET)"; exit 1) + @echo " terraform : $$($(TF) version 2>&1 | head -1)" + @echo " aws cli : $$(aws --version 2>&1 | awk '{print $$1}')" + @echo " identity : $$(aws sts get-caller-identity --region $(AWS_REGION) --query 'Arn' --output text)" + @echo "$(BOLD)$(CYAN)OK$(RESET)" + +# ── One-time setup ──────────────────────────────────────────────────────────── + +## setup-credentials: Register the GitHub PAT with CodeBuild (once per region) +setup-credentials: check-env + @echo "$(BOLD)Registering GHE credentials with CodeBuild in $(AWS_REGION)...$(RESET)" + @[ -n "$(GITHUB_TOKEN)" ] || (echo "$(RED)ERROR: GITHUB_TOKEN not set$(RESET)"; exit 1) + aws codebuild import-source-credentials \ + --server-type GITHUB_ENTERPRISE \ + --auth-type PERSONAL_ACCESS_TOKEN \ + --token "$(GITHUB_TOKEN)" \ + --region $(AWS_REGION) + @echo "$(BOLD)$(CYAN)Credentials registered.$(RESET)" + +## secret: Create or update the GitHub PAT secret in Secrets Manager +secret: check-env + @[ -n "$(GITHUB_TOKEN)" ] || (echo "$(RED)ERROR: GITHUB_TOKEN not set$(RESET)"; exit 1) + @echo "$(BOLD)Writing secret: $(SECRET_NAME)$(RESET)" + @EXISTING=$$(aws secretsmanager describe-secret \ + --secret-id "$(SECRET_NAME)" \ + --region $(AWS_REGION) \ + --query 'ARN' --output text 2>/dev/null); \ + if [ -n "$$EXISTING" ]; then \ + echo " Secret exists — updating value"; \ + aws secretsmanager put-secret-value \ + --secret-id "$(SECRET_NAME)" \ + --secret-string "{\"token\":\"$(GITHUB_TOKEN)\"}" \ + --region $(AWS_REGION); \ + echo " ARN: $$EXISTING"; \ + else \ + echo " Secret does not exist — creating"; \ + ARN=$$(aws secretsmanager create-secret \ + --name "$(SECRET_NAME)" \ + --description "GitHub PAT for ghe-runner CodeBuild job" \ + --secret-string "{\"token\":\"$(GITHUB_TOKEN)\"}" \ + --region $(AWS_REGION) \ + --query 'ARN' --output text); \ + echo " ARN: $$ARN"; \ + fi + @echo "" + @echo "$(BOLD)$(YELLOW)Set this ARN before running make plan/apply:$(RESET)" + @echo " export 
GITHUB_TOKEN_SECRET_ARN=$$(aws secretsmanager describe-secret \ + --secret-id '$(SECRET_NAME)' \ + --region $(AWS_REGION) \ + --query 'ARN' --output text)" + @echo "" + +# ── Terraform lifecycle ─────────────────────────────────────────────────────── + +## init: Initialize Terraform with the workspace-appropriate backend config +init: check-env + @echo "$(BOLD)Initializing Terraform...$(RESET)" + @echo " workspace : $(TF_WORKSPACE)" + @echo " backend-config: $(BACKEND_CONFIG)" + @mkdir -p $(TF_DATA_DIR) + TF_DATA_DIR=$(TF_DATA_DIR) \ + $(TF) init \ + -input=false \ + -backend-config=$(BACKEND_CONFIG) + +## validate: Validate Terraform configuration (no backend needed) +validate: + @echo "$(BOLD)Validating...$(RESET)" + TF_DATA_DIR=$(TF_DATA_DIR) $(TF) validate + +## fmt: Format all Terraform files in this directory +fmt: + $(TF) fmt -recursive . + +## plan: Show what Terraform would change +plan: check-env _require-secret-arn + @echo "$(BOLD)Planning for workspace: $(TF_WORKSPACE)$(RESET)" + TF_DATA_DIR=$(TF_DATA_DIR) \ + $(TF) plan \ + -input=false \ + $(_TF_VARS) + +## apply: Deploy or update the CodeBuild cronjob infrastructure +apply: check-env _require-secret-arn + @echo "$(BOLD)Applying for workspace: $(TF_WORKSPACE)$(RESET)" + TF_DATA_DIR=$(TF_DATA_DIR) \ + $(TF) apply \ + -input=false \ + -auto-approve \ + $(_TF_VARS) + +## destroy: Tear down the CodeBuild infrastructure for this workspace +destroy: check-env _require-secret-arn + @echo "$(BOLD)$(RED)Destroying CodeBuild infrastructure for workspace: $(TF_WORKSPACE)$(RESET)" + @echo "$(YELLOW)Press Ctrl-C within 5 seconds to abort...$(RESET)" + @sleep 5 + TF_DATA_DIR=$(TF_DATA_DIR) \ + $(TF) destroy \ + -input=false \ + $(_TF_VARS) + +## show-outputs: Print Terraform outputs for the deployed project +show-outputs: + @TF_DATA_DIR=$(TF_DATA_DIR) $(TF) output + +# ── CodeBuild operations ────────────────────────────────────────────────────── + +## trigger: Manually start a build outside the daily schedule 
+trigger: check-env + $(eval PROJECT := $(shell TF_DATA_DIR=$(TF_DATA_DIR) $(TF) output -raw codebuild_project_name 2>/dev/null)) + @[ -n "$(PROJECT)" ] || (echo "$(RED)ERROR: Could not read codebuild_project_name from state. Run make apply first.$(RESET)"; exit 1) + @echo "$(BOLD)Starting build: $(PROJECT)$(RESET)" + $(eval BUILD_ID := $(shell aws codebuild start-build \ + --project-name $(PROJECT) \ + --region $(AWS_REGION) \ + --query 'build.id' --output text)) + @echo " Build ID : $(BUILD_ID)" + @echo " Console : https://console.amazonaws-us-gov.com/codesuite/codebuild/projects/$(PROJECT)/build/$(BUILD_ID)/log" + @echo "" + @echo "Run $(BOLD)make logs BUILD_ID=$(BUILD_ID)$(RESET) to stream the output." + +## logs: Stream logs for the most recent build (or specify BUILD_ID=...) +logs: check-env + $(eval PROJECT := $(shell TF_DATA_DIR=$(TF_DATA_DIR) $(TF) output -raw codebuild_project_name 2>/dev/null)) + @[ -n "$(PROJECT)" ] || (echo "$(RED)ERROR: Could not read codebuild_project_name from state.$(RESET)"; exit 1) + @if [ -z "$(BUILD_ID)" ]; then \ + echo "$(BOLD)Fetching most recent build for $(PROJECT)...$(RESET)"; \ + LATEST=$$(aws codebuild list-builds-for-project \ + --project-name $(PROJECT) \ + --region $(AWS_REGION) \ + --query 'ids[0]' --output text); \ + echo " Build ID: $$LATEST"; \ + LOG_GROUP=$$(TF_DATA_DIR=$(TF_DATA_DIR) $(TF) output -raw log_group_name 2>/dev/null); \ + aws logs tail "$$LOG_GROUP" \ + --follow \ + --region $(AWS_REGION); \ + else \ + LOG_GROUP=$$(TF_DATA_DIR=$(TF_DATA_DIR) $(TF) output -raw log_group_name 2>/dev/null); \ + aws logs tail "$$LOG_GROUP" \ + --follow \ + --region $(AWS_REGION); \ + fi + +# ── State lock management ──────────────────────────────────────────────────── + +## force-unlock: Release a state lock by ID (use when lock ID is shown in error output) +force-unlock: check-env + @[ -n "$(LOCK_ID)" ] || { \ + echo "$(RED)ERROR: LOCK_ID is required.$(RESET)"; \ + echo " Usage: make force-unlock LOCK_ID="; \ + exit 1; 
\ + } + TF_DATA_DIR=$(TF_DATA_DIR) $(TF) force-unlock -force $(LOCK_ID) + +## break-lock: Delete the DynamoDB lock record directly (use when lock is malformed/stuck) +# Derives bucket, table, and region by parsing the active backend-config file. +break-lock: check-env + $(eval _BC := $(BACKEND_CONFIG)) + $(eval _BUCKET := $(shell awk -F'"' '/^[[:space:]]*bucket/{print $$2}' $(_BC))) + $(eval _TABLE := $(shell awk -F'"' '/^[[:space:]]*dynamodb_table/{print $$2}' $(_BC))) + $(eval _BREGION := $(shell awk -F'"' '/^[[:space:]]*region/{print $$2}' $(_BC))) + $(eval _KEY := csvd-dev-gov/common/apps/ghe-runner-codebuild) + $(eval _LOCK_ID := $(_BUCKET)/$(_KEY)) + @echo "$(BOLD)Breaking state lock:$(RESET)" + @echo " table : $(_TABLE)" + @echo " region : $(_BREGION)" + @echo " LockID : $(_LOCK_ID)" + @echo "$(YELLOW)Press Ctrl-C within 5 seconds to abort...$(RESET)" + @sleep 5 + aws dynamodb delete-item \ + --table-name "$(_TABLE)" \ + --key "{\"LockID\": {\"S\": \"$(_LOCK_ID)\"}}" \ + --region "$(_BREGION)" + @echo "$(BOLD)$(CYAN)Lock cleared.$(RESET)" + +# ── Housekeeping ────────────────────────────────────────────────────────────── + +## clean: Remove local Terraform cache for this workspace +clean: + @echo "$(BOLD)Removing $(TF_DATA_DIR)...$(RESET)" + rm -rf $(TF_DATA_DIR) + @echo "$(BOLD)Removing .terraform/...$(RESET)" + rm -rf .terraform .terraform.lock.hcl + +# ── Internal helpers (not shown in help) ───────────────────────────────────── + +_require-secret-arn: + @[ -n "$(GITHUB_TOKEN_SECRET_ARN)" ] || { \ + echo "$(RED)ERROR: GITHUB_TOKEN_SECRET_ARN is required.$(RESET)"; \ + echo ""; \ + echo " Run $(BOLD)make secret$(RESET) to create the secret, then:"; \ + echo " export GITHUB_TOKEN_SECRET_ARN="; \ + echo ""; \ + exit 1; \ + } diff --git a/codebuild/README.md b/codebuild/README.md new file mode 100644 index 0000000..0b3a5e0 --- /dev/null +++ b/codebuild/README.md @@ -0,0 +1,285 @@ +# ghe-runner CodeBuild Cronjob + +Terraform configuration that deploys a 
scheduled AWS CodeBuild job to run +`terraform apply` against the ghe-runner root workspace once per day. The +build also forces a new ECS deployment (so runners pick up the latest task +definition) and immediately invokes the token-refresh Lambda after apply. + +--- + +## Directory layout + +``` +codebuild/ +├── Makefile ← all day-to-day operations +├── buildspec.yml ← what CodeBuild actually runs +├── providers.tf ← AWS provider + S3 backend declaration +├── variables.tf ← project-specific inputs +├── main.tf ← IAM policy + module call +├── outputs.tf ← useful references after deploy +└── modules/ + └── terraform-codebuild-cronjob/ ← generic reusable module + ├── variables.tf + ├── main.tf ← CodeBuild project + EventBridge rule + ├── iam.tf ← service roles for CodeBuild + EventBridge + └── outputs.tf +``` + +The `modules/terraform-codebuild-cronjob` module is intentionally generic — it +knows nothing about GitHub, ECS, or Terraform workspaces. The project-specific +`main.tf` in this directory supplies the IAM permissions, environment variables, +and buildspec path as inputs. To reuse the module for a different cronjob, copy +`main.tf`/`variables.tf`/`outputs.tf` to a new directory and supply a different +buildspec and policy document. + +--- + +## How it works + +``` +EventBridge cron rule (daily) + │ + ▼ + CodeBuild project + │ checks out CSVD/ghe-runners @ main + │ injects GITHUB_TOKEN from Secrets Manager + │ + ├─ terraform init (workspace-aware backend-config) + ├─ terraform workspace select + ├─ terraform apply (-var-file=varfiles/.tfvars) + ├─ aws ecs update-service --force-new-deployment + └─ aws lambda invoke github-runner-token-refresh-* +``` + +`TF_WORKSPACE` is a plain environment variable on the CodeBuild project. 
+It maps to:
+
+| Value | `varfiles/` file | `backend-configs/` file |
+|-------|-----------------|------------------------|
+| `default` | `varfiles/default.tfvars` + `default.auto.tfvars` | falls back to `csvd-dev-ew.tf` |
+| `csvd` | `varfiles/csvd.tfvars` | `csvd-dev-ew.tf` *(if present)* |
+| `sct-engineering` | `varfiles/sct-engineering.tfvars` | `sct-engineering.tf` *(if present)* |
+
+---
+
+## Prerequisites
+
+| Requirement | Notes |
+|-------------|-------|
+| AWS credentials | Valid session in current shell — run `awscreds` if expired |
+| `GITHUB_TOKEN` env var | PAT with `admin:org` + `repo` scopes |
+| `terraform` ≥ 1.9 | Resolved via `~/git/tfenv/bin/terraform` or `PATH` |
+| `aws` CLI | Any recent version |
+
+---
+
+## First-time deployment
+
+Run each step in order. Steps 1–2 are **one-time per AWS region** and
+only need repeating if credentials are rotated.
+
+### Step 1 — Register GHE credentials with CodeBuild
+
+CodeBuild needs to know the PAT in order to clone from GitHub Enterprise.
+This is a regional setting stored on the CodeBuild service itself, separate
+from Secrets Manager.
+
+```bash
+export GITHUB_TOKEN="ghp_..." # your PAT
+make setup-credentials
+```
+
+### Step 2 — Store the PAT in Secrets Manager
+
+The PAT is also stored as a Secrets Manager secret so CodeBuild can inject
+it as `GITHUB_TOKEN` at build time (for the GitHub Terraform provider and
+the Lambda variable).
+
+```bash
+make secret
+```
+
+The command prints an `export` statement at the end. Run it to set
+`GITHUB_TOKEN_SECRET_ARN` in your shell before continuing:
+
+```bash
+export GITHUB_TOKEN_SECRET_ARN=arn:aws-us-gov:secretsmanager:us-gov-west-1:229685449397:secret:ghe-runner/github-token-XXXXXX
+```
+
+### Step 3 — Initialize Terraform
+
+```bash
+make init
+```
+
+This runs `terraform init -backend-config=backend-configs/csvd-dev-ew.tf`
+(the `backend-configs/` directory local to `codebuild/`, or a
+workspace-specific file from it if one exists).
Terraform state for +the CodeBuild infrastructure is stored under the key: + +``` +csvd-dev-gov/common/apps/ghe-runner-codebuild +``` + +### Step 4 — Preview changes + +```bash +make plan +``` + +Runs `terraform plan` with all workspace variables injected. Nothing is +deployed yet. + +### Step 5 — Deploy + +```bash +make apply +``` + +Creates (or updates) the CodeBuild project, EventBridge schedule rule, IAM +roles, and CloudWatch log group. The schedule starts firing immediately at +the configured cron expression (default: `cron(0 11 * * ? *)` — 11:00 UTC / +6:00 AM ET daily). + +### Step 6 — Verify with a manual build + +```bash +make trigger +``` + +Starts a build immediately outside the schedule and prints the Build ID and +a GovCloud console URL. To stream the logs: + +```bash +make logs +# or for a specific build: +make logs BUILD_ID=ghe-runner-daily-default:abc123... +``` + +--- + +## Targeting a different workspace + +Pass `TF_WORKSPACE` to any target. The Makefile re-derives backend config, +`TF_DATA_DIR`, and var-file automatically. + +```bash +# Deploy a separate CodeBuild project for the csvd workspace +make init TF_WORKSPACE=csvd +make plan TF_WORKSPACE=csvd GITHUB_TOKEN_SECRET_ARN=arn:... +make apply TF_WORKSPACE=csvd GITHUB_TOKEN_SECRET_ARN=arn:... +make trigger TF_WORKSPACE=csvd +``` + +Each workspace produces its own: +- CodeBuild project: `ghe-runner-daily-` +- EventBridge rule: `ghe-runner-daily--schedule` +- IAM roles: `ghe-runner-daily--codebuild-role` / `-events-role` +- Log group: `/codebuild/ghe-runner-daily-` +- TF data dir: `../terraform_data_dirs/codebuild-` + +--- + +## Day-to-day operations + +```bash +make help # show all targets and current settings +make check-env # verify AWS creds, terraform, GITHUB_TOKEN +make show-outputs # print Terraform outputs (project name, log group, etc.) 
+make trigger # run a build now
+make logs # tail the most recent build log
+make plan # preview infrastructure changes
+make apply # apply infrastructure changes
+make clean # wipe local .terraform cache for this workspace
+```
+
+---
+
+## Rotating the GitHub PAT
+
+When the PAT is rotated, update Secrets Manager and re-register with
+CodeBuild:
+
+```bash
+export GITHUB_TOKEN="ghp_<new-token>"
+make secret # updates the Secrets Manager secret value
+make setup-credentials # re-registers with CodeBuild service
+```
+
+No `terraform apply` is needed — the secret value is fetched at build runtime.
+
+---
+
+## Changing the schedule
+
+Override the `schedule_expression` variable with a `TF_VAR_` environment
+variable — Terraform reads `TF_VAR_*` automatically, and the Makefile passes
+the shell environment through to terraform:
+
+```bash
+TF_VAR_schedule_expression='cron(0 13 * * ? *)' \
+  make apply GITHUB_TOKEN_SECRET_ARN=arn:...
+```
+
+Or edit the default in [variables.tf](variables.tf):
+
+```hcl
+variable "schedule_expression" {
+  default = "cron(0 13 * * ? *)" # 1:00 PM UTC
+}
+```
+
+then run `make apply`.
+
+---
+
+## Destroying
+
+```bash
+make destroy GITHUB_TOKEN_SECRET_ARN=arn:...
+```
+
+This tears down the CodeBuild project, EventBridge rule, and IAM roles. It
+does **not** delete the Secrets Manager secret (created outside of this
+Terraform workspace by `make secret`) — remove that manually if needed:
+
+```bash
+aws secretsmanager delete-secret \
+  --secret-id ghe-runner/github-token \
+  --recovery-window-in-days 7 \
+  --region us-gov-west-1
+```
+
+---
+
+## Troubleshooting
+
+### `GITHUB_TOKEN_SECRET_ARN is required`
+
+`make plan` and `make apply` require the secret ARN explicitly to avoid
+accidentally deploying with a wrong or missing value. Run `make secret`
+and export the printed ARN.
+
+### `AWS credentials are not valid or have expired`
+
+Run `awscreds` to refresh your session, then retry.
+
+### CodeBuild fails with `unable to clone`
+
+The GHE credential registered with CodeBuild may have expired or been
+rotated. Run `make setup-credentials` with a fresh `GITHUB_TOKEN`.
+ +### `terraform init` fails with `no valid credential sources` + +The backend S3 bucket is in `us-gov-east-1` while the build is in +`us-gov-west-1`. Ensure your AWS credentials allow cross-region S3 access +and that the IAM policy attached to the CodeBuild role includes the state +bucket ARNs in [variables.tf](variables.tf) (`tfstate_bucket_arns`). + +### Build times out before `terraform apply` finishes + +Increase `build_timeout` in [main.tf](main.tf) (currently 60 minutes): + +```hcl +module "ghe_runner_cronjob" { + ... + build_timeout = 90 +} +``` diff --git a/codebuild/backend-configs/csvd-common-ew.tf b/codebuild/backend-configs/csvd-common-ew.tf new file mode 100644 index 0000000..65ef7bb --- /dev/null +++ b/codebuild/backend-configs/csvd-common-ew.tf @@ -0,0 +1,4 @@ +bucket = "inf-tfstate-220615867784" +key = "csvd-common-ew/common/apps/ghe-runner-codebuild" +region = "us-gov-east-1" +dynamodb_table = "tf_remote_state" diff --git a/codebuild/backend-configs/csvd-dev-ew.tf b/codebuild/backend-configs/csvd-dev-ew.tf new file mode 100644 index 0000000..8920ca5 --- /dev/null +++ b/codebuild/backend-configs/csvd-dev-ew.tf @@ -0,0 +1,4 @@ +bucket = "inf-tfstate-229685449397" +key = "csvd-dev-gov/common/apps/ghe-runner-codebuild" +region = "us-gov-east-1" +dynamodb_table = "tf_remote_state" diff --git a/codebuild/buildspec.yml b/codebuild/buildspec.yml new file mode 100644 index 0000000..0beb0a5 --- /dev/null +++ b/codebuild/buildspec.yml @@ -0,0 +1,134 @@ +version: 0.2 + +# ───────────────────────────────────────────────────────────────────────────── +# ghe-runner daily rebuild +# +# Runs `tf apply` against the ghe-runner workspace identified by $TF_WORKSPACE, +# then forces a new ECS deployment so runners pick up any updated task +# definition, and immediately invokes the token-refresh Lambda so a fresh +# registration token is available without waiting for the 30-min EventBridge +# schedule. 
+# +# The `tf` script is downloaded from the team gist at install time and used +# throughout — it automatically handles TF_DATA_DIR, var-file injection, and +# workspace-specific JSON env loading, so none of that needs to be replicated +# manually here. +# +# Environment variables injected by CodeBuild (see codebuild/main.tf): +# TF_WORKSPACE - ghe-runner workspace (e.g. "default", "csvd") +# TF_IN_AUTOMATION - set to "true" to suppress interactive prompts +# GITHUB_TOKEN - GitHub PAT from Secrets Manager +# HTTP_PROXY / HTTPS_PROXY / NO_PROXY - outbound proxy config +# ───────────────────────────────────────────────────────────────────────────── + +env: + variables: + TF_VERSION: "1.9.8" + TF_GIST_URL: "https://github.e.it.census.gov/gist/arnol377/21b70dd6790d2680a119a9f86369eced/raw/tf" + +phases: + + install: + on-failure: ABORT + commands: + # ── Install terraform ──────────────────────────────────────────────── + - echo "Installing terraform ${TF_VERSION}..." + - curl -sLo /tmp/tf.zip + "https://releases.hashicorp.com/terraform/${TF_VERSION}/terraform_${TF_VERSION}_linux_amd64.zip" + - unzip -q /tmp/tf.zip -d /usr/local/bin && rm /tmp/tf.zip + - terraform version + + # ── Install tf wrapper script from team gist ───────────────────────── + # GITHUB_TOKEN is already in the environment (injected from Secrets Manager). + - echo "Downloading tf script from gist..." + - curl -sLo /usr/local/bin/tf + -H "Authorization: token $GITHUB_TOKEN" + "$TF_GIST_URL" + - chmod +x /usr/local/bin/tf + - tf --version + + # ── Python already available in standard:7.0 ───────────────────────── + - pip3 install --quiet requests + + pre_build: + on-failure: ABORT + commands: + # ── Map GitHub PAT to Terraform variable ───────────────────────────── + # GITHUB_TOKEN is read automatically by the GitHub provider. + # TF_VAR_github_token satisfies the var.github_token Lambda variable. 
+ - export TF_VAR_github_token="$GITHUB_TOKEN" + + # ── terraform init ─────────────────────────────────────────────────── + # Use a workspace-specific backend-config if one exists; otherwise fall + # back to the backend.tf already committed to the repo root. + # tf passes init straight through to terraform so -backend-config works. + - | + BACKEND_CONFIG="backend-configs/${TF_WORKSPACE}.tf" + if [ -f "$BACKEND_CONFIG" ]; then + echo "Initializing with backend-config: $BACKEND_CONFIG" + tf init -input=false -backend-config="$BACKEND_CONFIG" + else + echo "No workspace-specific backend config found; using backend.tf" + tf init -input=false + fi + + # ── Select (or create) workspace ───────────────────────────────────── + # tf workspace select also sets up TF_DATA_DIR, injects the var-file via + # TF_CLI_ARGS_*, and loads varfiles/${TF_WORKSPACE}.json into the env. + - tf workspace select "$TF_WORKSPACE" + - echo "Active workspace: $(tf workspace show)" + + build: + on-failure: ABORT + commands: + # ── Apply ───────────────────────────────────────────────────────────── + - echo "Running tf apply for workspace: $TF_WORKSPACE" + - tf apply -auto-approve -input=false + + # ── Token refresh before redeploy ──────────────────────────────────── + # Invoke the Lambda first so a fresh registration token is already in + # Secrets Manager before any new container starts. If the ECS force- + # deploy happened first, containers could start with a stale token and + # fail to register with GitHub. 
+      - |
+        LAMBDA_ARN=$(tf output -raw lambda_token_refresh_arn 2>/dev/null || true)
+        AWS_REGION="${AWS_REGION:-us-gov-west-1}"  # CodeBuild always exports AWS_REGION; fallback is belt-and-braces
+
+        if [ -n "$LAMBDA_ARN" ]; then
+          echo "Invoking token refresh Lambda: $LAMBDA_ARN"
+          aws lambda invoke \
+            --function-name "$LAMBDA_ARN" \
+            --region "$AWS_REGION" \
+            --log-type Tail \
+            /tmp/lambda-response.json
+          echo "Lambda response:"
+          cat /tmp/lambda-response.json
+        else
+          echo "Lambda not deployed or output not found; skipping token refresh."
+        fi
+
+      # ── Force new ECS deployment ────────────────────────────────────────
+      # Cycles the runner tasks so they pick up the updated task definition,
+      # refreshed image digest, or any changed environment. Runs after the
+      # token refresh so containers start with a guaranteed-fresh token.
+      - |
+        AWS_REGION="${AWS_REGION:-us-gov-west-1}"  # region of the build itself, not a hard-coded default
+        CLUSTER=$(tf output -raw ecs_cluster_name 2>/dev/null || true)
+        SERVICE=$(tf output -raw github_runner_service_name 2>/dev/null || true)
+
+        if [ -n "$CLUSTER" ] && [ -n "$SERVICE" ]; then
+          echo "Forcing new ECS deployment: cluster=$CLUSTER service=$SERVICE"
+          aws ecs update-service \
+            --cluster "$CLUSTER" \
+            --service "$SERVICE" \
+            --force-new-deployment \
+            --region "$AWS_REGION"
+        else
+          echo "WARNING: Could not read ECS cluster/service from tf outputs; skipping force-deploy."
+ fi + + post_build: + commands: + - echo "Build completed at $(date -u '+%Y-%m-%dT%H:%M:%SZ')" + - echo "Workspace: $TF_WORKSPACE" + - echo "Terraform workspace: $(tf workspace show 2>/dev/null || echo unknown)" diff --git a/codebuild/main.tf b/codebuild/main.tf new file mode 100644 index 0000000..389ea1d --- /dev/null +++ b/codebuild/main.tf @@ -0,0 +1,331 @@ +locals { + name = "ghe-runner-daily-${var.tf_workspace}" + + # Proxy env vars required for all outbound traffic in this environment + proxy_env = [ + { name = "HTTP_PROXY", value = "http://proxy.tco.census.gov:3128", type = "PLAINTEXT" }, + { name = "HTTPS_PROXY", value = "http://proxy.tco.census.gov:3128", type = "PLAINTEXT" }, + { name = "NO_PROXY", value = "169.254.170.2,.census.gov,169.254.169.254,10.0.0.0/8,172.16.0.0/12,.amazonaws.com,169.254.169.254", type = "PLAINTEXT" }, + ] + + # Non-secret environment variables: workspace selection and automation flag + job_env = [ + { name = "TF_WORKSPACE", value = var.tf_workspace, type = "PLAINTEXT" }, + { name = "TF_IN_AUTOMATION", value = "true", type = "PLAINTEXT" }, + ] + + # GitHub PAT injected from Secrets Manager. + # GITHUB_TOKEN is used by: + # 1. The Terraform GitHub provider (reads it automatically) + # 2. The buildspec which maps it to TF_VAR_github_token + secret_env = [ + { + name = "GITHUB_TOKEN" + value = "${var.github_token_secret_arn}:token" + type = "SECRETS_MANAGER" + }, + ] + + all_env_vars = concat(local.proxy_env, local.job_env, local.secret_env) +} + +# ── IAM permissions for the CodeBuild job ───────────────────────────────────── +# Everything `tf apply` needs to reconcile the ghe-runner workspace. 
+ +data "aws_iam_policy_document" "ghe_runner_deploy" { + + # ── Secrets Manager ────────────────────────────────────────────────────── + # Read the GitHub PAT at build start; CRUD runner registration tokens + statement { + sid = "SecretsManagerGitHubToken" + effect = "Allow" + actions = [ + "secretsmanager:GetSecretValue", + "secretsmanager:DescribeSecret", + ] + resources = [var.github_token_secret_arn] + } + + statement { + sid = "SecretsManagerRunnerTokens" + effect = "Allow" + actions = [ + "secretsmanager:CreateSecret", + "secretsmanager:DescribeSecret", + "secretsmanager:GetSecretValue", + "secretsmanager:PutSecretValue", + "secretsmanager:UpdateSecret", + "secretsmanager:DeleteSecret", + "secretsmanager:TagResource", + ] + # The ghe-runner module stores tokens under this prefix + resources = [ + "arn:${data.aws_partition.current.partition}:secretsmanager:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:secret:/github-runners/*", + ] + } + + # ── Terraform remote state ──────────────────────────────────────────────── + statement { + sid = "TfStateS3" + effect = "Allow" + actions = [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket", + "s3:GetBucketVersioning", + ] + resources = concat( + var.tfstate_bucket_arns, + [for arn in var.tfstate_bucket_arns : "${arn}/*"], + ) + } + + statement { + sid = "TfStateDynamoDB" + effect = "Allow" + actions = [ + "dynamodb:GetItem", + "dynamodb:PutItem", + "dynamodb:DeleteItem", + "dynamodb:DescribeTable", + ] + resources = var.tfstate_dynamodb_arns + } + + # ── ECS ─────────────────────────────────────────────────────────────────── + statement { + sid = "ECS" + effect = "Allow" + actions = [ + "ecs:CreateCluster", + "ecs:DeleteCluster", + "ecs:DescribeClusters", + "ecs:CreateService", + "ecs:DeleteService", + "ecs:DescribeServices", + "ecs:UpdateService", + "ecs:RegisterTaskDefinition", + "ecs:DeregisterTaskDefinition", + "ecs:DescribeTaskDefinition", + 
"ecs:ListServices", + "ecs:ListTaskDefinitions", + "ecs:PutClusterCapacityProviders", + "ecs:TagResource", + "ecs:UntagResource", + ] + resources = ["*"] + } + + # ── Lambda (token refresh function) ─────────────────────────────────────── + statement { + sid = "Lambda" + effect = "Allow" + actions = [ + "lambda:CreateFunction", + "lambda:DeleteFunction", + "lambda:GetFunction", + "lambda:GetFunctionConfiguration", + "lambda:UpdateFunctionCode", + "lambda:UpdateFunctionConfiguration", + "lambda:AddPermission", + "lambda:RemovePermission", + "lambda:InvokeFunction", + "lambda:TagResource", + "lambda:UntagResource", + "lambda:ListVersionsByFunction", + "lambda:PublishVersion", + ] + resources = [ + "arn:${data.aws_partition.current.partition}:lambda:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:function:github-runner-token-refresh-*", + ] + } + + # ── EventBridge (Lambda schedule + monitoring alarms) ────────────────── + statement { + sid = "EventBridge" + effect = "Allow" + actions = [ + "events:PutRule", + "events:DeleteRule", + "events:DescribeRule", + "events:PutTargets", + "events:RemoveTargets", + "events:ListTargetsByRule", + "events:TagResource", + ] + resources = ["*"] + } + + # ── CloudWatch Logs (runner log group) ──────────────────────────────────── + statement { + sid = "CloudWatchLogs" + effect = "Allow" + actions = [ + "logs:CreateLogGroup", + "logs:DeleteLogGroup", + "logs:DescribeLogGroups", + "logs:PutRetentionPolicy", + "logs:TagLogGroup", + "logs:ListTagsLogGroup", + "logs:ListTagsForResource", + "logs:TagResource", + ] + resources = ["*"] + } + + # ── CloudWatch Alarms + Dashboard (monitoring.tf) ───────────────────────── + statement { + sid = "CloudWatchMonitoring" + effect = "Allow" + actions = [ + "cloudwatch:PutMetricAlarm", + "cloudwatch:DeleteAlarms", + "cloudwatch:DescribeAlarms", + "cloudwatch:PutDashboard", + "cloudwatch:DeleteDashboards", + "cloudwatch:GetDashboard", + "cloudwatch:TagResource", + ] + 
resources = ["*"] + } + + # ── SNS (alert topic) ───────────────────────────────────────────────────── + statement { + sid = "SNS" + effect = "Allow" + actions = [ + "sns:CreateTopic", + "sns:DeleteTopic", + "sns:GetTopicAttributes", + "sns:SetTopicAttributes", + "sns:Subscribe", + "sns:Unsubscribe", + "sns:TagResource", + ] + resources = [ + "arn:${data.aws_partition.current.partition}:sns:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:github-runner-*", + ] + } + + # ── IAM (roles/policies created by the ghe-runner module) ───────────────── + statement { + sid = "IAMRoleManagement" + effect = "Allow" + actions = [ + "iam:CreateRole", + "iam:DeleteRole", + "iam:GetRole", + "iam:TagRole", + "iam:UntagRole", + "iam:UpdateRole", + "iam:PutRolePolicy", + "iam:GetRolePolicy", + "iam:DeleteRolePolicy", + "iam:AttachRolePolicy", + "iam:DetachRolePolicy", + "iam:ListAttachedRolePolicies", + "iam:ListRolePolicies", + "iam:PassRole", + ] + resources = [ + "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:role/*github*", + "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:role/*ghe*", + ] + } + + statement { + sid = "IAMPolicyManagement" + effect = "Allow" + actions = [ + "iam:CreatePolicy", + "iam:DeletePolicy", + "iam:GetPolicy", + "iam:GetPolicyVersion", + "iam:ListPolicyVersions", + "iam:CreatePolicyVersion", + "iam:DeletePolicyVersion", + "iam:TagPolicy", + ] + resources = [ + "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:policy/*", + ] + } + + # ── EC2 / VPC (describe only — we do not create VPC resources) ─────────── + statement { + sid = "EC2Describe" + effect = "Allow" + actions = [ + "ec2:DescribeVpcs", + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups", + "ec2:DescribeVpcEndpoints", + "ec2:DescribeRouteTables", + ] + resources = ["*"] + } + + # ── S3 (cert bucket read, ECR clone if 
enabled) ─────────────────────────── + statement { + sid = "S3CertBucket" + effect = "Allow" + actions = [ + "s3:GetObject", + "s3:ListBucket", + ] + resources = [ + "arn:${data.aws_partition.current.partition}:s3:::csvd-dev-ew-github-actions", + "arn:${data.aws_partition.current.partition}:s3:::csvd-dev-ew-github-actions/*", + ] + } +} + +# ── Module call ─────────────────────────────────────────────────────────────── + +module "ghe_runner_cronjob" { + source = "./modules/terraform-codebuild-cronjob" + + name = local.name + description = "Daily terraform apply for the ghe-runner ECS Fargate GitHub Actions runners (workspace: ${var.tf_workspace})" + + # Schedule + schedule_expression = var.schedule_expression + + # Source: the GHE repo. + # NOTE: Before applying, register GHE credentials with CodeBuild once: + # aws codebuild import-source-credentials \ + # --server-type GITHUB_ENTERPRISE \ + # --auth-type PERSONAL_ACCESS_TOKEN \ + # --token $GITHUB_TOKEN \ + # --region us-gov-west-1 + source_type = "GITHUB_ENTERPRISE" + source_location = var.source_location + source_version = var.source_version + insecure_ssl = true # GHE uses an internal CA not trusted by the CodeBuild image + + # Use the buildspec file that lives alongside this code in the repo. + # CodeBuild checks out to CODEBUILD_SRC_DIR (repo root), so paths in the + # buildspec are relative to the repo root, not to codebuild/. 
+ buildspec_path = "codebuild/buildspec.yml" + + # Compute + compute_type = "BUILD_GENERAL1_SMALL" + build_image = "aws/codebuild/standard:7.0" + + # Environment + environment_variables = local.all_env_vars + + # VPC — same placement as the runner tasks + vpc_id = var.vpc_id + subnet_ids = var.subnet_ids + security_group_ids = var.security_group_ids + + # Permissions + additional_policy_json = data.aws_iam_policy_document.ghe_runner_deploy.json + + tags = { + Purpose = "ghe-runner-daily-deploy" + TFWorkspace = var.tf_workspace + } +} diff --git a/codebuild/modules/terraform-codebuild-cronjob/iam.tf b/codebuild/modules/terraform-codebuild-cronjob/iam.tf new file mode 100644 index 0000000..3131d38 --- /dev/null +++ b/codebuild/modules/terraform-codebuild-cronjob/iam.tf @@ -0,0 +1,129 @@ +# ── CodeBuild service role ──────────────────────────────────────────────────── + +data "aws_iam_policy_document" "codebuild_assume" { + statement { + actions = ["sts:AssumeRole"] + principals { + type = "Service" + identifiers = ["codebuild.amazonaws.com"] + } + } +} + +resource "aws_iam_role" "codebuild" { + name = "${var.name}-codebuild-role" + assume_role_policy = data.aws_iam_policy_document.codebuild_assume.json + tags = var.tags +} + +# Base permissions every CodeBuild project needs: +# - Write logs to CloudWatch +# - Create/manage ENIs when running inside a VPC +# - Describe EC2 resources required by the VPC-attached build environment +data "aws_iam_policy_document" "codebuild_base" { + # CloudWatch Logs + statement { + sid = "CloudWatchLogs" + effect = "Allow" + actions = [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents", + ] + resources = [ + "arn:${data.aws_partition.current.partition}:logs:*:*:log-group:/codebuild/${var.name}", + "arn:${data.aws_partition.current.partition}:logs:*:*:log-group:/codebuild/${var.name}:*", + ] + } + + # VPC networking — only needed when the build runs in a VPC, but harmless + # to include unconditionally; actions are 
no-ops without a VPC ENI. + statement { + sid = "VpcNetworking" + effect = "Allow" + actions = [ + "ec2:CreateNetworkInterface", + "ec2:DescribeNetworkInterfaces", + "ec2:DeleteNetworkInterface", + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups", + "ec2:DescribeVpcs", + "ec2:DescribeDhcpOptions", + ] + resources = ["*"] + } + + statement { + sid = "VpcNetworkInterfacePermission" + effect = "Allow" + actions = [ + "ec2:CreateNetworkInterfacePermission", + ] + resources = ["*"] + condition { + test = "StringEquals" + variable = "ec2:AuthorizedService" + values = ["codebuild.amazonaws.com"] + } + } +} + +resource "aws_iam_role_policy" "codebuild_base" { + name = "${var.name}-base" + role = aws_iam_role.codebuild.id + policy = data.aws_iam_policy_document.codebuild_base.json +} + +# Caller-supplied additional inline policy (job-specific permissions) +resource "aws_iam_role_policy" "codebuild_additional" { + count = var.additional_policy_json != null ? 1 : 0 + name = "${var.name}-additional" + role = aws_iam_role.codebuild.id + policy = var.additional_policy_json +} + +# Caller-supplied managed policy attachments +resource "aws_iam_role_policy_attachment" "codebuild_additional" { + for_each = toset(var.additional_policy_arns) + role = aws_iam_role.codebuild.name + policy_arn = each.value +} + +# ── EventBridge role (triggers CodeBuild) ───────────────────────────────────── + +data "aws_iam_policy_document" "events_assume" { + statement { + actions = ["sts:AssumeRole"] + principals { + type = "Service" + identifiers = ["events.amazonaws.com"] + } + } +} + +resource "aws_iam_role" "events" { + name = "${var.name}-events-role" + assume_role_policy = data.aws_iam_policy_document.events_assume.json + tags = var.tags +} + +data "aws_iam_policy_document" "events_codebuild" { + statement { + sid = "StartBuild" + effect = "Allow" + actions = ["codebuild:StartBuild"] + resources = [ + aws_codebuild_project.this.arn, + ] + } +} + +resource "aws_iam_role_policy" 
"events_codebuild" { + name = "${var.name}-start-build" + role = aws_iam_role.events.id + policy = data.aws_iam_policy_document.events_codebuild.json +} + +# ── Data sources used within the module ─────────────────────────────────────── + +data "aws_partition" "current" {} diff --git a/codebuild/modules/terraform-codebuild-cronjob/main.tf b/codebuild/modules/terraform-codebuild-cronjob/main.tf new file mode 100644 index 0000000..596f100 --- /dev/null +++ b/codebuild/modules/terraform-codebuild-cronjob/main.tf @@ -0,0 +1,93 @@ +# CodeBuild project +resource "aws_codebuild_project" "this" { + name = var.name + description = var.description + build_timeout = var.build_timeout + service_role = aws_iam_role.codebuild.arn + + # Artifacts: no persistent artifacts; build output goes to CloudWatch Logs + artifacts { + type = "NO_ARTIFACTS" + } + + # Logs: stream directly to CloudWatch + logs_config { + cloudwatch_logs { + group_name = "/codebuild/${var.name}" + stream_name = "build" + } + } + + environment { + compute_type = var.compute_type + image = var.build_image + type = var.environment_type + image_pull_credentials_type = "CODEBUILD" + privileged_mode = var.privileged_mode + + dynamic "environment_variable" { + for_each = var.environment_variables + content { + name = environment_variable.value.name + value = environment_variable.value.value + type = environment_variable.value.type + } + } + } + + source { + type = var.source_type + location = var.source_type != "NO_SOURCE" ? var.source_location : null + buildspec = var.buildspec != null ? var.buildspec : (var.buildspec_path != null ? var.buildspec_path : null) + insecure_ssl = var.source_type != "NO_SOURCE" ? var.insecure_ssl : null + # buildspec accepts either inline YAML content or a path relative to the + # source root. Setting it to var.buildspec_path tells CodeBuild to read + # the file at that path from the checked-out source. + git_clone_depth = var.source_type != "NO_SOURCE" ? 
1 : null + + dynamic "git_submodules_config" { + for_each = var.source_type != "NO_SOURCE" ? [1] : [] + content { + fetch_submodules = false + } + } + } + + source_version = var.source_type != "NO_SOURCE" ? var.source_version : null + + dynamic "vpc_config" { + for_each = var.vpc_id != null ? [1] : [] + content { + vpc_id = var.vpc_id + subnets = var.subnet_ids + security_group_ids = var.security_group_ids + } + } + + tags = merge(var.tags, { + Name = var.name + }) +} + +# ── CloudWatch Logs group for build output ──────────────────────────────────── +resource "aws_cloudwatch_log_group" "codebuild" { + name = "/codebuild/${var.name}" + retention_in_days = 90 + tags = var.tags +} + +# ── EventBridge scheduled rule ──────────────────────────────────────────────── +resource "aws_cloudwatch_event_rule" "schedule" { + name = "${var.name}-schedule" + description = "Daily schedule for ${var.name}" + schedule_expression = var.schedule_expression + state = var.schedule_enabled ? "ENABLED" : "DISABLED" + tags = var.tags +} + +resource "aws_cloudwatch_event_target" "codebuild" { + rule = aws_cloudwatch_event_rule.schedule.name + target_id = "${var.name}-codebuild" + arn = aws_codebuild_project.this.arn + role_arn = aws_iam_role.events.arn +} diff --git a/codebuild/modules/terraform-codebuild-cronjob/outputs.tf b/codebuild/modules/terraform-codebuild-cronjob/outputs.tf new file mode 100644 index 0000000..4a5a5f3 --- /dev/null +++ b/codebuild/modules/terraform-codebuild-cronjob/outputs.tf @@ -0,0 +1,34 @@ +output "codebuild_project_name" { + description = "Name of the CodeBuild project." + value = aws_codebuild_project.this.name +} + +output "codebuild_project_arn" { + description = "ARN of the CodeBuild project." + value = aws_codebuild_project.this.arn +} + +output "codebuild_role_arn" { + description = "ARN of the IAM role used by the CodeBuild project." 
+ value = aws_iam_role.codebuild.arn +} + +output "codebuild_role_name" { + description = "Name of the IAM role used by the CodeBuild project." + value = aws_iam_role.codebuild.name +} + +output "events_role_arn" { + description = "ARN of the IAM role used by EventBridge to trigger CodeBuild." + value = aws_iam_role.events.arn +} + +output "schedule_rule_name" { + description = "Name of the EventBridge scheduled rule." + value = aws_cloudwatch_event_rule.schedule.name +} + +output "log_group_name" { + description = "CloudWatch log group name for build output." + value = aws_cloudwatch_log_group.codebuild.name +} diff --git a/codebuild/modules/terraform-codebuild-cronjob/variables.tf b/codebuild/modules/terraform-codebuild-cronjob/variables.tf new file mode 100644 index 0000000..9e172d0 --- /dev/null +++ b/codebuild/modules/terraform-codebuild-cronjob/variables.tf @@ -0,0 +1,199 @@ +# ─── Identity ───────────────────────────────────────────────────────────────── + +variable "name" { + description = "Name prefix for all resources created by this module." + type = string +} + +variable "description" { + description = "Human-readable description of what this cronjob does." + type = string + default = "" +} + +variable "tags" { + description = "Tags to apply to all resources." + type = map(string) + default = {} +} + +# ─── Schedule ───────────────────────────────────────────────────────────────── + +variable "schedule_expression" { + description = <<-EOT + EventBridge schedule expression for the job. + Examples: + "rate(1 day)" + "cron(0 11 * * ? *)" # 11:00 UTC daily + EOT + type = string +} + +variable "schedule_enabled" { + description = "Set to false to disable the EventBridge schedule without destroying it." + type = bool + default = true +} + +# ─── Source ─────────────────────────────────────────────────────────────────── + +variable "source_type" { + description = <<-EOT + CodeBuild source type. 
One of:
+      GITHUB_ENTERPRISE - internal GHE (requires import-source-credentials)
+      CODECOMMIT
+      GITHUB
+      S3
+      NO_SOURCE - use buildspec only; clone manually in build script
+  EOT
+  type    = string
+  default = "GITHUB_ENTERPRISE"
+
+  validation {
+    condition     = contains(["GITHUB_ENTERPRISE", "CODECOMMIT", "GITHUB", "S3", "NO_SOURCE"], var.source_type)
+    error_message = "source_type must be one of: GITHUB_ENTERPRISE, CODECOMMIT, GITHUB, S3, NO_SOURCE."
+  }
+}
+
+variable "source_location" {
+  description = "Clone URL of the repository. Not required when source_type = NO_SOURCE."
+  type        = string
+  default     = null
+}
+
+variable "insecure_ssl" {
+  description = <<-EOT
+    Skip TLS certificate verification when cloning the source repository.
+    Set to true when the GHE instance uses an internal CA that CodeBuild does
+    not trust (common in GovCloud / air-gapped environments).
+  EOT
+  type        = bool
+  default     = false
+}
+
+variable "source_version" {
+  description = "Branch, tag, or commit SHA to build from."
+  type        = string
+  default     = "main"
+}
+
+variable "buildspec" {
+  description = <<-EOT
+    Inline buildspec YAML content. Pass file("buildspec.yml") from the calling
+    module to keep it as a standalone file. When source_type != NO_SOURCE you
+    may also leave this null and set buildspec_path instead.
+  EOT
+  type        = string
+  default     = null
+}
+
+variable "buildspec_path" {
+  description = <<-EOT
+    Path to the buildspec file relative to the repo checkout root.
+    Used when buildspec = null and source_type != NO_SOURCE.
+    Example: "codebuild/buildspec.yml"
+  EOT
+  type        = string
+  default     = null
+
+  validation {
+    condition     = !(var.buildspec == null && var.buildspec_path == null)
+    error_message = "Provide either buildspec (inline) or buildspec_path."
+  }
+}
+
+# ─── Compute ──────────────────────────────────────────────────────────────────
+
+variable "compute_type" {
+  description = "CodeBuild compute type."
+ type = string + default = "BUILD_GENERAL1_SMALL" + + validation { + condition = contains([ + "BUILD_GENERAL1_SMALL", + "BUILD_GENERAL1_MEDIUM", + "BUILD_GENERAL1_LARGE", + "BUILD_GENERAL1_2XLARGE", + ], var.compute_type) + error_message = "Invalid compute_type." + } +} + +variable "build_image" { + description = "Docker image used for the build environment." + type = string + default = "aws/codebuild/standard:7.0" +} + +variable "environment_type" { + description = "CodeBuild environment type." + type = string + default = "LINUX_CONTAINER" +} + +variable "privileged_mode" { + description = "Enable privileged mode (required if the build itself runs Docker)." + type = bool + default = false +} + +variable "build_timeout" { + description = "Maximum build duration in minutes." + type = number + default = 60 +} + +# ─── Environment Variables ──────────────────────────────────────────────────── + +variable "environment_variables" { + description = <<-EOT + List of environment variables to inject into the build. + type can be PLAINTEXT, PARAMETER_STORE, or SECRETS_MANAGER. + For SECRETS_MANAGER, value should be the ARN (optionally with :key suffix). + EOT + type = list(object({ + name = string + value = string + type = optional(string, "PLAINTEXT") + })) + default = [] +} + +# ─── VPC ────────────────────────────────────────────────────────────────────── + +variable "vpc_id" { + description = "VPC ID for running the build inside a VPC. Required when accessing private resources." + type = string + default = null +} + +variable "subnet_ids" { + description = "Subnets for the CodeBuild VPC configuration." + type = list(string) + default = [] +} + +variable "security_group_ids" { + description = "Security groups for the CodeBuild VPC configuration." 
+ type = list(string) + default = [] +} + +# ─── IAM ────────────────────────────────────────────────────────────────────── + +variable "additional_policy_arns" { + description = "List of existing IAM policy ARNs to attach to the CodeBuild service role." + type = list(string) + default = [] +} + +variable "additional_policy_json" { + description = <<-EOT + JSON-encoded IAM policy document granting job-specific permissions. + Generate with data "aws_iam_policy_document" in the caller and pass .json. + This is where callers inject permissions like ECS, Lambda, S3, etc. + EOT + type = string + default = null +} diff --git a/codebuild/outputs.tf b/codebuild/outputs.tf new file mode 100644 index 0000000..e641f54 --- /dev/null +++ b/codebuild/outputs.tf @@ -0,0 +1,29 @@ +output "codebuild_project_name" { + description = "Name of the CodeBuild project to trigger manually or inspect in the console." + value = module.ghe_runner_cronjob.codebuild_project_name +} + +output "codebuild_project_arn" { + description = "ARN of the CodeBuild project." + value = module.ghe_runner_cronjob.codebuild_project_arn +} + +output "codebuild_role_arn" { + description = "ARN of the IAM role assumed by CodeBuild during builds." + value = module.ghe_runner_cronjob.codebuild_role_arn +} + +output "schedule_rule_name" { + description = "Name of the EventBridge rule that triggers the daily build." + value = module.ghe_runner_cronjob.schedule_rule_name +} + +output "log_group_name" { + description = "CloudWatch log group where build output is streamed." + value = module.ghe_runner_cronjob.log_group_name +} + +output "manual_trigger_command" { + description = "AWS CLI command to trigger a build manually outside the schedule." 
+ value = "aws codebuild start-build --project-name ${module.ghe_runner_cronjob.codebuild_project_name} --region ${data.aws_region.current.name}" +} diff --git a/codebuild/providers.tf b/codebuild/providers.tf new file mode 100644 index 0000000..363f8ad --- /dev/null +++ b/codebuild/providers.tf @@ -0,0 +1,35 @@ +terraform { + required_version = "~> 1.9" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.70" + } + } + + # Use the same S3 backend as the root workspace but a distinct state key. + # Initialize with the appropriate backend-config file, e.g.: + # terraform init -backend-config=../backend-configs/csvd-dev-ew.tf + backend "s3" { + key = "csvd-dev-gov/common/apps/ghe-runner-codebuild" + # bucket, region, and dynamodb_table are supplied via -backend-config + } +} + +provider "aws" { + region = var.aws_region + + default_tags { + tags = { + finops_project_name = "csvd_github_actions" + finops_project_number = "fs0000000078" + finops_project_role = "csvd_github_actions" + organization = "census:ocio:csvd" + } + } +} + +data "aws_caller_identity" "current" {} +data "aws_region" "current" {} +data "aws_partition" "current" {} diff --git a/codebuild/varfiles/default.json b/codebuild/varfiles/default.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/codebuild/varfiles/default.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/codebuild/varfiles/default.tfvars b/codebuild/varfiles/default.tfvars new file mode 100644 index 0000000..6360c0b --- /dev/null +++ b/codebuild/varfiles/default.tfvars @@ -0,0 +1,17 @@ +# Available Terminal Colors for TF_WORKSPACE_COLOR: +# 30 - Black +# 31 - Red +# 32 - Green +# 33 - Yellow +# 34 - Blue +# 35 - Magenta +# 36 - Cyan +# 37 - White (default) +# 90 - Bright Black (Gray) +# 91 - Bright Red +# 92 - Bright Green +# 93 - Bright Yellow +# 94 - Bright Blue +# 95 - Bright Magenta +# 96 - Bright Cyan +# 97 - Bright White diff --git a/codebuild/variables.tf 
b/codebuild/variables.tf new file mode 100644 index 0000000..4bf0822 --- /dev/null +++ b/codebuild/variables.tf @@ -0,0 +1,100 @@ +# ── Identity / naming ───────────────────────────────────────────────────────── + +variable "aws_region" { + description = "AWS region for all resources." + type = string + default = "us-gov-west-1" +} + +# ── ghe-runner workspace selection ─────────────────────────────────────────── +# This is the Terraform workspace inside the ghe-runner directory that the +# CodeBuild job will deploy. Stored as a plain PLAINTEXT env var on the +# CodeBuild project so it can be overridden per-execution if needed. + +variable "tf_workspace" { + description = <<-EOT + Terraform workspace in the ghe-runner root to target. + Maps to varfiles/{tf_workspace}.tfvars and the matching backend-configs entry. + Common values: "default", "csvd", "sct-engineering" + EOT + type = string + default = "default" +} + +# ── Schedule ────────────────────────────────────────────────────────────────── + +variable "schedule_expression" { + description = "EventBridge schedule expression. Defaults to 06:00 ET (11:00 UTC) daily." + type = string + default = "cron(0 11 * * ? *)" +} + +# ── GitHub ──────────────────────────────────────────────────────────────────── + +variable "github_token_secret_arn" { + description = <<-EOT + ARN of the Secrets Manager secret that holds the GitHub PAT. + The secret must have a key named "token". + Example: "arn:aws-us-gov:secretsmanager:us-gov-west-1:123456789:secret:ghe-runner/github-token-abc123" + EOT + type = string +} + +variable "source_location" { + description = "HTTPS clone URL of the ghe-runner repo on GitHub Enterprise." + type = string + default = "https://github.e.it.census.gov/CSVD/ghe-runners.git" +} + +variable "source_version" { + description = "Branch or tag to build from." 
+ type = string + default = "main" +} + +# ── VPC ────────────────────────────────────────────────────────────────────── +# Use the same network placement as the runner ECS tasks so the build can +# reach the proxy, GHE, and AWS APIs through the same paths. + +variable "vpc_id" { + description = "VPC ID for the CodeBuild environment." + type = string + default = "vpc-00576a396ec570b94" +} + +variable "subnet_ids" { + description = "Subnets for CodeBuild. Should be private subnets with proxy/NAT access." + type = list(string) + default = ["subnet-0b1992a84536c581b"] +} + +variable "security_group_ids" { + description = "Security groups for CodeBuild." + type = list(string) + default = ["sg-0641c697588b9aa6b"] +} + +# ── Terraform state backend ─────────────────────────────────────────────────── +# The CodeBuild job needs read/write access to whichever S3 bucket(s) back +# the ghe-runner state for the target workspace. + +variable "tfstate_bucket_arns" { + description = <<-EOT + ARNs of all S3 state buckets the CodeBuild role must access. + Add one entry per workspace / account combination you plan to target. + EOT + type = list(string) + default = [ + "arn:aws-us-gov:s3:::inf-tfstate-229685449397", # csvd-dev / default workspace + "arn:aws-us-gov:s3:::inf-tfstate-220615867784", # csvd-common workspace + ] +} + +variable "tfstate_dynamodb_arns" { + description = "ARNs of DynamoDB tables used for Terraform state locking." 
+ type = list(string) + default = [ + "arn:aws-us-gov:dynamodb:us-gov-east-1:229685449397:table/tf_remote_state", + "arn:aws-us-gov:dynamodb:us-gov-east-1:220615867784:table/tf_remote_state", + ] +} diff --git a/default.auto.tfvars b/default.auto.tfvars index 818c65c..f90b566 100644 --- a/default.auto.tfvars +++ b/default.auto.tfvars @@ -28,5 +28,8 @@ certs = { aws_account = "csvd-dev-ew" repo_org = "SCT-Engineering" +# Runner Configuration +desired_count = 1 + # Monitoring Configuration alert_email = "david.j.arnold.jr@census.gov" diff --git a/ecs_cluster.tf b/ecs_cluster.tf index 1025bfd..28cd7c4 100644 --- a/ecs_cluster.tf +++ b/ecs_cluster.tf @@ -9,7 +9,7 @@ resource "aws_ecs_cluster" "github-runner" { tags_all = {} setting { name = "containerInsights" - value = "disabled" + value = "enabled" } } diff --git a/lambda_token_refresh.tf b/lambda_token_refresh.tf index 76afb3f..a6ddcd8 100644 --- a/lambda_token_refresh.tf +++ b/lambda_token_refresh.tf @@ -14,10 +14,15 @@ locals { lambda_function_name = "github-runner-token-refresh-${var.aws_account}" + # Secret name pattern matches what the github-runner module creates + secret_name_prefix = "/github-runners/${lower(var.repo_org)}-${data.aws_caller_identity.current.account_id}-${data.aws_region.current.name}" + secret_arn_pattern = "arn:${data.aws_partition.current.partition}:secretsmanager:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:secret:${local.secret_name_prefix}/*" } # Install Python dependencies locally for Lambda packaging resource "null_resource" "lambda_dependencies" { + count = var.enable_lambda_token_refresh ? 
1 : 0 + triggers = { requirements = filemd5("${path.module}/lambda/requirements_pat.txt") source_code = filemd5("${path.module}/lambda/token_refresh_pat.py") @@ -36,6 +41,7 @@ resource "null_resource" "lambda_dependencies" { # Create ZIP file for Lambda deployment with dependencies data "archive_file" "token_refresh_lambda" { + count = var.enable_lambda_token_refresh ? 1 : 0 type = "zip" source_dir = "${path.module}/lambda/package" output_path = "${path.module}/lambda/token_refresh.zip" @@ -45,11 +51,12 @@ data "archive_file" "token_refresh_lambda" { # Lambda function resource "aws_lambda_function" "token_refresh" { - filename = data.archive_file.token_refresh_lambda.output_path + count = var.enable_lambda_token_refresh ? 1 : 0 + filename = data.archive_file.token_refresh_lambda[0].output_path function_name = local.lambda_function_name - role = aws_iam_role.lambda_refresh_role.arn + role = aws_iam_role.lambda_refresh_role[0].arn handler = "token_refresh.lambda_handler" - source_code_hash = data.archive_file.token_refresh_lambda.output_base64sha256 + source_code_hash = data.archive_file.token_refresh_lambda[0].output_base64sha256 runtime = "python3.11" timeout = 60 @@ -58,10 +65,16 @@ resource "aws_lambda_function" "token_refresh" { GITHUB_TOKEN = var.github_token GITHUB_ORG = var.repo_org GITHUB_URL = var.server_url - SECRET_NAME = module.github-runner.secret_name + SECRET_NAME = local.secret_name_prefix + GITHUB_IP = data.dns_a_record_set.github.addrs[0] # Primary IP for github.e.it.census.gov } } + vpc_config { + subnet_ids = var.subnets + security_group_ids = var.security_groups + } + tags = { Name = local.lambda_function_name Environment = var.aws_account @@ -71,6 +84,7 @@ resource "aws_lambda_function" "token_refresh" { # CloudWatch Event Rule - trigger every 30 minutes resource "aws_cloudwatch_event_rule" "token_refresh_schedule" { + count = var.enable_lambda_token_refresh ? 
1 : 0 name = "${local.lambda_function_name}-schedule" description = "Refresh GitHub runner registration token every 30 minutes" schedule_expression = "rate(30 minutes)" @@ -83,23 +97,26 @@ resource "aws_cloudwatch_event_rule" "token_refresh_schedule" { # CloudWatch Event Target resource "aws_cloudwatch_event_target" "token_refresh_target" { - rule = aws_cloudwatch_event_rule.token_refresh_schedule.name + count = var.enable_lambda_token_refresh ? 1 : 0 + rule = aws_cloudwatch_event_rule.token_refresh_schedule[0].name target_id = "RefreshTokenLambda" - arn = aws_lambda_function.token_refresh.arn + arn = aws_lambda_function.token_refresh[0].arn } # Allow EventBridge to invoke Lambda resource "aws_lambda_permission" "allow_eventbridge" { + count = var.enable_lambda_token_refresh ? 1 : 0 statement_id = "AllowExecutionFromEventBridge" action = "lambda:InvokeFunction" - function_name = aws_lambda_function.token_refresh.function_name + function_name = aws_lambda_function.token_refresh[0].function_name principal = "events.amazonaws.com" - source_arn = aws_cloudwatch_event_rule.token_refresh_schedule.arn + source_arn = aws_cloudwatch_event_rule.token_refresh_schedule[0].arn } # IAM Role for Lambda resource "aws_iam_role" "lambda_refresh_role" { - name = "${local.lambda_function_name}-role" + count = var.enable_lambda_token_refresh ? 1 : 0 + name = "${local.lambda_function_name}-role" assume_role_policy = jsonencode({ Version = "2012-10-17" @@ -120,8 +137,9 @@ resource "aws_iam_role" "lambda_refresh_role" { # IAM Policy for Lambda resource "aws_iam_role_policy" "lambda_refresh_policy" { - name = "token-refresh-policy" - role = aws_iam_role.lambda_refresh_role.id + count = var.enable_lambda_token_refresh ? 
1 : 0 + name = "token-refresh-policy" + role = aws_iam_role.lambda_refresh_role[0].id policy = jsonencode({ Version = "2012-10-17" @@ -133,7 +151,14 @@ resource "aws_iam_role_policy" "lambda_refresh_policy" { "secretsmanager:GetSecretValue", "secretsmanager:PutSecretValue" ] - Resource = module.github-runner.secret_arn + Resource = local.secret_arn_pattern + }, + { + Effect = "Allow" + Action = [ + "secretsmanager:ListSecrets" + ] + Resource = "*" }, { Effect = "Allow" @@ -150,6 +175,7 @@ resource "aws_iam_role_policy" "lambda_refresh_policy" { # CloudWatch Log Group for Lambda resource "aws_cloudwatch_log_group" "lambda_logs" { + count = var.enable_lambda_token_refresh ? 1 : 0 name = "/aws/lambda/${local.lambda_function_name}" retention_in_days = 7 @@ -161,6 +187,7 @@ resource "aws_cloudwatch_log_group" "lambda_logs" { # CloudWatch Alarm for Lambda failures - connected to SNS resource "aws_cloudwatch_metric_alarm" "lambda_errors" { + count = var.enable_lambda_token_refresh ? 1 : 0 alarm_name = "${local.lambda_function_name}-errors" comparison_operator = "GreaterThanThreshold" evaluation_periods = 2 @@ -176,7 +203,7 @@ resource "aws_cloudwatch_metric_alarm" "lambda_errors" { ok_actions = [aws_sns_topic.github_runner_critical_alerts.arn] dimensions = { - FunctionName = aws_lambda_function.token_refresh.function_name + FunctionName = aws_lambda_function.token_refresh[0].function_name } tags = { @@ -191,20 +218,20 @@ data "aws_partition" "current" {} # Output Lambda function details output "lambda_token_refresh_function_name" { description = "Name of the Lambda function that refreshes GitHub tokens" - value = aws_lambda_function.token_refresh.function_name + value = var.enable_lambda_token_refresh ? aws_lambda_function.token_refresh[0].function_name : null } output "lambda_token_refresh_function_arn" { description = "ARN of the Lambda function that refreshes GitHub tokens" - value = aws_lambda_function.token_refresh.arn + value = var.enable_lambda_token_refresh ? 
aws_lambda_function.token_refresh[0].arn : null } output "lambda_token_refresh_schedule" { description = "Schedule for automatic token refresh" - value = aws_cloudwatch_event_rule.token_refresh_schedule.schedule_expression + value = var.enable_lambda_token_refresh ? aws_cloudwatch_event_rule.token_refresh_schedule[0].schedule_expression : null } output "lambda_token_refresh_log_group" { description = "CloudWatch log group for Lambda function" - value = aws_cloudwatch_log_group.lambda_logs.name + value = var.enable_lambda_token_refresh ? aws_cloudwatch_log_group.lambda_logs[0].name : null } diff --git a/monitoring.tf b/monitoring.tf index c2fff76..6bbc71d 100644 --- a/monitoring.tf +++ b/monitoring.tf @@ -43,7 +43,7 @@ resource "aws_cloudwatch_metric_alarm" "runners_critical" { alarm_name = "github-runners-critical-capacity-${var.aws_account}" comparison_operator = "LessThanThreshold" evaluation_periods = 2 - metric_name = "RunningTasksCount" + metric_name = "RunningTaskCount" namespace = "ECS/ContainerInsights" period = 300 statistic = "Average" @@ -71,7 +71,7 @@ resource "aws_cloudwatch_metric_alarm" "runners_emergency" { alarm_name = "github-runners-emergency-all-down-${var.aws_account}" comparison_operator = "LessThanOrEqualToThreshold" evaluation_periods = 1 - metric_name = "RunningTasksCount" + metric_name = "RunningTaskCount" namespace = "ECS/ContainerInsights" period = 60 statistic = "Maximum" diff --git a/outputs.tf b/outputs.tf index 2028cfc..4833567 100644 --- a/outputs.tf +++ b/outputs.tf @@ -96,4 +96,9 @@ output "vpc_config" { subnets = var.subnets security_groups = var.security_groups } +} + +output "lambda_token_refresh_arn" { + description = "ARN of the token-refresh Lambda function (empty string when not deployed)." + value = var.enable_lambda_token_refresh ? 
aws_lambda_function.token_refresh[0].arn : "" } \ No newline at end of file diff --git a/varfiles/default.json b/varfiles/default.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/varfiles/default.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/varfiles/default.tfvars b/varfiles/default.tfvars new file mode 100644 index 0000000..6360c0b --- /dev/null +++ b/varfiles/default.tfvars @@ -0,0 +1,17 @@ +# Available Terminal Colors for TF_WORKSPACE_COLOR: +# 30 - Black +# 31 - Red +# 32 - Green +# 33 - Yellow +# 34 - Blue +# 35 - Magenta +# 36 - Cyan +# 37 - White (default) +# 90 - Bright Black (Gray) +# 91 - Bright Red +# 92 - Bright Green +# 93 - Bright Yellow +# 94 - Bright Blue +# 95 - Bright Magenta +# 96 - Bright Cyan +# 97 - Bright White diff --git a/variables.tf b/variables.tf index 7f2c896..164208e 100644 --- a/variables.tf +++ b/variables.tf @@ -230,3 +230,9 @@ variable "alert_email" { error_message = "Must be a valid email address" } } + +variable "enable_lambda_token_refresh" { + description = "Enable the Lambda function for automatic GitHub runner registration token refresh" + type = bool + default = false +}