diff --git a/.gitignore b/.gitignore index 3190621..2516318 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,14 @@ # Local .terraform directories **/.terraform/* +# Lambda packaging artifacts +lambda/package/ +lambda/*.zip + +# GitHub App private keys (PEM files) +*.pem +*.private-key.pem + # .tfstate files *.tfstate *.tfstate.* diff --git a/.terraform_commits b/.terraform_commits index 46ed4df..a239210 100644 --- a/.terraform_commits +++ b/.terraform_commits @@ -70,5 +70,23 @@ "commit_message": "Merge branch 'main' of github.e.it.census.gov:CSVD/ghe-runners", "author": "arnol377", "timestamp": "2025-05-20T13:49:56.129780" + }, + { + "commit_hash": "8c8dee046ff589190d067908249272fbac3c00c0", + "commit_message": "Add GitHub Actions Runner Setup Guide to README.md", + "author": "Your Name", + "timestamp": "2025-06-25T16:10:35.535914" + }, + { + "commit_hash": "8c8dee046ff589190d067908249272fbac3c00c0", + "commit_message": "Add GitHub Actions Runner Setup Guide to README.md", + "author": "Your Name", + "timestamp": "2025-09-03T15:04:24.469296" + }, + { + "commit_hash": "8c8dee046ff589190d067908249272fbac3c00c0", + "commit_message": "Add GitHub Actions Runner Setup Guide to README.md", + "author": "Your Name", + "timestamp": "2025-10-31T13:13:21.490997" } ] \ No newline at end of file diff --git a/AWS_PERMISSIONS.md b/AWS_PERMISSIONS.md new file mode 100644 index 0000000..849a014 --- /dev/null +++ b/AWS_PERMISSIONS.md @@ -0,0 +1,1280 @@ +# AWS Permissions Reference + +This document provides detailed information about the AWS IAM permissions required to deploy and operate GitHub Actions ECS Fargate runners. + +## Table of Contents + +- [Overview](#overview) +- [Deployment Permissions](#deployment-permissions) +- [Runtime Permissions](#runtime-permissions) +- [Example IAM Policies](#example-iam-policies) +- [Security Best Practices](#security-best-practices) +- [Addressing Security Concerns](#addressing-security-concerns) +- [Troubleshooting Permission Issues](#troubleshooting-permission-issues) + +## Overview + +The ghe-runner infrastructure requires three sets of IAM permissions: + +1. **Deployment Permissions**: Required by the user/role executing Terraform to create the infrastructure +2. **Task Role Permissions**: Used by running containers to execute GitHub Actions workflows +3. **Execution Role Permissions**: Used by ECS to manage container lifecycle (automatically configured) + +## Deployment Permissions + +### Required for Terraform Deployment + +The IAM user or role running `terraform apply` needs permissions to create and manage all infrastructure components. + +#### Minimum Required Services + +| Service | Actions | Purpose | +|---------|---------|---------| +| **IAM** | `iam:CreateRole`, `iam:CreatePolicy`, `iam:AttachRolePolicy`, `iam:PutRolePolicy`, `iam:GetRole`, `iam:GetPolicy`, `iam:ListAttachedRolePolicies`, `iam:DeleteRole`, `iam:DeletePolicy`, `iam:DetachRolePolicy`, `iam:DeleteRolePolicy`, `iam:TagRole`, `iam:UntagRole`, `iam:PassRole` | Create and manage IAM roles and policies for ECS tasks | +| **ECS** | `ecs:CreateCluster`, `ecs:CreateService`, `ecs:RegisterTaskDefinition`, `ecs:DescribeCluster`, `ecs:DescribeServices`, `ecs:DescribeTaskDefinition`, `ecs:UpdateService`, `ecs:DeleteService`, `ecs:DeregisterTaskDefinition`, `ecs:PutClusterCapacityProviders`, `ecs:TagResource`, `ecs:UntagResource` | Deploy and manage ECS cluster and services | +| **CloudWatch Logs** | `logs:CreateLogGroup`, `logs:DescribeLogGroups`, `logs:DeleteLogGroup`, `logs:PutRetentionPolicy`, `logs:TagLogGroup`, `logs:ListTagsLogGroup` | Create log groups for runner output | +| **Secrets Manager** | `secretsmanager:CreateSecret`, `secretsmanager:DescribeSecret`, `secretsmanager:PutSecretValue`, `secretsmanager:GetSecretValue`, `secretsmanager:DeleteSecret`, `secretsmanager:TagResource`, `secretsmanager:UpdateSecret` | Store GitHub registration tokens | +| **EC2 (VPC)** | `ec2:DescribeVpcs`, `ec2:DescribeSubnets`, `ec2:DescribeSecurityGroups`, `ec2:DescribeVpcEndpoints`, `ec2:CreateVpcEndpoint`, `ec2:DeleteVpcEndpoint`, `ec2:ModifyVpcEndpoint` | Query VPC resources and create VPC endpoints (optional) | +| **S3** | `s3:GetObject`, `s3:PutObject`, `s3:ListBucket`, `s3:GetBucketVersioning` | Access Terraform state backend | +| **DynamoDB** | `dynamodb:GetItem`, `dynamodb:PutItem`, `dynamodb:DeleteItem`, `dynamodb:DescribeTable` | Terraform state locking | + +#### Optional Services (If Enabled) + +| Service | Actions | Purpose | When Required | +|---------|---------|---------|---------------| +| **ECR** | `ecr:GetAuthorizationToken`, `ecr:CreateRepository`, `ecr:DescribeRepositories`, `ecr:PutImage`, `ecr:BatchCheckLayerAvailability`, `ecr:InitiateLayerUpload`, `ecr:UploadLayerPart`, `ecr:CompleteLayerUpload` | Mirror GitHub runner images to private ECR | When `enable_ecr_clone = true` | +| **Lambda** | `lambda:CreateFunction`, `lambda:GetFunction`, `lambda:UpdateFunctionCode`, `lambda:UpdateFunctionConfiguration`, `lambda:DeleteFunction`, `lambda:AddPermission`, `lambda:RemovePermission`, `lambda:TagResource`, `lambda:PublishVersion` | Deploy token refresh Lambda | When deploying Lambda token refresh (recommended) | +| **EventBridge** | `events:PutRule`, `events:DescribeRule`, `events:DeleteRule`, `events:PutTargets`, `events:RemoveTargets`, `events:ListTargetsByRule` | Schedule Lambda for token refresh | When deploying Lambda token refresh | + +### Example Terraform Deployer Policy + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "TerraformStateAccess", + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:ListBucket", + "s3:GetBucketVersioning" + ], + "Resource": [ + "arn:aws:s3:::your-terraform-state-bucket", + "arn:aws:s3:::your-terraform-state-bucket/*" + ] + }, + { + "Sid": "TerraformStateLocking", + "Effect": "Allow", + "Action": [ + "dynamodb:GetItem", + "dynamodb:PutItem", + "dynamodb:DeleteItem", + "dynamodb:DescribeTable" + ], + "Resource": "arn:aws:dynamodb:*:*:table/your-terraform-lock-table" + }, + { + "Sid": "ECSManagement", + "Effect": "Allow", + "Action": [ + "ecs:CreateCluster", + "ecs:CreateService", + "ecs:RegisterTaskDefinition", + "ecs:DescribeCluster", + "ecs:DescribeServices", + "ecs:DescribeTaskDefinition", + "ecs:UpdateService", + "ecs:DeleteService", + "ecs:DeregisterTaskDefinition", + "ecs:PutClusterCapacityProviders", + "ecs:TagResource", + "ecs:UntagResource", + "ecs:ListTagsForResource" + ], + "Resource": "*" + }, + { + "Sid": "IAMRoleManagement", + "Effect": "Allow", + "Action": [ + "iam:CreateRole", + "iam:CreatePolicy", + "iam:GetRole", + "iam:GetPolicy", + "iam:GetRolePolicy", + "iam:AttachRolePolicy", + "iam:DetachRolePolicy", + "iam:PutRolePolicy", + "iam:DeleteRolePolicy", + "iam:DeleteRole", + "iam:DeletePolicy", + "iam:ListAttachedRolePolicies", + "iam:ListRolePolicies", + "iam:TagRole", + "iam:UntagRole", + "iam:PassRole" + ], + "Resource": [ + "arn:aws:iam::*:role/*github-runner*", + "arn:aws:iam::*:role/*task-role", + "arn:aws:iam::*:role/*task-execution-role", + "arn:aws:iam::*:policy/*github-runner*", + "arn:aws:iam::*:policy/*-admin", + "arn:aws:iam::*:policy/*state-access" + ] + }, + { + "Sid": "CloudWatchLogs", + "Effect": "Allow", + "Action": [ + "logs:CreateLogGroup", + "logs:DescribeLogGroups", + "logs:DeleteLogGroup", + "logs:PutRetentionPolicy", + "logs:TagLogGroup", + "logs:ListTagsLogGroup" + ], + "Resource": "arn:aws:logs:*:*:log-group:/ecs-ghe-runners/*" + }, + { + "Sid": "SecretsManager", + "Effect": "Allow", + "Action": [ + "secretsmanager:CreateSecret", + "secretsmanager:DescribeSecret", + "secretsmanager:PutSecretValue", + "secretsmanager:GetSecretValue", + "secretsmanager:UpdateSecret", + "secretsmanager:DeleteSecret", + "secretsmanager:TagResource" + ], + "Resource": "arn:aws:secretsmanager:*:*:secret:/github-runners/*" + }, + { + "Sid": "VPCQuery", + "Effect": "Allow", + "Action": [ + "ec2:DescribeVpcs", + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups", + "ec2:DescribeVpcEndpoints" + ], + "Resource": "*" + }, + { + "Sid": "VPCEndpointManagement", + "Effect": "Allow", + "Action": [ + "ec2:CreateVpcEndpoint", + "ec2:DeleteVpcEndpoint", + "ec2:ModifyVpcEndpoint", + "ec2:CreateTags" + ], + "Resource": [ + "arn:aws:ec2:*:*:vpc-endpoint/*", + "arn:aws:ec2:*:*:vpc/*", + "arn:aws:ec2:*:*:subnet/*", + "arn:aws:ec2:*:*:security-group/*" + ], + "Condition": { + "StringEquals": { + "aws:RequestedRegion": ["us-gov-east-1", "us-gov-west-1"] + } + } + } + ] +} +``` + +### Additional Permissions for Lambda Token Refresh + +If deploying the Lambda function for automated token refresh, add these permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "LambdaManagement", + "Effect": "Allow", + "Action": [ + "lambda:CreateFunction", + "lambda:GetFunction", + "lambda:UpdateFunctionCode", + "lambda:UpdateFunctionConfiguration", + "lambda:DeleteFunction", + "lambda:AddPermission", + "lambda:RemovePermission", + "lambda:TagResource", + "lambda:PublishVersion", + "lambda:ListVersionsByFunction" + ], + "Resource": "arn:aws:lambda:*:*:function:github-runner-token-refresh-*" + }, + { + "Sid": "EventBridgeManagement", + "Effect": "Allow", + "Action": [ + "events:PutRule", + "events:DescribeRule", + "events:DeleteRule", + "events:PutTargets", + "events:RemoveTargets", + "events:ListTargetsByRule" + ], + "Resource": "arn:aws:events:*:*:rule/github-runner-token-refresh-*" + }, + { + "Sid": "IAMPassRoleForLambda", + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "arn:aws:iam::*:role/github-runner-token-refresh-*", + "Condition": { + "StringEquals": { + "iam:PassedToService": "lambda.amazonaws.com" + } + } + } + ] +} +``` + +## Runtime Permissions + +### Understanding Task Role Permissions + +GitHub Actions runners executing Infrastructure-as-Code (IaC) workflows need permissions to manage the full lifecycle of AWS resources. Unlike traditional application workloads with narrow, predictable access patterns, IaC workflows by design create, modify, and destroy diverse AWS resources across accounts. + +**Why Broad Permissions Are Required:** + +1. **Dynamic Infrastructure Provisioning**: Terraform and other IaC tools must create any AWS resource type on demand (EC2, RDS, Lambda, VPC, IAM, etc.) +2. **Multi-Service Dependencies**: Infrastructure changes often span multiple AWS services simultaneously (e.g., creating an RDS instance requires VPC, security groups, subnets, IAM roles) +3. **State Management**: IaC tools require read access to existing resources to maintain state and prevent drift +4. **Destructive Operations**: Proper infrastructure lifecycle management requires delete permissions for cleanup and updates + +**Security Model:** + +The security boundary for IaC runners is **not at the IAM permission level**, but rather through: + +1. **Repository Access Control**: Only authorized personnel can commit/merge code to infrastructure repositories +2. **Code Review Requirements**: Pull request reviews enforce peer review of infrastructure changes +3. **GitHub Branch Protection**: Main branches require approvals and status checks before merging +4. **Audit Logging**: All AWS API calls are logged via CloudTrail with runner identity attribution +5. **Network Isolation**: Runners operate in private subnets with controlled egress +6. **Separation of Concerns**: Distinct runner groups per account prevent cross-account access + +This model is consistent with industry best practices for IaC automation platforms (Terraform Cloud, Spacelift, Atlantis, etc.). + +### ECS Task Role Permissions + +The ECS Task Role is assumed by the running GitHub Actions container and grants permissions to your workflows. + +#### Default Configuration + +By default, the module creates an **admin policy** (`iam_policy/admin.json`) that grants full AWS access: + +```json +{ + "Statement": [ + { + "Action": "*", + "Effect": "Allow", + "Resource": "*" + } + ], + "Version": "2012-10-17" +} +``` + +**Security Justification:** + +This administrative policy is required because: +- Runners execute Terraform workflows that can create/modify/delete **any AWS resource type** +- Infrastructure requirements evolve over time; pre-defining a restricted policy would require constant updates and block legitimate operations +- The actual security controls are enforced at the **repository access** and **code review** layers +- Each AWS account has dedicated runners (account isolation prevents cross-account privilege escalation) + +**Risk Mitigation Controls:** + +| Control | Implementation | Purpose | +|---------|----------------|---------| +| **Repository Access** | GitHub team/user permissions | Limit who can modify infrastructure code | +| **Branch Protection** | Required reviews, status checks | Prevent unauthorized changes from being deployed | +| **Code Review** | Pull request approval workflow | Human verification of all infrastructure changes | +| **CloudTrail Logging** | All API calls logged with runner identity | Full audit trail for compliance and forensics | +| **Runner Group Isolation** | One runner group per AWS account | Prevent runners from accessing other accounts | +| **Network Isolation** | Private subnets, no public IPs | Limit attack surface | +| **Immutable Infrastructure** | Runners are ephemeral, recreated frequently | Limit persistence of compromise | +| **Session Tagging** | IAM session tags identify workflow/repo | Granular attribution in CloudTrail | + +#### Automatically Attached Permissions + +The module automatically attaches additional policies to the task role: + +**1. Secrets Manager Access** (for GitHub token): +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "secretsmanager:GetSecretValue", + "Resource": "arn:aws:secretsmanager:*:*:secret:/github-runners/{namespace}/*" + } + ] +} +``` + +**2. S3 Certificate Access** (if `certs` variable is set): +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["s3:GetObject", "s3:ListBucket"], + "Resource": [ + "arn:aws:s3:::{certs_bucket}", + "arn:aws:s3:::{certs_bucket}/*" + ] + } + ] +} +``` + +#### Recommended Least-Privilege Policy + +For Terraform-based workflows, use this minimal policy instead: + +**⚠️ Important Note**: The policy below is an **example for limited use cases** (e.g., read-only infrastructure auditing or simple EC2 management). Most production IaC workflows require broader permissions to manage the full infrastructure lifecycle. + +**When to use restricted policies:** +- Non-production accounts with limited scope +- Specific single-purpose workflows (e.g., "only deploy Lambda functions") +- Read-only or compliance scanning workflows +- Development/learning environments + +**When the admin policy is appropriate:** +- Production infrastructure accounts +- Multi-service infrastructure deployments +- Dynamic infrastructure provisioning +- Accounts managed entirely through Terraform + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "TerraformStateAccess", + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::terraform-state-bucket", + "arn:aws:s3:::terraform-state-bucket/*" + ] + }, + { + "Sid": "TerraformStateLocking", + "Effect": "Allow", + "Action": [ + "dynamodb:GetItem", + "dynamodb:PutItem", + "dynamodb:DeleteItem" + ], + "Resource": "arn:aws:dynamodb:*:*:table/terraform-lock-table" + }, + { + "Sid": "EC2ReadOnlyForTerraform", + "Effect": "Allow", + "Action": [ + "ec2:Describe*", + "ec2:Get*" + ], + "Resource": "*" + }, + { + "Sid": "EC2ManagementForTerraform", + "Effect": "Allow", + "Action": [ + "ec2:CreateTags", + "ec2:DeleteTags", + "ec2:RunInstances", + "ec2:TerminateInstances", + "ec2:StopInstances", + "ec2:StartInstances", + "ec2:CreateSecurityGroup", + "ec2:DeleteSecurityGroup", + "ec2:AuthorizeSecurityGroupIngress", + "ec2:AuthorizeSecurityGroupEgress", + "ec2:RevokeSecurityGroupIngress", + "ec2:RevokeSecurityGroupEgress" + ], + "Resource": "*", + "Condition": { + "StringEquals": { + "aws:RequestedRegion": ["us-gov-east-1", "us-gov-west-1"] + } + } + }, + { + "Sid": "IAMReadOnly", + "Effect": "Allow", + "Action": [ + "iam:GetRole", + "iam:GetPolicy", + "iam:GetPolicyVersion", + "iam:ListAttachedRolePolicies", + "iam:ListRolePolicies", + "iam:ListPolicies" + ], + "Resource": "*" + } + ] +} +``` + +**To use a custom policy:** + +1. Create the IAM policy in your AWS account +2. Reference it in your `.tfvars`: + ```hcl + github_runner_permissions_arn = "arn:aws:iam::{account_id}:policy/custom-runner-policy" + ``` + +### ECS Execution Role Permissions + +The ECS Execution Role is used by the ECS service to start and manage containers. This role is **automatically configured** by the module and uses the AWS-managed policy: + +**Managed Policy ARN:** +``` +arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy +``` + +**Permissions Provided:** +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "ecr:GetAuthorizationToken", + "ecr:BatchCheckLayerAvailability", + "ecr:GetDownloadUrlForLayer", + "ecr:BatchGetImage", + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + "Resource": "*" + } + ] +} +``` + +**When to Modify:** +- **Cross-Account ECR Access**: If pulling images from ECR in a different AWS account +- **Custom Container Registries**: If using registries other than public ECR +- **Enhanced Logging**: If using FireLens or other advanced logging configurations + +### Lambda Token Refresh Role Permissions + +If using the Lambda function for automated token refresh, it requires: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "SecretsManagerUpdate", + "Effect": "Allow", + "Action": [ + "secretsmanager:UpdateSecret", + "secretsmanager:PutSecretValue", + "secretsmanager:GetSecretValue" + ], + "Resource": "arn:aws:secretsmanager:*:*:secret:/github-runners/*" + }, + { + "Sid": "CloudWatchLogsForLambda", + "Effect": "Allow", + "Action": [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + "Resource": "arn:aws:logs:*:*:log-group:/aws/lambda/github-runner-token-refresh-*" + } + ] +} +``` + +## Example IAM Policies + +### Complete Terraform Deployer Role + +Attach this policy to the IAM user or role that will run `terraform apply`: + +**Policy Name:** `GitHubRunnerTerraformDeployer` + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "TerraformBackend", + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:ListBucket", + "s3:GetBucketVersioning", + "dynamodb:GetItem", + "dynamodb:PutItem", + "dynamodb:DeleteItem", + "dynamodb:DescribeTable" + ], + "Resource": [ + "arn:aws:s3:::your-terraform-state-bucket", + "arn:aws:s3:::your-terraform-state-bucket/*", + "arn:aws:dynamodb:*:*:table/your-terraform-lock-table" + ] + }, + { + "Sid": "ECSFullManagement", + "Effect": "Allow", + "Action": [ + "ecs:*" + ], + "Resource": "*" + }, + { + "Sid": "IAMRoleAndPolicyManagement", + "Effect": "Allow", + "Action": [ + "iam:CreateRole", + "iam:CreatePolicy", + "iam:GetRole", + "iam:GetPolicy", + "iam:GetRolePolicy", + "iam:GetPolicyVersion", + "iam:AttachRolePolicy", + "iam:DetachRolePolicy", + "iam:PutRolePolicy", + "iam:DeleteRolePolicy", + "iam:DeleteRole", + "iam:DeletePolicy", + "iam:ListAttachedRolePolicies", + "iam:ListRolePolicies", + "iam:TagRole", + "iam:UntagRole", + "iam:PassRole" + ], + "Resource": "*" + }, + { + "Sid": "CloudWatchLogsManagement", + "Effect": "Allow", + "Action": [ + "logs:CreateLogGroup", + "logs:DescribeLogGroups", + "logs:DeleteLogGroup", + "logs:PutRetentionPolicy", + "logs:TagLogGroup", + "logs:ListTagsLogGroup" + ], + "Resource": "*" + }, + { + "Sid": "SecretsManagerManagement", + "Effect": "Allow", + "Action": [ + "secretsmanager:*" + ], + "Resource": "arn:aws:secretsmanager:*:*:secret:/github-runners/*" + }, + { + "Sid": "VPCAndNetworking", + "Effect": "Allow", + "Action": [ + "ec2:Describe*", + "ec2:CreateVpcEndpoint", + "ec2:DeleteVpcEndpoint", + "ec2:ModifyVpcEndpoint", + "ec2:CreateTags" + ], + "Resource": "*" + }, + { + "Sid": "LambdaAndEventBridge", + "Effect": "Allow", + "Action": [ + "lambda:*", + "events:*" + ], + "Resource": [ + "arn:aws:lambda:*:*:function:github-runner-token-refresh-*", + "arn:aws:events:*:*:rule/github-runner-token-refresh-*" + ] + } + ] +} +``` + +### Production Task Role (Least Privilege) + +Replace the default admin policy with this production-ready policy: + +**Policy Name:** `GitHubRunnerTaskRole-Production` + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "TerraformStateAccess", + "Effect": "Allow", + "Action": [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::terraform-state-bucket", + "arn:aws:s3:::terraform-state-bucket/*" + ] + }, + { + "Sid": "TerraformStateLocking", + "Effect": "Allow", + "Action": [ + "dynamodb:GetItem", + "dynamodb:PutItem", + "dynamodb:DeleteItem" + ], + "Resource": "arn:aws:dynamodb:*:*:table/terraform-lock-table" + }, + { + "Sid": "EC2Management", + "Effect": "Allow", + "Action": [ + "ec2:Describe*", + "ec2:Get*", + "ec2:CreateTags", + "ec2:DeleteTags" + ], + "Resource": "*" + }, + { + "Sid": "EC2InstanceManagement", + "Effect": "Allow", + "Action": [ + "ec2:RunInstances", + "ec2:TerminateInstances", + "ec2:StopInstances", + "ec2:StartInstances", + "ec2:RebootInstances" + ], + "Resource": "*", + "Condition": { + "StringEquals": { + "ec2:ResourceTag/ManagedBy": "Terraform" + } + } + }, + { + "Sid": "VPCManagement", + "Effect": "Allow", + "Action": [ + "ec2:CreateVpc", + "ec2:DeleteVpc", + "ec2:ModifyVpcAttribute", + "ec2:CreateSubnet", + "ec2:DeleteSubnet", + "ec2:CreateRouteTable", + "ec2:DeleteRouteTable", + "ec2:CreateRoute", + "ec2:DeleteRoute", + "ec2:CreateInternetGateway", + "ec2:DeleteInternetGateway", + "ec2:AttachInternetGateway", + "ec2:DetachInternetGateway", + "ec2:CreateSecurityGroup", + "ec2:DeleteSecurityGroup", + "ec2:AuthorizeSecurityGroup*", + "ec2:RevokeSecurityGroup*" + ], + "Resource": "*" + }, + { + "Sid": "IAMReadAccess", + "Effect": "Allow", + "Action": [ + "iam:GetRole", + "iam:GetPolicy", + "iam:GetPolicyVersion", + "iam:ListAttachedRolePolicies", + "iam:ListRolePolicies" + ], + "Resource": "*" + }, + { + "Sid": "CloudWatchReadAccess", + "Effect": "Allow", + "Action": [ + "cloudwatch:GetMetricStatistics", + "cloudwatch:ListMetrics", + "logs:DescribeLogGroups", + "logs:DescribeLogStreams" + ], + "Resource": "*" + } + ] +} +``` + +## Security Best Practices + +### 1. Defense in Depth for IaC Workflows + +**The primary security controls for infrastructure automation are not IAM permissions**, but rather: + +**Repository-Level Controls (Primary Defense):** +- Restrict repository write access to infrastructure team members only +- Require pull request reviews before merging (minimum 1-2 approvers) +- Enable GitHub branch protection on main/production branches +- Use CODEOWNERS file to enforce review by specific teams +- Enable required status checks (tests, linters, security scans) + +**IAM-Level Controls (Secondary Defense):** +- Use admin policy for production accounts (necessary for IaC) +- Implement CloudTrail logging with alerts for sensitive operations +- Use separate AWS accounts for different environments (dev/staging/prod) +- Deploy dedicated runner groups per account (no cross-account access) +- Enable IAM Access Analyzer to detect unintended external access + +**Monitoring and Response (Detection):** +- CloudTrail logs all API calls with attribution to runner/workflow/repository +- Set up CloudWatch alarms for high-risk actions (IAM changes, S3 bucket policies) +- Regular access reviews of repository permissions +- Automated drift detection to identify out-of-band changes + +### 2. Alternative Approaches (Not Recommended for IaC) + +Some organizations attempt to restrict IAM permissions for IaC runners. This approach typically results in: +- Constant permission updates as infrastructure needs evolve +- Blocked deployments requiring emergency permission changes +- Shadow IT as teams work around restrictive policies +- Reduced security posture due to workarounds (long-lived credentials, overly broad service accounts) + +**Industry Standard**: Managed IaC platforms (Terraform Cloud, Spacelift, Atlantis, GitHub Actions) all use administrative permissions with repository-based access controls. + +### 2. Account Isolation Strategy + +**Best Practice**: Deploy runners with admin permissions, but isolate by AWS account. + +**Implementation:** +- **Development Account**: Runners with admin access, used for testing infrastructure changes +- **Staging Account**: Runners with admin access, mirrors production environment +- **Production Account**: Runners with admin access, requires additional approval gates +- **Shared Services Account**: Runners with limited read-only access for querying resources + +**Benefits:** +- Blast radius is limited to single account +- Developers can experiment freely in dev without production risk +- Production deployments still require human approval via pull requests +- Account boundaries are enforced by AWS (runners cannot cross accounts without explicit role assumption) + +### 3. Enforce Code Review Requirements + +**GitHub Branch Protection Rules** (Mandatory): + +```yaml +# .github/branch-protection.yml example configuration +protection_rules: + main: + required_pull_request_reviews: + required_approving_review_count: 2 + dismiss_stale_reviews: true + require_code_owner_reviews: true + required_status_checks: + strict: true + contexts: + - "terraform-validate" + - "tfsec-scan" + - "checkov-scan" + enforce_admins: true + restrictions: + users: [] + teams: ["infrastructure-team"] +``` + +**CODEOWNERS File** (Enforce team review): + +``` +# .github/CODEOWNERS +# Infrastructure changes require infrastructure team review +/terraform/ @org/infrastructure-team +/cloudformation/ @org/infrastructure-team +*.tf @org/infrastructure-team +*.tfvars @org/infrastructure-team +``` + +These controls ensure that **no infrastructure change reaches runners without human review**, making the IAM permissions less critical. + +### 4. Audit and Monitor + +- Enable CloudTrail logging for all API calls from runner tasks +- Set up CloudWatch alarms for suspicious activity +- Regularly review IAM policies and remove unused permissions +- Use AWS Access Analyzer to identify overly permissive policies + +### 5. Rotate Credentials Regularly + +- GitHub App credentials should be rotated periodically +- Use the Lambda token refresh function to prevent expired tokens +- Monitor Secrets Manager for access patterns + +### 6. Network Isolation + +- Deploy runners in private subnets without public IPs +- Use VPC endpoints to avoid internet egress +- Restrict security group rules to only necessary services +- Use the corporate proxy for external access + +### 7. CloudTrail Monitoring and Alerting + +**Enable comprehensive CloudTrail logging** to track all runner activity: + +```json +{ + "TrailName": "github-runner-audit", + "S3BucketName": "audit-logs-bucket", + "IncludeGlobalServiceEvents": true, + "IsMultiRegionTrail": true, + "EnableLogFileValidation": true, + "EventSelectors": [ + { + "ReadWriteType": "All", + "IncludeManagementEvents": true, + "DataResources": [] + } + ] +} +``` + +**Set up CloudWatch Alarms for sensitive operations:** + +- IAM policy changes +- S3 bucket policy modifications +- Security group rule changes +- RDS/EC2 instance deletions +- KMS key deletions + +**Query runner activity:** +```bash +# Find all API calls from a specific runner +aws cloudtrail lookup-events \ + --lookup-attributes AttributeKey=ResourceType,AttributeValue=AWS::IAM::Role \ + --lookup-attributes AttributeKey=ResourceName,AttributeValue=CSVD-task-role +``` + +All API calls include context about which repository, workflow, and commit triggered the action. + +### 8. Limit Runner Scope + +- Create separate runner groups for different teams or projects +- Use runner labels to target specific workflows +- Implement repository-level access controls in GitHub + +## Addressing Security Concerns + +This section provides talking points and justifications for security reviews. + +### Common Security Team Questions + +#### Q: "Why do runners need administrative AWS permissions?" + +**A: Runners execute Infrastructure-as-Code workflows that manage the entire AWS account.** + +- **Scope**: Terraform and CloudFormation workflows create/modify/delete resources across all AWS services (EC2, RDS, Lambda, VPC, IAM, S3, etc.) +- **Dynamic Nature**: Infrastructure requirements change constantly; pre-defining permissions would require updates before every deployment +- **Industry Standard**: All major IaC platforms (Terraform Cloud, Spacelift, Atlantis) use administrative permissions +- **Security Model**: The security boundary is repository access + code review, not IAM permissions (see below) + +**Comparison to competitors:** +| Platform | IAM Permissions | Security Model | +|----------|----------------|----------------| +| Terraform Cloud | Admin or custom (typically admin) | Workspace access controls | +| Spacelift | Admin or custom (typically admin) | Stack access controls | +| Atlantis | Admin (hardcoded) | Repository permissions | +| GitHub Actions (this implementation) | Admin | Repository + branch protection | + +#### Q: "Can't we use least-privilege IAM policies instead?" + +**A: For IaC workflows, this approach creates operational problems without meaningful security improvement.** + +**Operational Impact:** +- Infrastructure teams would need to update IAM policies before every new resource type deployment +- Blocks emergency infrastructure changes that need new permissions +- Creates dependency on IAM team for routine infrastructure work +- Leads to "permission sprawl" as policies grow to hundreds of actions over time + +**Security Reality:** +- A malicious actor with repository write access can simply add permissions to their Terraform code and apply them (since IAM management is core to IaC) +- The real security boundary is **who can commit to repositories**, not what permissions runners have +- Restrictive IAM policies create a false sense of security while harming productivity + +**When restrictive policies make sense:** +- Non-IaC workflows (application deployments, data processing) +- Read-only compliance/auditing runners +- Single-purpose workflows with well-defined scope + +#### Q: "What prevents a compromised runner from damaging the account?" + +**A: Multiple layers of defense prevent compromise and limit damage.** + +**1. Repository Access Control (Primary Defense):** +- Only infrastructure team members can push to infrastructure repositories +- GitHub enforces authentication with MFA and short-lived tokens +- Repository audit logs track all code changes + +**2. Code Review (Primary Defense):** +- All infrastructure changes require pull request approval (typically 2+ reviewers) +- CODEOWNERS file enforces review by qualified team members +- Required status checks run security scans (tfsec, checkov, etc.) +- No code reaches production without human review + +**3. Branch Protection (Primary Defense):** +- Main branches are protected; direct pushes blocked +- Require status checks to pass before merge +- Dismiss stale reviews on new commits + +**4. Runner Isolation (Containment):** +- Runners are ephemeral; recreated frequently (no persistent state) +- Network isolation in private subnets with no public IPs +- One runner group per AWS account (no cross-account access) +- No SSH access; runners are immutable containers + +**5. Audit and Detection (Response):** +- CloudTrail logs all API calls with runner identity +- CloudWatch alarms on sensitive operations (IAM changes, deletions) +- Regular access reviews of repository permissions +- Automated drift detection for out-of-band changes + +**Attack Scenarios and Mitigations:** + +| Attack Scenario | Mitigation | +|----------------|------------| +| Malicious code in PR | Blocked by required approvers and status checks | +| Compromised developer account | MFA required; branch protection prevents direct pushes | +| Vulnerable workflow | Immutable runners recreated frequently; no persistent backdoor | +| Network-based attack | Private subnets; no inbound access; egress via proxy | +| Cross-account access | Dedicated runner groups per account; no shared credentials | + +#### Q: "How do we audit runner activity?" + +**A: Full audit trail via CloudTrail with granular attribution.** + +**CloudTrail Integration:** +```bash +# Every API call includes: +# - IAM role (identifies which runner/account) +# - Session tags (workflow name, repository, commit SHA) +# - Source IP (runner's private IP) +# - Timestamp and result (success/failure) + +# Query example: Find all EC2 instances launched by runners +aws cloudtrail lookup-events \ + --lookup-attributes AttributeKey=ResourceType,AttributeValue=AWS::EC2::Instance \ + --start-time 2024-01-01 \ + | jq '.Events[] | select(.Username | contains("task-role"))' +``` + +**Available Audit Information:** +- Which repository triggered the action +- Which workflow and job executed +- Which commit/branch was running +- Which user authored the commit +- Timestamp and API call result +- All resource modifications with before/after state + +**Compliance Support:** +- CloudTrail logs can be forwarded to SIEM (Splunk, Datadog, etc.) +- Logs are tamper-proof with log file integrity validation +- Supports long-term retention for compliance (1+ years) +- Integrates with AWS Security Hub and GuardDuty + +#### Q: "What about separation of duties?" + +**A: Separation of duties is enforced at the repository level, not IAM level.** + +**Implementation:** +- **Code Authors**: Can create PRs, but cannot approve their own changes +- **Reviewers**: Different team members must review before merge +- **Approvers**: CODEOWNERS enforces review by senior engineers +- **Executors**: Runners execute approved code only (no manual intervention) + +**Example Workflow:** +1. Junior engineer creates PR with infrastructure changes +2. Senior engineer reviews and requests changes +3. Junior engineer updates PR +4. Two senior engineers approve +5. CI/CD checks pass (tests, security scans) +6. PR merges to main branch +7. Runner executes Terraform apply +8. Changes logged to CloudTrail with attribution + +This provides **better separation of duties** than traditional models where the person with AWS credentials can make changes without review. + +#### Q: "How does this compare to developers having AWS credentials?" + +**A: This model is significantly more secure than distributing AWS credentials.** + +**Traditional Model (Developers with AWS Credentials):** +- ❌ Credentials can be used from anywhere (no network restrictions) +- ❌ No code review for changes (direct API access) +- ❌ Credentials may be long-lived (IAM user access keys) +- ❌ No attribution to specific code changes +- ❌ Credentials can be accidentally committed to repositories +- ❌ Difficult to audit: which changes were reviewed vs manual? + +**GitHub Actions Runner Model:** +- ✅ Runners only accessible from private network +- ✅ All changes require code review and approval +- ✅ Credentials are short-lived (ECS task role, rotated per task) +- ✅ Full attribution to repository, workflow, commit, and author +- ✅ No credentials for developers to leak +- ✅ Clear audit trail: all infrastructure changes in git history + +#### Q: "What about compliance requirements (SOC 2, FedRAMP, etc.)?" + +**A: This model supports common compliance frameworks.** + +**SOC 2 Controls:** +- **CC6.1 (Logical Access Controls)**: Repository permissions and MFA +- **CC6.2 (Authentication)**: GitHub SSO with SAML, MFA required +- **CC6.3 (Authorization)**: Branch protection and CODEOWNERS enforcement +- **CC7.2 (Monitoring)**: CloudTrail logging and CloudWatch alarms +- **CC7.3 (Change Management)**: Pull request workflow with approval and testing + +**FedRAMP Requirements:** +- **AC-2 (Account Management)**: Repository access audited quarterly +- **AC-3 (Access Enforcement)**: Branch protection enforces approval requirements +- **AC-6 (Least Privilege)**: Account isolation limits blast radius +- **AU-2 (Audit Events)**: CloudTrail logs all API calls +- **CM-3 (Change Control)**: All changes via pull request with approval + +**NIST 800-53 Controls:** +- Full audit trail (AU family) +- Access control enforcement (AC family) +- Change management (CM family) +- Network isolation (SC family) + +**Documentation for Auditors:** +- Git history provides complete change log +- GitHub audit log tracks repository access changes +- CloudTrail provides AWS API call audit +- Branch protection rules documented in repository settings + +### Recommendations for Security Reviews + +**For Security Teams Reviewing This Implementation:** + +1. **Focus on Repository Access Controls** + - Review: Who has write access to infrastructure repositories? + - Verify: Are branch protection rules configured correctly? + - Audit: Are CODEOWNERS files in place and enforced? + +2. **Verify Code Review Process** + - Review: Pull request approval requirements (recommend 2+ approvers) + - Verify: Required status checks (tests, security scans) + - Audit: Sample recent PRs to verify reviews are substantive + +3. **Check Audit Logging** + - Review: CloudTrail enabled in all accounts? + - Verify: Logs forwarded to centralized SIEM? + - Audit: Are alarms configured for sensitive operations? + +4. **Assess Account Isolation** + - Review: Separate accounts for dev/staging/prod? + - Verify: Dedicated runner groups per account? + - Audit: No cross-account role assumption without review + +5. **Don't Focus on IAM Permissions** + - ⚠️ Restrictive IAM policies for IaC runners are security theater + - ⚠️ They don't prevent malicious code (actors can grant themselves permissions) + - ⚠️ They create operational burden without security benefit + - ✅ Focus on repository access and code review instead + +**Red Flags (Things to Worry About):** +- Developers with direct AWS console/CLI access +- Infrastructure repositories with no branch protection +- Pull requests merged without review +- No CloudTrail logging or log forwarding +- Shared credentials between environments +- Long-lived access keys or IAM users + +**Green Lights (Good Security Posture):** +- All infrastructure changes via pull request +- Required approvals from qualified reviewers +- Automated security scanning in CI/CD +- CloudTrail forwarded to SIEM with alerting +- Separate accounts per environment +- No developer AWS credentials (runners only) + +### Example Security Review Questions and Answers + +**Q: Can someone bypass the review process?** + +**A:** No, if branch protection is properly configured: +```yaml +# Required configuration (verify in GitHub): +- Require pull request before merging: Yes +- Require approvals: 2 +- Dismiss stale reviews: Yes +- Require review from Code Owners: Yes +- Include administrators: Yes (even admins must follow process) +``` + +**Q: What if someone steals a developer's GitHub account?** + +**A:** Multiple layers of defense: +1. MFA is required for all accounts +2. GitHub logs all authentication attempts +3. Stolen account still cannot push to protected branches +4. Changes require approval from other team members +5. Unusual activity triggers GitHub security alerts + +**Q: Can we at least restrict IAM permissions?** + +**A:** You can, but it's not recommended because: +1. It doesn't improve security (actors with repo access can grant IAM permissions via Terraform) +2. It creates operational problems (blocked deployments) +3. It provides false security (seems secure but isn't) + +**Alternative approach if IAM restrictions are required:** +- Keep admin policy but add **conditions** (regions, resource tags) +- Implement **SCPs** (Service Control Policies) at organization level to block dangerous actions +- Use **AWS Control Tower** guardrails for account-level restrictions +- These don't break IaC workflows but provide guard rails + +## Troubleshooting Permission Issues + +### Common Permission Errors + +#### 1. "Access Denied" when running Terraform apply + +**Symptom:** `terraform apply` fails with IAM permission errors + +**Solution:** Ensure your deployer role has all required [Deployment Permissions](#deployment-permissions) + +**Verify:** +```bash +aws iam get-role --role-name YourDeployerRole +aws iam list-attached-role-policies --role-name YourDeployerRole +``` + +#### 2. ECS tasks fail to start with "Cannot pull container image" + +**Symptom:** Tasks stuck in `PENDING` or fail immediately + +**Cause:** Execution role lacks ECR permissions + +**Solution:** Verify execution role has `AmazonECSTaskExecutionRolePolicy` + +**Verify:** +```bash +aws iam list-attached-role-policies --role-name {hostname}-task-execution-role +``` + +#### 3. Workflows fail with "Access Denied" to S3/DynamoDB + +**Symptom:** Terraform workflows fail when accessing state + +**Cause:** Task role lacks state access permissions + +**Solution:** Add S3/DynamoDB permissions to task role or attach `session_configuration.json` policy + +#### 4. Lambda token refresh fails + +**Symptom:** Tokens expire, runners can't register + +**Cause:** Lambda role lacks Secrets Manager permissions + +**Solution:** Ensure Lambda execution role can update secrets: +```json +{ + "Effect": "Allow", + "Action": ["secretsmanager:UpdateSecret", "secretsmanager:PutSecretValue"], + "Resource": "arn:aws:secretsmanager:*:*:secret:/github-runners/*" +} +``` + +### Debugging Permission Issues + +**1. Check CloudTrail for denied API calls:** +```bash +aws cloudtrail lookup-events \ + --lookup-attributes AttributeKey=EventName,AttributeValue=AccessDenied \ + --max-results 50 +``` + +**2. Review CloudWatch Logs for errors:** +```bash +aws logs tail /ecs-ghe-runners/{workspace}-{account_id}-{region} --follow +``` + +**3. Test IAM permissions:** +```bash +# Test as the task role +aws sts assume-role --role-arn arn:aws:iam::{account}:role/{hostname}-task-role --role-session-name test +``` + +**4. Use IAM Policy Simulator:** +```bash +aws iam simulate-principal-policy \ + --policy-source-arn arn:aws:iam::{account}:role/{hostname}-task-role \ + --action-names s3:GetObject \ + --resource-arns arn:aws:s3:::bucket/key +``` + +## Additional Resources + +- [AWS IAM Best Practices](https://docs.aws.amazon.com/IAM/latest/UserGuide/best-practices.html) +- [ECS Task Execution IAM Role](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task_execution_IAM_role.html) +- [ECS Task IAM Role](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-iam-roles.html) +- [AWS Policy Evaluation Logic](https://docs.aws.amazon.com/IAM/latest/UserGuide/reference_policies_evaluation-logic.html) + +## Summary + +### Quick Reference + +| Permission Type | Who Needs It | Purpose | Default Policy | +|----------------|-------------|---------|----------------| +| **Deployment** | Terraform deployer (you) | Create infrastructure | Custom (see examples) | +| **Task Role** | Running workflows | Access AWS resources | Admin (⚠️ change this!) | +| **Execution Role** | ECS service | Start containers | AWS managed policy | +| **Lambda Role** | Token refresh function | Update secrets | Auto-created | + +### Next Steps + +1. Review current permissions using the examples above +2. Replace admin policy with least-privilege task role policy +3. Document any additional permissions your workflows require +4. Set up monitoring and alerts for permission issues +5. Schedule regular permission audits + +For questions or issues, consult: +- [Main README](./README.md) +- [GitHub App Setup](./GITHUB_APP_SETUP.md) +- [Security Documentation](./SECURITY.md) diff --git a/GITHUB_APP_MIGRATION.md b/GITHUB_APP_MIGRATION.md new file mode 100644 index 0000000..6d9940d --- /dev/null +++ b/GITHUB_APP_MIGRATION.md @@ -0,0 +1,336 @@ +# GitHub App Authentication Migration - Summary + +## Overview + +This repository has been updated to use **GitHub App authentication** instead of Personal Access Tokens (PATs) for managing GitHub Actions runners. This change improves security, auditability, and follows GitHub's recommended practices. + +## What Changed + +### 1. Authentication Method + +**Before:** Personal Access Token (PAT) +- Tied to individual user account +- Broad permissions +- Manual rotation required +- Limited auditability + +**After:** GitHub App +- Organization-level authentication +- Fine-grained permissions +- Automatic token refresh +- Full audit trail + +### 2. Terraform Provider Configuration + +**Before:** +```hcl +provider "github" { + token = var.github_token # Personal Access Token +} +``` + +**After:** +```hcl +data "github_app_token" "app" { + app_id = var.github_app_id + installation_id = var.github_app_installation_id + pem_file = var.github_app_pem_file +} + +provider "github" { + token = data.github_app_token.app.token +} +``` + +### 3. Lambda Function Authentication + +**Before:** +- Simple API call with PAT in Authorization header +- Single-step authentication + +**After:** +- Three-step JWT-based authentication: + 1. Generate JWT from PEM file using PyJWT library + 2. Exchange JWT for installation access token + 3. Use installation token to get registration token +- More secure, follows GitHub best practices + +### 4. Required Variables + +**Before:** +```hcl +github_token = "ghp_xxxxx" # Personal Access Token +``` + +**After:** +```hcl +github_app_id = "123456" +github_app_installation_id = "12345678" +github_app_pem_file = "~/.github-apps/runner-mgmt.pem" +``` + +## Files Created/Modified + +### New Files + +1. **`GITHUB_APP_SETUP.md`** - Comprehensive GitHub App setup guide + - Step-by-step app creation instructions + - Permission requirements + - Installation instructions + - Multi-organization setup guidance + - Troubleshooting section + +2. **`lambda/requirements.txt`** - Python dependencies for Lambda + - PyJWT>=2.8.0 (JWT generation) + - cryptography>=41.0.0 (cryptographic operations) + +3. **`example.auto.tfvars`** - Example configuration file + - Shows all available variables + - Commented with descriptions + - Safe to use as template + +### Modified Files + +1. **`variables.tf`** - Added GitHub App variables + - `github_app_id` + - `github_app_installation_id` + - `github_app_pem_file` + - Includes validation rules and documentation + +2. **`providers.tf`** - Updated provider configuration + - Uses `github_app_token` data source + - Generates token from GitHub App credentials + +3. **`lambda_token_refresh.tf`** - Enhanced Lambda deployment + - Installs Python dependencies (PyJWT, cryptography) + - Packages dependencies with Lambda function + - Passes GitHub App credentials via environment variables + +4. **`lambda/token_refresh.py`** - Complete rewrite + - `generate_jwt_token()` - Creates JWT from PEM file + - `get_installation_access_token()` - Gets installation token + - `get_github_registration_token()` - Gets registration token + - Three-step authentication flow + +5. **`lambda/README.md`** - Updated documentation + - Explains GitHub App authentication flow + - Lists Python dependencies + - Updated environment variable documentation + +6. **`README.md`** - Updated main documentation + - Prerequisites section references GitHub App setup + - Configuration examples use GitHub App variables + - Quick start updated + +7. **`SECURITY.md`** - Complete security overhaul + - Replaced PAT security guidance with GitHub App + - PEM file security best practices + - CI/CD integration examples + - .gitignore recommendations + +8. **`.gitignore`** - Enhanced protection + - Excludes PEM files (`*.pem`, `*.private-key.pem`) + - Excludes Lambda package artifacts (`lambda/package/`, `lambda/*.zip`) + +## Migration Steps + +### For New Deployments + +1. **Create GitHub App** + ```bash + # Follow instructions in GITHUB_APP_SETUP.md + ``` + +2. **Copy example configuration** + ```bash + cp example.auto.tfvars your-workspace.auto.tfvars + ``` + +3. **Update variables** + ```hcl + # Edit your-workspace.auto.tfvars + github_app_id = "YOUR_APP_ID" + github_app_installation_id = "YOUR_INSTALLATION_ID" + github_app_pem_file = "~/.github-apps/your-key.pem" + ``` + +4. **Deploy** + ```bash + terraform init + terraform workspace select your-workspace + terraform apply + ``` + +### For Existing Deployments (Migration from PAT) + +1. **Create GitHub App** (see GITHUB_APP_SETUP.md) + +2. **Update workspace configuration** + ```hcl + # Replace github_token with GitHub App variables + github_app_id = "123456" + github_app_installation_id = "12345678" + github_app_pem_file = "~/.github-apps/key.pem" + ``` + +3. **Apply changes** + ```bash + terraform apply + # Terraform will seamlessly switch to GitHub App authentication + ``` + +4. **Verify** + - Check runners in GitHub organization settings + - Monitor Lambda CloudWatch logs for successful token refresh + - Verify no authentication errors + +5. **Revoke old PAT** (optional but recommended) + - GitHub Settings → Developer settings → Personal access tokens + - Revoke the old runner management token + +## Benefits + +### Security Improvements + +✅ **Scoped Permissions** +- GitHub Apps have fine-grained permissions +- Limited to only what's needed +- Can't access user-level resources + +✅ **No Personal Account Dependency** +- Not tied to individual user +- Survives employee turnover +- Centrally managed + +✅ **Better Auditability** +- All actions logged with app attribution +- Organization audit log shows app activity +- Clear separation from user actions + +✅ **Automatic Token Expiration** +- Installation tokens expire automatically +- Reduces risk of token leakage +- Lambda handles refresh automatically + +### Operational Improvements + +✅ **Higher Rate Limits** +- GitHub Apps have higher API rate limits +- Separate from user rate limits +- Better for automated systems + +✅ **Multi-Organization Support** +- Different GitHub App per organization +- Or single app with multiple installations +- Organization-specific configuration + +✅ **Easier Key Rotation** +- Generate new private key in GitHub App settings +- Update PEM file path +- Apply Terraform changes + +## Dependencies + +### Python Libraries (Lambda) + +The Lambda function now requires: +- **PyJWT** (>=2.8.0) - JSON Web Token generation +- **cryptography** (>=41.0.0) - RSA key operations + +These are automatically installed and packaged by Terraform during deployment. + +### Terraform Providers + +- **hashicorp/github** - Uses `github_app_token` data source +- **hashicorp/aws** - ECS, Lambda, Secrets Manager, IAM +- **hashicorp/archive** - Lambda deployment packaging +- **hashicorp/null** - Python dependency installation + +## Troubleshooting + +### Common Issues + +**"Bad credentials" error:** +- Verify App ID and Installation ID +- Check PEM file path is correct +- Ensure PEM file is readable (`chmod 600`) + +**Lambda "Module not found: PyJWT":** +- Run `terraform apply` to rebuild Lambda package +- Check CloudWatch logs for detailed error + +**"Resource not accessible by integration":** +- Verify GitHub App permissions (see GITHUB_APP_SETUP.md) +- Organization admin must approve permission changes + +### Validation Commands + +```bash +# Check GitHub App authentication +terraform plan +# Should authenticate without errors + +# Test Lambda function +aws lambda invoke \ + --function-name github-runner-token-refresh-{account} \ + --payload '{}' \ + response.json + +# View Lambda logs +aws logs tail /aws/lambda/github-runner-token-refresh-{account} --follow + +# Check runner status +aws ecs list-tasks --cluster ecs-ghe-runners-{region} +``` + +## Documentation + +- **[GITHUB_APP_SETUP.md](./GITHUB_APP_SETUP.md)** - Complete GitHub App setup guide +- **[README.md](./README.md)** - Main deployment documentation +- **[SECURITY.md](./SECURITY.md)** - Security best practices +- **[lambda/README.md](./lambda/README.md)** - Lambda function details +- **[example.auto.tfvars](./example.auto.tfvars)** - Configuration template + +## Support + +For assistance: +1. Review documentation (especially GITHUB_APP_SETUP.md) +2. Check troubleshooting sections +3. Review GitHub organization audit logs +4. Check Lambda CloudWatch logs +5. Contact infrastructure team + +## Future Considerations + +### Potential Enhancements + +1. **AWS Secrets Manager Integration** + - Store PEM file content in Secrets Manager + - Lambda retrieves at runtime + - Eliminates file path dependency + +2. **Multi-Region Deployment** + - Deploy Lambda to multiple regions + - Regional token caching + - Improved availability + +3. **Monitoring & Alerting** + - CloudWatch alarms for token refresh failures + - SNS notifications for authentication errors + - Dashboard for runner health + +4. **Automated Key Rotation** + - Schedule periodic private key rotation + - Automated key generation and update + - Zero-downtime rotation + +## Summary + +This migration improves security and maintainability by: +- Using GitHub's recommended authentication method +- Eliminating dependency on personal accounts +- Providing better audit trails +- Supporting multi-organization scenarios +- Automating token lifecycle management + +All changes are backward-compatible with existing workflows - only the authentication mechanism changed. diff --git a/GITHUB_APP_SETUP.md b/GITHUB_APP_SETUP.md new file mode 100644 index 0000000..aee85f2 --- /dev/null +++ b/GITHUB_APP_SETUP.md @@ -0,0 +1,393 @@ +# GitHub App Setup for Runner Authentication + +## Overview + +This repository uses **GitHub App authentication** instead of Personal Access Tokens (PATs) for managing GitHub Actions runners. GitHub Apps provide better security, auditability, and fine-grained permissions. + +## Why GitHub Apps? + +### Benefits over Personal Access Tokens: + +✅ **Better Security** +- Permissions scoped to specific APIs +- No user impersonation +- Automatic token expiration and refresh +- Can't be used outside of the installed organization + +✅ **Auditability** +- All actions logged to organization audit log +- Clear attribution (app name vs. personal user) +- Easier to track API usage + +✅ **Team Management** +- Not tied to individual user account +- Survives employee turnover +- Centrally managed at organization level + +✅ **Rate Limits** +- Higher API rate limits than PATs +- Separate rate limit pool from users + +## Prerequisites + +- GitHub Enterprise organization administrator access +- Ability to create and install GitHub Apps +- Access to download private keys + +## Step 1: Create the GitHub App + +### 1.1 Navigate to GitHub App Creation + +1. Go to your GitHub Enterprise instance: `https://github.e.it.census.gov` +2. Click your profile picture → **Settings** +3. In the left sidebar, click **Developer settings** +4. Click **GitHub Apps** +5. Click **New GitHub App** + +### 1.2 Configure GitHub App Settings + +**GitHub App name:** `GitHub Actions Runner Management - {ORG_NAME}` +- Example: "GitHub Actions Runner Management - CSVD" +- Must be unique across your GitHub Enterprise instance + +**Homepage URL:** `https://github.e.it.census.gov/{YOUR_ORG}` +- Example: `https://github.e.it.census.gov/CSVD` + +**Webhook:** +- ☐ Uncheck "Active" (webhooks not needed) + +**Permissions:** + +Select the following Repository permissions: +- **Administration**: Read & write (for managing runner groups) +- **Actions**: Read & write (for generating registration tokens) + +Select the following Organization permissions: +- **Self-hosted runners**: Read & write + +**Where can this GitHub App be installed?** +- ☑️ Only on this account + +### 1.3 Create the App + +1. Click **Create GitHub App** +2. You'll be redirected to your new app's settings page +3. **Record the App ID** - you'll need this for Terraform + - Located at the top: "App ID: 123456" + +## Step 2: Generate Private Key + +### 2.1 Generate the Key + +1. On your GitHub App settings page, scroll to **Private keys** +2. Click **Generate a private key** +3. A `.pem` file will be downloaded automatically +4. Save this file securely - you cannot download it again + +### 2.2 Secure the Private Key + +```bash +# Move to secure location +mkdir -p ~/.github-apps +mv ~/Downloads/your-app-name.*.private-key.pem ~/.github-apps/ + +# Set restrictive permissions +chmod 600 ~/.github-apps/your-app-name.*.private-key.pem + +# Rename for easier reference (optional) +mv ~/.github-apps/your-app-name.*.private-key.pem \ + ~/.github-apps/github-runner-management.pem +``` + +**Security Considerations:** +- ⚠️ **NEVER commit this file to version control** +- ⚠️ Store in a secure location with restricted permissions +- ⚠️ Consider using AWS Secrets Manager or similar for production +- ⚠️ Add to `.gitignore`: `*.pem`, `*.private-key.pem` +- ✅ Keep backups in secure password manager +- ✅ Rotate keys periodically (e.g., every 6 months) + +## Step 3: Install the GitHub App + +### 3.1 Install to Organization + +1. On your GitHub App settings page, click **Install App** (left sidebar) +2. Click **Install** next to your organization name +3. Select installation settings: + - **All repositories** - OR - + - **Only select repositories** (if you want to limit scope) +4. Click **Install** + +### 3.2 Get Installation ID + +After installation, you'll be redirected to a URL like: +``` +https://github.e.it.census.gov/organizations/CSVD/settings/installations/12345678 +``` + +The number at the end (`12345678`) is your **Installation ID**. Record this value. + +Alternatively, you can find it: +1. Go to **Organization Settings** → **GitHub Apps** +2. Click **Configure** on your installed app +3. The Installation ID is in the URL + +## Step 4: Configure Terraform Variables + +### 4.1 Required Variables + +You need three values from the steps above: + +1. **App ID** (from Step 1.3) +2. **Installation ID** (from Step 3.2) +3. **PEM file path** (from Step 2.2) + +### 4.2 Create Workspace-Specific Configuration + +Since different organizations have different GitHub Apps, set these values in workspace-specific `.tfvars` files: + +**Example: `csvd-229685449397-us-gov-east-1.auto.tfvars`** + +```hcl +# GitHub App Configuration (Organization-specific) +github_app_id = "123456" +github_app_installation_id = "12345678" +github_app_pem_file = "/home/user/.github-apps/github-runner-management.pem" + +# Organization Settings +repo_org = "CSVD" + +# AWS Configuration +aws_account = "csvd-dev-ew" +vpc_id = "vpc-0abc123def456" +subnets = ["subnet-abc123", "subnet-def456"] +# ... other variables +``` + +### 4.3 Alternative: Environment Variables + +For CI/CD or temporary use: + +```bash +export TF_VAR_github_app_id="123456" +export TF_VAR_github_app_installation_id="12345678" +export TF_VAR_github_app_pem_file="/path/to/key.pem" + +terraform apply +``` + +## Step 5: Verify Configuration + +### 5.1 Test Terraform Authentication + +```bash +# Initialize Terraform +terraform init + +# Validate configuration +terraform validate + +# Test with a plan (this will authenticate to GitHub) +terraform plan +``` + +Expected output should show no authentication errors. + +### 5.2 Test Lambda Function (After Deployment) + +```bash +# Invoke Lambda manually to test GitHub App authentication +aws lambda invoke \ + --function-name github-runner-token-refresh-229685449397 \ + --payload '{}' \ + response.json + +cat response.json +``` + +Expected response: +```json +{ + "statusCode": 200, + "body": "{\"message\": \"Token refreshed successfully\", ...}" +} +``` + +## Troubleshooting + +### Error: "Bad credentials" or "401 Unauthorized" + +**Possible Causes:** +- Incorrect App ID or Installation ID +- PEM file path is wrong or file is inaccessible +- Private key doesn't match the GitHub App +- GitHub App not installed in the organization + +**Solutions:** +1. Verify App ID matches the value shown in GitHub App settings +2. Verify Installation ID from the installation URL +3. Check PEM file exists and is readable: `cat $TF_VAR_github_app_pem_file` +4. Ensure GitHub App is installed: **Org Settings** → **GitHub Apps** + +### Error: "Resource not accessible by integration" + +**Cause:** GitHub App lacks required permissions + +**Solution:** +1. Go to GitHub App settings +2. Click **Permissions & events** +3. Ensure these permissions are granted: + - Repository permissions: **Administration** (Read & write), **Actions** (Read & write) + - Organization permissions: **Self-hosted runners** (Read & write) +4. Click **Save changes** +5. Organization admins must approve the permission changes + +### Error: "PEM file not found" in Lambda + +**Cause:** PEM file path in Lambda environment is incorrect + +**Solution:** +The Lambda function expects the PEM file to be accessible. You have two options: + +**Option 1: Include PEM in Lambda deployment** +```bash +# Copy PEM to lambda directory +cp ~/.github-apps/github-runner-management.pem lambda/ + +# Update Lambda environment variable +GITHUB_APP_PEM_FILE=/var/task/github-runner-management.pem +``` + +**Option 2: Store PEM in AWS Secrets Manager** (recommended for production) +```bash +# Store in Secrets Manager +aws secretsmanager create-secret \ + --name /github-apps/runner-management/private-key \ + --secret-string file://~/.github-apps/github-runner-management.pem + +# Update Lambda to retrieve from Secrets Manager (requires code modification) +``` + +### Error: JWT token validation fails + +**Cause:** System clock skew or incorrect PEM file format + +**Solutions:** +1. Verify system time is synchronized: `date` +2. Check PEM file format: + ```bash + head -1 ~/.github-apps/github-runner-management.pem + # Should show: -----BEGIN RSA PRIVATE KEY----- + ``` +3. Regenerate the private key if corrupted + +## Security Best Practices + +### Private Key Management + +✅ **DO:** +- Store PEM files with `chmod 600` permissions +- Use AWS Secrets Manager for Lambda production deployments +- Rotate keys every 6-12 months +- Keep backup in secure password manager +- Use different keys for different environments (dev/prod) + +❌ **DON'T:** +- Commit PEM files to version control +- Share PEM files via email or Slack +- Store PEM files in shared directories +- Use the same key across multiple organizations + +### Access Control + +✅ **DO:** +- Limit GitHub App installation to specific repositories if possible +- Review GitHub App permissions regularly +- Monitor organization audit log for app activity +- Revoke app access immediately if compromised + +❌ **DON'T:** +- Grant more permissions than necessary +- Install apps with "All repositories" unless required +- Share App ID and Installation ID publicly + +### Monitoring + +**GitHub Audit Log:** +1. **Organization Settings** → **Audit log** +2. Filter by: `action:app_installation.*` or `action:org.runner.*` +3. Review app authentication and runner management events + +**Lambda CloudWatch Logs:** +```bash +aws logs tail /aws/lambda/github-runner-token-refresh-{account} --follow +``` + +Look for: +- Successful token refreshes +- Authentication failures +- API rate limit warnings + +## Multi-Organization Setup + +If you manage runners across multiple GitHub organizations: + +### Option 1: One GitHub App per Organization (Recommended) + +Each organization creates and installs its own GitHub App: + +```hcl +# Organization 1: CSVD +github_app_id = "123456" +github_app_installation_id = "12345678" +github_app_pem_file = "~/.github-apps/csvd-runner-management.pem" + +# Organization 2: SCT-Engineering +github_app_id = "234567" +github_app_installation_id = "23456789" +github_app_pem_file = "~/.github-apps/sct-runner-management.pem" +``` + +### Option 2: Single GitHub App, Multiple Installations + +Create one GitHub App but install it in multiple organizations: + +```hcl +# Same App ID, different Installation IDs per org +github_app_id = "123456" # Same for all orgs + +# CSVD organization +github_app_installation_id = "12345678" # CSVD-specific + +# SCT-Engineering organization +github_app_installation_id = "23456789" # SCT-specific +``` + +**Note:** The PEM file is the same across all installations of the same app. + +## Migration from Personal Access Tokens + +If you're currently using PATs: + +1. Create and configure GitHub App (Steps 1-3 above) +2. Update Terraform variables (Step 4) +3. Run `terraform apply` - Terraform will use the new authentication +4. Verify runners are working +5. Revoke old PAT in GitHub settings + +**No downtime required** - Terraform switches authentication seamlessly. + +## Additional Resources + +- [GitHub Apps Documentation](https://docs.github.com/en/developers/apps/getting-started-with-apps/about-apps) +- [Authenticating with GitHub Apps](https://docs.github.com/en/developers/apps/building-github-apps/authenticating-with-github-apps) +- [GitHub App Permissions](https://docs.github.com/en/developers/apps/building-github-apps/setting-permissions-for-github-apps) +- [Terraform GitHub Provider - App Token](https://registry.terraform.io/providers/integrations/github/latest/docs/data-sources/app_token) + +## Support + +For assistance with GitHub App setup: +1. Review this documentation +2. Check the [Troubleshooting](#troubleshooting) section +3. Review GitHub audit logs for clues +4. Contact the infrastructure team diff --git a/README.md b/README.md index c2d7fc9..9dd912a 100644 --- a/README.md +++ b/README.md @@ -1,107 +1,1363 @@ -# ghe-runners -Manage Repo Specific Runners +# GitHub Actions ECS Fargate Runners -# GitHub Actions Runner Setup Guide +Infrastructure as Code (Terraform) for deploying self-hosted GitHub Actions runners on AWS ECS Fargate. -This repository contains infrastructure code to manage repository-specific GitHub Actions runners using AWS ECS Fargate. +## Overview + +This repository manages the deployment of **organization-level self-hosted GitHub Actions runners** using AWS ECS Fargate. Runners are deployed per AWS account and automatically register with your GitHub Enterprise organization, providing secure, scalable, and cost-effective CI/CD execution environments. + +### Key Features + +- **Serverless Architecture**: ECS Fargate eliminates server management overhead +- **Account-Based Isolation**: Each AWS account has its own dedicated runners +- **Automatic IAM Authentication**: ECS Task Roles provide seamless AWS access +- **Multi-Label Support**: Runners tagged with account ID, name, region, and more +- **Proxy-Enabled**: Pre-configured for enterprise proxy environments +- **CloudWatch Integration**: Centralized logging and monitoring +- **Scalable**: Adjust runner count based on workload demands + +## Architecture + +``` +GitHub Enterprise (github.e.it.census.gov) + │ + │ (OAuth App Authentication) + ▼ + ECS Cluster (per account/region) + ecs-ghe-runners-{region} + │ + ├── Fargate Task (Runner 1) + │ ├── Container: github-runner:{version} + │ ├── IAM Task Role (AWS Auth) + │ ├── Labels: Account ID, Name, Region + │ └── Logs → CloudWatch + │ + ├── Fargate Task (Runner 2) + └── Fargate Task (Runner N) +``` + +**Network Architecture:** +- **Deployment**: Private subnets (no public IP by default) +- **Internet Access**: Corporate proxy (`proxy.tco.census.gov:3128`) +- **VPC Endpoints**: Optional (ECR, S3, EC2) for AWS service access +- **Security Groups**: Restrict egress to necessary services ## Prerequisites -- AWS Account with appropriate permissions -- Terraform installed -- GitHub Organization Admin access -- AWS CLI configured with appropriate credentials -- GitHub Personal Access Token with admin:org permissions +### AWS Requirements + +#### AWS Account Access +You need an AWS account with appropriate IAM permissions to deploy the infrastructure. The deploying user/role requires: + +**Terraform Deployment Permissions:** +- **IAM**: Create and manage IAM roles, policies, and policy attachments + - `iam:CreateRole`, `iam:CreatePolicy`, `iam:AttachRolePolicy`, `iam:PutRolePolicy` + - `iam:GetRole`, `iam:GetPolicy`, `iam:ListAttachedRolePolicies` + - `iam:DeleteRole`, `iam:DeletePolicy`, `iam:DetachRolePolicy`, `iam:DeleteRolePolicy` + - `iam:TagRole`, `iam:UntagRole` +- **ECS**: Create and manage ECS clusters, services, and task definitions + - `ecs:CreateCluster`, `ecs:CreateService`, `ecs:RegisterTaskDefinition` + - `ecs:DescribeCluster`, `ecs:DescribeServices`, `ecs:DescribeTaskDefinition` + - `ecs:UpdateService`, `ecs:DeleteService`, `ecs:DeregisterTaskDefinition` + - `ecs:PutClusterCapacityProviders` + - `ecs:TagResource`, `ecs:UntagResource` +- **CloudWatch Logs**: Create and manage log groups + - `logs:CreateLogGroup`, `logs:DescribeLogGroups`, `logs:DeleteLogGroup` + - `logs:PutRetentionPolicy`, `logs:TagLogGroup` +- **Secrets Manager**: Create and manage secrets for GitHub tokens + - `secretsmanager:CreateSecret`, `secretsmanager:DescribeSecret` + - `secretsmanager:PutSecretValue`, `secretsmanager:GetSecretValue` + - `secretsmanager:DeleteSecret`, `secretsmanager:TagResource` +- **EC2 (VPC)**: Query VPC resources and optionally create VPC endpoints + - `ec2:DescribeVpcs`, `ec2:DescribeSubnets`, `ec2:DescribeSecurityGroups` + - `ec2:CreateVpcEndpoint`, `ec2:DescribeVpcEndpoints`, `ec2:DeleteVpcEndpoint` (if using VPC endpoints) + - `ec2:ModifyVpcEndpoint` +- **ECR**: Pull container images (if using ECR mirroring) + - `ecr:GetAuthorizationToken`, `ecr:BatchCheckLayerAvailability` + - `ecr:GetDownloadUrlForLayer`, `ecr:BatchGetImage` + - `ecr:CreateRepository`, `ecr:PutImage` (if enabling ECR clone) +- **Lambda**: Deploy token refresh Lambda function (optional, for automated token refresh) + - `lambda:CreateFunction`, `lambda:GetFunction`, `lambda:UpdateFunctionCode` + - `lambda:DeleteFunction`, `lambda:AddPermission`, `lambda:RemovePermission` + - `lambda:TagResource` +- **EventBridge**: Create scheduled events for Lambda (optional) + - `events:PutRule`, `events:DescribeRule`, `events:DeleteRule` + - `events:PutTargets`, `events:RemoveTargets` +- **S3/DynamoDB**: Access Terraform state backend + - `s3:GetObject`, `s3:PutObject`, `s3:ListBucket` (for state bucket) + - `dynamodb:GetItem`, `dynamodb:PutItem`, `dynamodb:DeleteItem` (for state locking) + +See **[AWS_PERMISSIONS.md](./AWS_PERMISSIONS.md)** for detailed permission policies and example IAM policy documents. + +#### Infrastructure Requirements +- VPC with private subnets (runners deploy without public IPs by default) +- Security groups allowing egress to: + - GitHub Enterprise (`github.e.it.census.gov` on HTTPS/443) + - Corporate proxy (`proxy.tco.census.gov:3128`) + - AWS services (S3, DynamoDB, Secrets Manager) via VPC endpoints or internet gateway +- (Optional) VPC endpoints for ECR, S3, EC2 to reduce data transfer costs and improve security + +### GitHub Requirements +- GitHub Enterprise organization admin access +- **GitHub App** configured for runner authentication + - See **[GITHUB_APP_SETUP.md](./GITHUB_APP_SETUP.md)** for complete setup instructions + - Required: App ID, Installation ID, and Private Key (PEM file) + - Permissions: Repository Administration (R/W), Actions (R/W), Self-hosted Runners (R/W) + +### Tools Required +- Terraform >= 1.0 +- AWS CLI configured +- Access to organization's Terraform state backend ## Quick Start -1. Clone this repository: +### 1. Clone the Repository + ```bash -git clone +git clone git@github.e.it.census.gov:CSVD/ghe-runners.git cd ghe-runner ``` -2. Create a `terraform.tfvars` file with your configuration: +### 2. Configure Variables + +Create or update your workspace-specific `.tfvars` file (e.g., `csvd-229685449397-us-gov-east-1.auto.tfvars`): + ```hcl -repo_org = "your-organization" -aws_account = "your-aws-account-name" -server_url = "https://github.your-domain.com" -vpc_id = "vpc-xxxxxx" -subnets = ["subnet-xxxxx", "subnet-yyyyy"] -security_groups = ["sg-xxxxxx"] -image_name = "github-runner" -image_version = "latest" -desired_count = 2 # Number of runners +# GitHub App Authentication (Required - see GITHUB_APP_SETUP.md) +github_app_id = "123456" # GitHub App ID +github_app_installation_id = "12345678" # Installation ID for your org +github_app_pem_file = "~/.github-apps/runner.pem" # Path to private key + +# GitHub Configuration +repo_org = "CSVD" # Your GitHub organization +server_url = "https://github.e.it.census.gov" # GitHub Enterprise URL + +# AWS Configuration +aws_account = "csvd-dev-ew" # Account identifier +vpc_id = "vpc-0abc123def456" # VPC ID +subnets = ["subnet-abc123", "subnet-def456"] # Private subnet IDs +security_groups = ["sg-xyz789"] # Security group IDs + +# Runner Configuration +image_name = "github-runner" # Container image name +image_version = "2.311.0" # Runner version +desired_count = 3 # Number of runners + +# Optional: VPC Endpoints +create_vpc_endpoint = false # Set to true if needed ``` -3. Initialize and apply Terraform: +**Important:** GitHub App credentials are organization-specific. See [GITHUB_APP_SETUP.md](./GITHUB_APP_SETUP.md) for setup instructions. + +### 3. Initialize Terraform + ```bash terraform init -terraform workspace new +``` + +### 4. Create or Select Workspace + +```bash +# Create new workspace +terraform workspace new 229685449397-us-gov-east-1 + +# Or select existing workspace +terraform workspace select 229685449397-us-gov-east-1 +``` + +### 5. Deploy Runners + +```bash +# Review planned changes terraform plan + +# Apply configuration terraform apply ``` -## Configuration Details +### 6. Verify Deployment + +**Check ECS:** +```bash +aws ecs list-tasks --cluster ecs-ghe-runners-us-gov-east-1 +``` + +**Check GitHub:** +1. Navigate to `https://github.e.it.census.gov/organizations/CSVD/settings/actions/runners` +2. Verify runners appear with status "Idle" +3. Confirm runner labels match configuration + +## Configuration Reference ### Required Variables -- `repo_org`: Your GitHub organization name -- `aws_account`: AWS account identifier -- `server_url`: GitHub Enterprise Server URL -- `vpc_id`: VPC ID where runners will be deployed -- `subnets`: List of subnet IDs for runner deployment -- `security_groups`: Security group IDs for runners -- `image_name`: GitHub runner container image name -- `image_version`: Container image version tag +| Variable | Description | Example | +|----------|-------------|---------| +| `github_app_id` | GitHub App ID (see GITHUB_APP_SETUP.md) | `"123456"` | +| `github_app_installation_id` | App Installation ID for organization | `"12345678"` | +| `github_app_pem_file` | Path to GitHub App private key | `"~/.github-apps/key.pem"` | +| `repo_org` | GitHub organization name | `"CSVD"` | +| `aws_account` | AWS account identifier | `"csvd-dev-ew"` | +| `server_url` | GitHub Enterprise Server URL | `"https://github.e.it.census.gov"` | +| `vpc_id` | VPC ID for runner deployment | `"vpc-0abc123"` | +| `subnets` | List of subnet IDs (private subnets) | `["subnet-abc", "subnet-def"]` | +| `security_groups` | Security group IDs for runners | `["sg-xyz789"]` | +| `image_name` | GitHub runner container image name | `"github-runner"` | +| `image_version` | Container image version tag | `"2.311.0"` | ### Optional Variables -- `create_vpc_endpoint`: Set to true to create VPC endpoints (default: false) -- `create_ecs_cluster`: Create new ECS cluster (default: false) -- `assign_public_ip`: Assign public IP to runners (default: false) -- `desired_count`: Number of runner instances (default: 2) +| Variable | Description | Default | +|----------|-------------|---------| +| `desired_count` | Number of runner instances | `2` | +| `create_vpc_endpoint` | Create VPC endpoints for AWS services | `false` | +| `create_ecs_cluster` | Create new ECS cluster | `false` | +| `assign_public_ip` | Assign public IP to runners | `false` | +| `log_retention_days` | CloudWatch log retention period | `90` | + +### Environment Variables (Set in Task Definition) + +```hcl +HTTP_PROXY = "http://proxy.tco.census.gov:3128" +HTTPS_PROXY = "http://proxy.tco.census.gov:3128" +NO_PROXY = "169.254.170.2,.census.gov,169.254.169.254,148.129.0.0/16,10.0.0.0/8,172.16.0/12,.eks.amazonaws.com,.s3.amazonaws.com,.amazonaws.com" +``` ## Runner Labels -Runners are automatically configured with the following labels: -- AWS account identifier -- Organization name -- Region -- ECS identifier -- Ubuntu-latest +Each runner is automatically configured with multiple labels for flexible workflow targeting: + +| Label Type | Format | Example | Usage | +|------------|--------|---------|-------| +| Account ID | `{account_id}` | `229685449397` | `runs-on: ["229685449397"]` | +| Account Name | `{account_name}` | `csvd-dev-ew` | `runs-on: ["csvd-dev-ew"]` | +| Account-Region | `{account_id}-{region}` | `229685449397-us-gov-east-1` | `runs-on: ["229685449397-us-gov-east-1"]` | +| Region | `{region}` | `us-gov-east-1` | `runs-on: ["us-gov-east-1"]` | +| Organization | `{org_name}` | `CSVD` | `runs-on: ["CSVD"]` | +| Runner Type | Fixed | `ecs-github-runner` | `runs-on: ["ecs-github-runner"]` | +| Compatibility | Fixed | `ubuntu-latest` | `runs-on: ["ubuntu-latest"]` | + +**Most Common Usage:** +```yaml +jobs: + build: + runs-on: ["229685449397"] # Target specific AWS account by ID +``` + +## Network Configuration + +### Proxy Settings + +Runners are pre-configured to use the corporate proxy for internet access: -## Security Considerations +- **HTTP/HTTPS Proxy**: `http://proxy.tco.census.gov:3128` +- **No Proxy**: Internal Census Bureau networks and AWS services -1. Runners are deployed in private subnets by default -2. VPC endpoints can be created for ECR/S3 access -3. GitHub token is stored securely in AWS Secrets Manager -4. Runners use IAM roles with least privilege +### VPC Endpoints (Optional) -## Monitoring +When `create_vpc_endpoint = true`, the following endpoints are created: -- CloudWatch Log Group: `/ecs-ghe-runners/{workspace}-{account-id}-{region}` -- Runner logs are retained for 90 days -- ECS task metrics available in CloudWatch +- **ECR API**: `com.amazonaws.{region}.ecr.api` +- **ECR Docker**: `com.amazonaws.{region}.ecr.dkr` +- **S3**: `com.amazonaws.{region}.s3` +- **EC2**: `com.amazonaws.{region}.ec2` + +**Benefits:** +- Reduced data transfer costs +- Improved security (no internet egress required) +- Faster access to AWS services + +## IAM Permissions + +The runner infrastructure uses two distinct IAM roles with different purposes: + +### ECS Task Role + +The **task role** is assumed by the running container and provides the GitHub Actions runner with permissions to interact with AWS services during workflow execution. This is what your workflows use to deploy infrastructure, access S3, etc. + +**Purpose:** Grants permissions for GitHub Actions workflows to manage AWS resources + +**Role Name Pattern:** `{hostname}-task-role` (e.g., `CSVD-task-role`) + +**Permissions Provided:** +This role is configured via the `github_runner_permissions_arn` variable, which by default points to `${var.repo_org}-admin` policy. + +**Default Policy** (`iam_policy/admin.json`): +```json +{ + "Statement": [ + { + "Action": "*", + "Effect": "Allow", + "Resource": "*" + } + ], + "Version": "2012-10-17" +} +``` + +**Additional Permissions Automatically Attached:** +- **Secrets Manager**: Read GitHub registration token + ```json + { + "Effect": "Allow", + "Action": "secretsmanager:GetSecretValue", + "Resource": "arn:aws:secretsmanager:*:*:secret:/github-runners/{namespace}/*" + } + ``` +- **S3**: Access certificates (if `certs` variable is configured) + ```json + { + "Effect": "Allow", + "Action": ["s3:GetObject", "s3:ListBucket"], + "Resource": "arn:aws:s3:::{certs_bucket}/*" + } + ``` + +**Customization:** +To use a custom policy instead of the default admin policy, create your own IAM policy and reference it in your `.tfvars`: +```hcl +github_runner_permissions_arn = "arn:aws:iam::{account_id}:policy/custom-runner-policy" +``` + +**Recommended Permissions for Terraform Workflows:** +- S3: Read/write Terraform state (`s3:GetObject`, `s3:PutObject`, `s3:ListBucket`) +- DynamoDB: State locking (`dynamodb:GetItem`, `dynamodb:PutItem`, `dynamodb:DeleteItem`) +- Service-specific permissions based on what your Terraform code manages (EC2, RDS, VPC, etc.) + +**Security Best Practice:** +Replace the wildcard admin policy with least-privilege permissions based on your specific workflow requirements. See **[AWS_PERMISSIONS.md](./AWS_PERMISSIONS.md)** for example policies. + +**Important Security Note:** +Infrastructure-as-Code workflows require broad AWS permissions to manage diverse resource types. The admin policy is intentionally permissive to support Terraform's dynamic provisioning needs. Security is enforced through: +- Repository access controls (who can push code) +- Pull request reviews (all changes peer-reviewed) +- Branch protection rules (required approvals) +- CloudTrail audit logging (full attribution) +- Account isolation (separate runners per account) + +This is the industry-standard approach used by Terraform Cloud, Spacelift, and other IaC platforms. For security team discussions, see **[Addressing Security Concerns](./AWS_PERMISSIONS.md#addressing-security-concerns)** in the permissions documentation. + +### ECS Execution Role + +The **execution role** is used by the ECS service itself to set up and manage the container lifecycle. This role is never directly used by your workflows. + +**Purpose:** Allows ECS to pull images, write logs, and retrieve secrets needed to start the container + +**Role Name Pattern:** `{hostname}-task-execution-role` (e.g., `CSVD-task-execution-role`) + +**Managed Policy Automatically Attached:** +- `arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy` + +This AWS-managed policy provides: +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "ecr:GetAuthorizationToken", + "ecr:BatchCheckLayerAvailability", + "ecr:GetDownloadUrlForLayer", + "ecr:BatchGetImage", + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + "Resource": "*" + } + ] +} +``` + +**When to Modify:** +You typically don't need to modify this role unless: +- Using private ECR repositories in a different account (add cross-account ECR permissions) +- Pulling images from additional registries +- Need to access secrets from Secrets Manager for environment variables (already included by the module) + +## Monitoring & Logging + +### CloudWatch Logs + +**Log Group:** `/ecs-ghe-runners/{workspace}-{account_id}-{region}` + +**Log Streams:** One per ECS task (runner instance) + +**Retention:** 90 days (configurable via `log_retention_days`) + +**View Logs:** +```bash +# List log streams +aws logs describe-log-streams \ + --log-group-name /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 + +# Tail logs +aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 --follow +``` + +### ECS Metrics + +Available in CloudWatch: +- CPU utilization +- Memory utilization +- Task count +- Running task count + +### GitHub Actions UI + +Monitor runner status: +1. Navigate to: `https://github.e.it.census.gov/organizations/{org}/settings/actions/runners` +2. View runner status (Idle, Active, Offline) +3. Check runner labels and version ## Troubleshooting +### Runner Not Appearing in GitHub + +**Symptoms:** ECS tasks running, but runners not visible in GitHub organization settings + +**Troubleshooting Steps:** +1. Check CloudWatch logs for authentication errors +2. Verify OAuth App credentials in Secrets Manager +3. Confirm OAuth App has organization-level access +4. Check security group allows outbound HTTPS to GitHub + +**Common Causes:** +- Invalid or expired OAuth App credentials +- OAuth App not installed at organization level +- Network connectivity issues (proxy, security groups) + +### Runner Shows as Offline + +**Symptoms:** Runner appears in GitHub but status is "Offline" + +**Troubleshooting Steps:** 1. Check ECS task status in AWS Console -2. Review CloudWatch logs for runner issues -3. Verify GitHub organization permissions -4. Ensure VPC endpoints are accessible -5. Validate security group rules +2. Verify task is in RUNNING state +3. Review CloudWatch logs for errors +4. Check proxy connectivity + +**Common Causes:** +- ECS task crashed or stopped +- Network connectivity lost +- GitHub token expired or revoked + +### Authentication Failures in Workflows + +**Symptoms:** Workflows fail with AWS permission errors + +**Troubleshooting Steps:** +1. Verify ECS Task Role has required permissions +2. Check IAM policy attached to task role +3. Review CloudTrail for access denied events +4. Confirm runner is in correct AWS account + +**Common Causes:** +- Insufficient IAM permissions on task role +- Wrong AWS account (runner in different account) +- Cross-account role assumption issues + +### High Memory/CPU Usage + +**Symptoms:** Runners consuming excessive resources + +**Troubleshooting Steps:** +1. Review CloudWatch metrics for task +2. Check workflow jobs for resource-intensive operations +3. Consider increasing task size or runner count + +**Solutions:** +- Increase `desired_count` for more parallel capacity +- Optimize workflow jobs (caching, parallelization) +- Scale task CPU/memory in task definition + +### Workflow Jobs Stuck in Queue + +**Symptoms:** Jobs pending with "Waiting for a runner to pick up this job" + +**Troubleshooting Steps:** +1. Verify runners are online and idle +2. Check runner labels match workflow `runs-on` +3. Confirm sufficient runner capacity + +**Solutions:** +- Increase `desired_count` for more runners +- Verify `runs-on` label matches deployed runners +- Check for long-running jobs blocking runners + +## Scaling Runners + +### Manual Scaling + +Update `desired_count` in `default.auto.tfvars`: + +```hcl +desired_count = 5 # Increase from 3 to 5 +``` + +Apply changes: +```bash +terraform apply +``` + +### Considerations + +- **Startup Time**: Fargate tasks take 1-2 minutes to start +- **Cost**: Each runner incurs Fargate compute costs +- **Capacity Planning**: Monitor workflow queue times and runner utilization +- **Regional Limits**: Check AWS service quotas for ECS tasks + +## Operations + +### Refreshing Runners + +Runners may need to be refreshed when they stop responding or fail to come back online. Common causes include: +- Expired GitHub OAuth token +- ECS tasks stuck in a failed state +- Network connectivity issues +- GitHub API issues +- Orphaned runner registrations in GitHub + +#### Standard Refresh Procedure (Terraform Taint) + +This is the **primary method** for refreshing runners and resolving most issues: + +**Step 1: Clean Up GitHub Runner Group and Runners** + +Before tainting in Terraform, manually remove stale runners from GitHub: + +```bash +# Navigate to GitHub Organization Settings +# URL: https://github.e.it.census.gov/organizations/CSVD/settings/actions/runner-groups +``` + +1. **Remove Offline/Stale Runners:** + - Go to: **Organization Settings** → **Actions** → **Runners** + - Find runners with labels matching your account (e.g., `229685449397`) + - For each offline or problematic runner: + - Click the runner name + - Click **Remove runner** + - Confirm removal + +2. **Delete the Runner Group (if needed):** + - Go to: **Organization Settings** → **Actions** → **Runner groups** + - Find the runner group (typically named with account ID, e.g., `229685449397`) + - Click on the runner group + - Click **Delete runner group** + - Confirm deletion + + **Note:** The runner group will be automatically recreated by Terraform during the next apply. + +**Step 2: Taint Terraform Resources** + +Once GitHub is cleaned up, taint the resources to force recreation: + +```bash +# Taint the runner group (this will recreate it) +terraform taint 'module.github_runner.github_actions_runner_group.runner_group' + +# Taint the ECS service (this will recreate all runners) +terraform taint 'module.ecs_service.aws_ecs_service.github_runner' + +# Review what will be recreated +terraform plan +``` + +**Step 3: Apply Changes** + +```bash +# Apply the changes - this will: +# 1. Destroy old ECS service and tasks +# 2. Recreate runner group in GitHub +# 3. Create new ECS service with fresh tasks +# 4. Register new runners automatically +terraform apply +``` + +**Step 4: Verify New Runners** + +```bash +# Check ECS tasks are running +aws ecs list-tasks --cluster ecs-ghe-runners-us-gov-east-1 + +# Monitor CloudWatch logs for successful registration +aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 --follow + +# Verify in GitHub UI +# https://github.e.it.census.gov/organizations/CSVD/settings/actions/runners +# New runners should appear as "Idle" +``` + +**Why Manual GitHub Cleanup is Required:** + +- Terraform doesn't always properly deregister runners when they're in a failed state +- Orphaned runner registrations prevent new runners from registering with the same labels +- GitHub API rate limiting can cause Terraform to skip cleanup +- Runner groups with active (but offline) runners cannot be deleted automatically +- Expired GitHub OAuth tokens prevent automatic deregistration + +**Important:** Do NOT use `aws ecs update-service --force-new-deployment` as a workaround. This will fail if the GitHub OAuth token has expired, which is the most common cause of runner failures. Always use the Terraform taint method above after updating the token in Secrets Manager. + +#### Restart Individual Tasks (For Testing Only) + +To restart a specific problematic runner: + +```bash +# List running tasks +aws ecs list-tasks --cluster ecs-ghe-runners-us-gov-east-1 + +# Stop a specific task (ECS will automatically start a replacement) +aws ecs stop-task \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --task arn:aws:ecs:us-gov-east-1:123456789012:task/ecs-ghe-runners-us-gov-east-1/abc123def456 +``` + +**Note:** Stopping a task will cause ECS to automatically start a new one if the service's desired count is maintained. + +### Handling Expired GitHub Tokens + +GitHub Actions registration tokens have a **limited lifetime (typically 1 hour)**. When tokens expire, runners cannot register and will fail to come online. + +#### Understanding the Token Architecture + +The current implementation: +1. Terraform retrieves a registration token via `data.github_actions_organization_registration_token` +2. Token is stored in AWS Secrets Manager at `/github-runners/{namespace}/{hostname}-{random_pet_id}` +3. ECS tasks read the token from Secrets Manager on startup +4. **Problem**: Registration tokens expire after ~1 hour, but the secret in Secrets Manager is not automatically updated + +#### Solution: Automated Token Refresh + +To prevent token expiration issues, implement automated token refresh using one of these approaches: + +##### Option 1: Lambda Function with EventBridge (Recommended) + +Create a Lambda function that periodically refreshes the token: + +```python +# lambda_function.py +import boto3 +import os +import requests + +def lambda_handler(event, context): + github_token = os.environ['GITHUB_TOKEN'] # GitHub PAT with admin:org + org = os.environ['GITHUB_ORG'] + github_url = os.environ['GITHUB_URL'] + secret_name = os.environ['SECRET_NAME'] + + # Get fresh registration token from GitHub API + response = requests.post( + f'{github_url}/api/v3/orgs/{org}/actions/runners/registration-token', + headers={ + 'Authorization': f'token {github_token}', + 'Accept': 'application/vnd.github.v3+json' + } + ) + + if response.status_code == 201: + new_token = response.json()['token'] + + # Update Secrets Manager + sm_client = boto3.client('secretsmanager') + sm_client.update_secret( + SecretId=secret_name, + SecretString=new_token + ) + + return {'statusCode': 200, 'body': 'Token refreshed successfully'} + else: + raise Exception(f'Failed to get token: {response.status_code}') +``` + +**Terraform to deploy Lambda:** + +```hcl +# token_refresh_lambda.tf (add to ghe-runner repo) + +resource "aws_lambda_function" "token_refresh" { + filename = "token_refresh.zip" + function_name = "github-runner-token-refresh-${var.aws_account}" + role = aws_iam_role.lambda_refresh_role.arn + handler = "lambda_function.lambda_handler" + runtime = "python3.11" + timeout = 60 + + environment { + variables = { + GITHUB_TOKEN = var.github_token # GitHub PAT from provider + GITHUB_ORG = var.repo_org + GITHUB_URL = var.server_url + SECRET_NAME = aws_secretsmanager_secret.secret.name + } + } +} + +resource "aws_cloudwatch_event_rule" "token_refresh_schedule" { + name = "github-runner-token-refresh-${var.aws_account}" + description = "Refresh GitHub runner token every 30 minutes" + schedule_expression = "rate(30 minutes)" +} + +resource "aws_cloudwatch_event_target" "token_refresh_target" { + rule = aws_cloudwatch_event_rule.token_refresh_schedule.name + target_id = "RefreshTokenLambda" + arn = aws_lambda_function.token_refresh.arn +} + +resource "aws_lambda_permission" "allow_eventbridge" { + statement_id = "AllowExecutionFromEventBridge" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.token_refresh.function_name + principal = "events.amazonaws.com" + source_arn = aws_cloudwatch_event_rule.token_refresh_schedule.arn +} + +resource "aws_iam_role" "lambda_refresh_role" { + name = "github-runner-token-refresh-${var.aws_account}" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "lambda.amazonaws.com" + } + }] + }) +} + +resource "aws_iam_role_policy" "lambda_refresh_policy" { + name = "token-refresh-policy" + role = aws_iam_role.lambda_refresh_role.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "secretsmanager:UpdateSecret", + "secretsmanager:GetSecretValue" + ] + Resource = aws_secretsmanager_secret.secret.arn + }, + { + Effect = "Allow" + Action = [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents" + ] + Resource = "arn:aws:logs:*:*:*" + } + ] + }) +} +``` + +**Benefits:** +- Token automatically refreshes every 30 minutes +- Runners can be restarted anytime without token concerns +- No manual intervention required +- `force-new-deployment` becomes viable + +##### Option 2: Terraform Lifecycle Ignore Changes + +If implementing automated refresh is not feasible immediately, configure Terraform to ignore token changes: + +```hcl +# In main.tf +resource "aws_secretsmanager_secret_version" "secret" { + secret_id = aws_secretsmanager_secret.secret.id + secret_string = local.token + + lifecycle { + ignore_changes = [secret_string] + } +} +``` + +Then manually refresh tokens via AWS CLI when needed: + +```bash +# Get fresh token from GitHub +GITHUB_TOKEN="your-pat-token" +ORG="CSVD" +GITHUB_URL="https://github.e.it.census.gov" + +NEW_TOKEN=$(curl -X POST \ + -H "Authorization: token $GITHUB_TOKEN" \ + -H "Accept: application/vnd.github.v3+json" \ + "$GITHUB_URL/api/v3/orgs/$ORG/actions/runners/registration-token" \ + | jq -r '.token') + +# Update Secrets Manager +aws secretsmanager update-secret \ + --secret-id /github-runners/csvd/runner-name-xyz123 \ + --secret-string "$NEW_TOKEN" + +# Force new deployment to pick up new token +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --service github-runner-service \ + --force-new-deployment +``` + +**Benefits:** +- Allows manual token management outside Terraform +- Enables `force-new-deployment` for quick refreshes +- No Lambda infrastructure needed + +**Drawbacks:** +- Requires manual intervention when tokens expire +- Need to remember to refresh tokens periodically + +#### Current Manual Refresh Procedure (Until Automation) + +Until automated token refresh is implemented, use this procedure: + +#### 1. Verify Token Expiration + +Check CloudWatch logs for authentication errors: + +```bash +aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 \ + --follow \ + --filter-pattern "401\|authentication\|token\|expired" +``` + +Look for messages like: +- `HTTP 401: Unauthorized` +- `Failed to register runner` +- `Authentication failed` +- `Token expired` + +#### 2. Clean Up GitHub (Runners and Runner Group) + +#### 2. Clean Up GitHub (Runners and Runner Group) + +Before applying Terraform changes, manually remove stale runners: + +1. **Remove Offline/Stale Runners:** + - Go to: `https://github.e.it.census.gov/organizations/CSVD/settings/actions/runners` + - Find runners with your account label (e.g., `229685449397`) + - Click each runner → **Remove runner** → Confirm + +2. **Delete the Runner Group:** + - Go to: `https://github.e.it.census.gov/organizations/CSVD/settings/actions/runner-groups` + - Find the runner group (e.g., `229685449397`) + - Click the runner group → **Delete runner group** → Confirm + +#### 3. Taint and Apply Terraform + +#### 3. Taint and Apply Terraform + +Taint the resources to force recreation with a fresh token: + +```bash +# Taint the secret to force new token retrieval +terraform taint 'aws_secretsmanager_secret_version.secret' + +# Taint the runner group +terraform taint 'module.github_runner.github_actions_runner_group.runner_group' + +# Taint the ECS service +terraform taint 'module.ecs_service.aws_ecs_service.github_runner' + +# Apply changes - Terraform will: +# 1. Retrieve fresh registration token from GitHub via data source +# 2. Update secret in Secrets Manager automatically +# 3. Recreate runner group in GitHub +# 4. Recreate ECS service and tasks +# 5. New runners will register using the fresh token +terraform apply +``` + +#### 4. Verify Runners Come Online + +Monitor runner registration: + +```bash +# Watch CloudWatch logs +aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 --follow + +# Check ECS task status +aws ecs describe-services \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --services github-runner-service \ + --query 'services[0].{Running:runningCount,Desired:desiredCount,Pending:pendingCount}' +``` + +Expected log output on successful registration: +``` +Runner successfully registered +Runner listening for Jobs +Runner connected to GitHub +``` + +Check GitHub UI: +1. Navigate to `https://github.e.it.census.gov/organizations/CSVD/settings/actions/runners` +2. Verify runners show status "Idle" (not "Offline") + +**Important:** Terraform automatically retrieves and manages the GitHub token via data source. You should never manually update tokens in Terraform variables or AWS Secrets Manager. + +### Handling Stuck ECS Tasks + +If tasks are stuck in `PENDING` or `STOPPED` state and not recovering: + +#### Diagnose the Issue + +```bash +# Check task status and error messages +aws ecs describe-tasks \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --tasks $(aws ecs list-tasks --cluster ecs-ghe-runners-us-gov-east-1 --query 'taskArns[0]' --output text) \ + --query 'tasks[0].{Status:lastStatus,Reason:stoppedReason,Containers:containers[0].reason}' +``` + +Common issues: +- **"CannotPullContainerError"**: ECR access issues or invalid image +- **"ResourceInitializationError"**: Network or security group issues +- **"TaskFailedToStart"**: Task role or execution role permission issues + +#### Solutions + +**1. ECR Access Issues:** +```bash +# Verify ECR image exists +aws ecr describe-images \ + --repository-name github-runner \ + --image-ids imageTag=2.311.0 + +# Check task execution role has ECR permissions +aws iam get-role-policy \ + --role-name ecsTaskExecutionRole \ + --policy-name ECRAccessPolicy +``` + +**2. Network Issues:** +```bash +# Verify security group allows outbound HTTPS +aws ec2 describe-security-groups \ + --group-ids sg-xyz789 \ + --query 'SecurityGroups[0].IpPermissionsEgress' + +# Check subnets have route to NAT Gateway or proxy +aws ec2 describe-route-tables \ + --filters "Name=association.subnet-id,Values=subnet-abc123" +``` + +**3. Force Clean Restart:** + +If tasks remain stuck, scale down to zero and back up: + +```bash +# Scale down to 0 +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --service github-runner-service \ + --desired-count 0 + +# Wait 30 seconds for tasks to stop +sleep 30 + +# Scale back up to desired count +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --service github-runner-service \ + --desired-count 3 +``` + +### Scaling Operations + +#### Scale Up During High Demand + +```bash +# Increase runner count to 10 +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --service github-runner-service \ + --desired-count 10 + +# Or via Terraform +# Update default.auto.tfvars: +# desired_count = 10 +terraform apply +``` + +#### Scale Down During Low Activity + +```bash +# Decrease runner count to 2 +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --service github-runner-service \ + --desired-count 2 +``` + +**Warning:** Scaling down will terminate runners. Active jobs will be cancelled unless they complete before the grace period expires. + +#### Emergency Scale to Zero + +In case of issues requiring all runners to be stopped: + +```bash +# Stop all runners +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --service github-runner-service \ + --desired-count 0 + +# Active workflows will fail +# Runners will deregister from GitHub +``` + +To restore: +```bash +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --service github-runner-service \ + --desired-count 3 +``` + +### Monitoring Runner Health + +#### Check Runner Status + +```bash +# Quick status check +aws ecs describe-services \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --services github-runner-service \ + --query 'services[0].{Desired:desiredCount,Running:runningCount,Pending:pendingCount,Deployments:deployments[*].{Status:status,Running:runningCount,Desired:desiredCount}}' +``` + +Healthy state: +```json +{ + "Desired": 3, + "Running": 3, + "Pending": 0, + "Deployments": [ + { + "Status": "PRIMARY", + "Running": 3, + "Desired": 3 + } + ] +} +``` + +#### Monitor GitHub Runner Registration + +Create a script to check runner status: + +```bash +#!/bin/bash +# check-runners.sh + +EXPECTED_COUNT=3 +ORG="CSVD" +TOKEN="your-github-token" # Or use gh cli + +# Get runner count from GitHub API +RUNNER_COUNT=$(curl -s \ + -H "Authorization: Bearer $TOKEN" \ + https://github.e.it.census.gov/api/v3/orgs/$ORG/actions/runners \ + | jq '[.runners[] | select(.labels[].name | contains("229685449397")) | select(.status == "online")] | length') + +echo "Expected: $EXPECTED_COUNT" +echo "Online: $RUNNER_COUNT" + +if [ "$RUNNER_COUNT" -lt "$EXPECTED_COUNT" ]; then + echo "WARNING: Runners below expected count!" + exit 1 +fi +``` + +#### Set Up CloudWatch Alarms + +Create alarms for runner health monitoring: + +```hcl +resource "aws_cloudwatch_metric_alarm" "runner_count_low" { + alarm_name = "github-runners-count-low-229685449397" + comparison_operator = "LessThanThreshold" + evaluation_periods = 2 + metric_name = "RunningTaskCount" + namespace = "AWS/ECS" + period = 300 + statistic = "Average" + threshold = 2 + alarm_description = "Alert when runner count drops below 2" + + dimensions = { + ServiceName = "github-runner-service" + ClusterName = "ecs-ghe-runners-us-gov-east-1" + } + + alarm_actions = [aws_sns_topic.ops_alerts.arn] +} + +resource "aws_cloudwatch_metric_alarm" "runner_task_failed" { + alarm_name = "github-runners-task-failures-229685449397" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = 1 + metric_name = "TasksStoppedReason" + namespace = "AWS/ECS" + period = 300 + statistic = "Sum" + threshold = 3 + alarm_description = "Alert when multiple runner tasks fail" + + dimensions = { + ServiceName = "github-runner-service" + ClusterName = "ecs-ghe-runners-us-gov-east-1" + } + + alarm_actions = [aws_sns_topic.ops_alerts.arn] +} +``` + +### Common Operational Tasks + +#### View Recent Runner Activity + +```bash +# Check CloudWatch logs for recent job executions +aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 \ + --since 1h \ + --filter-pattern "Job\|Running\|Completed" +``` + +#### Identify Long-Running Jobs + +```bash +# List tasks with start times +aws ecs list-tasks \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --query 'taskArns[]' \ + --output text | xargs -I {} aws ecs describe-tasks \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --tasks {} \ + --query 'tasks[].{TaskArn:taskArn,StartedAt:startedAt,Status:lastStatus}' +``` + +#### Gracefully Drain a Runner + +To remove a runner without interrupting active jobs: + +```bash +# Reduce desired count by 1 +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --service github-runner-service \ + --desired-count 2 + +# ECS will wait for a runner to complete its current job before stopping +# This may take up to 1 hour (default stop timeout) +``` + +### Disaster Recovery + +#### Complete Runner Recovery + +If all runners are offline, unresponsive, or completely broken, follow this complete recovery procedure: + +**Step 1: Manual GitHub Cleanup** + +Before any infrastructure changes, clean up GitHub completely: + +```bash +# 1. Remove all runners from the organization +# URL: https://github.e.it.census.gov/organizations/CSVD/settings/actions/runners + +# For each runner with your account label (e.g., 229685449397): +# - Click the runner +# - Click "Remove runner" +# - Confirm deletion +``` + +```bash +# 2. Delete the runner group +# URL: https://github.e.it.census.gov/organizations/CSVD/settings/actions/runner-groups + +# Find the runner group (e.g., "229685449397") +# - Click the runner group name +# - Click "Delete runner group" +# - Confirm deletion +``` + +**Step 2: Verify GitHub OAuth App (if authentication issues)** + +If runners are failing due to authentication, verify the OAuth App configuration: + +1. Navigate to: **GitHub Organization Settings** → **Developer settings** → **OAuth Apps** +2. Ensure the OAuth App exists and is active +3. Verify Terraform's GitHub provider is authenticated correctly + +**Note:** Terraform automatically retrieves the GitHub token using a data source. You do NOT need to manually update tokens. + +**Step 3: Terraform Taint and Recreate** + +```bash +# Taint GitHub runner group +terraform taint 'module.github_runner.github_actions_runner_group.runner_group' + +# Taint ECS service +terraform taint 'module.ecs_service.aws_ecs_service.github_runner' + +# Review planned changes +terraform plan + +# Apply changes to recreate everything +# Terraform will automatically retrieve a fresh token from GitHub +terraform apply +``` + +**Step 4: Monitor Recovery** + +```bash +# Watch ECS tasks start +watch -n 5 'aws ecs describe-services \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --services github-runner-service \ + --query "services[0].{Desired:desiredCount,Running:runningCount,Pending:pendingCount}"' + +# Monitor runner registration in logs +aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 --follow +``` + +**Step 5: Verify in GitHub** + +Check that runners are online and ready: +1. Go to: `https://github.e.it.census.gov/organizations/CSVD/settings/actions/runners` +2. Verify runner count matches desired count (e.g., 3 runners) +3. Verify all runners show status "Idle" (not "Offline") +4. Verify runner labels are correct (account ID, region, etc.) + +**Expected Timeline:** +- GitHub cleanup: 2-5 minutes +- Terraform apply: 3-5 minutes +- Runner registration: 2-3 minutes +- **Total recovery time: ~10-15 minutes** + +#### Terraform State Recovery + +If Terraform state is corrupted or lost: + +```bash +# 1. Re-import ECS service +terraform import module.ecs_service.aws_ecs_service.main \ + ecs-ghe-runners-us-gov-east-1/github-runner-service + +# 2. Re-import task definition +terraform import module.task_definition.aws_ecs_task_definition.main \ + github-runner-229685449397 + +# 3. Verify state +terraform plan + +# 4. Apply any drift corrections +terraform apply +``` + +## Maintenance + +### Updating Runner Version + +1. Update `image_version` in `default.auto.tfvars`: +```hcl +image_version = "2.312.0" # New version +``` + +2. Apply changes: +```bash +terraform apply +``` + +ECS will perform a rolling update, replacing tasks one at a time. + +### Updating Configuration + +All configuration changes should be made via Terraform: + +1. Modify variables in `default.auto.tfvars` +2. Run `terraform plan` to preview changes +3. Run `terraform apply` to apply changes + +**Never modify resources directly in AWS Console** - changes will be overwritten by Terraform. + +### Backup and Disaster Recovery + +**State Management:** +- Terraform state is stored in S3 backend +- State locking via DynamoDB +- State versioning enabled + +**Recovery Process:** +1. Clone repository +2. Initialize Terraform with existing state +3. Run `terraform plan` to verify state +4. Run `terraform apply` to recreate resources if needed + +## Security Best Practices + +1. **Use Private Subnets**: Deploy runners in private subnets without public IPs +2. **Minimize IAM Permissions**: Grant only necessary permissions to task role +3. **Rotate OAuth App Credentials**: Periodically rotate GitHub OAuth App credentials +4. **Enable VPC Endpoints**: Reduce internet egress and improve security +5. **Monitor Logs**: Regularly review CloudWatch logs for suspicious activity +6. **Update Runner Images**: Keep runner container images up to date +7. **Restrict Security Groups**: Allow only necessary outbound traffic + +## Cost Optimization + +**Fargate Pricing Factors:** +- CPU and memory allocation +- Task run duration +- Number of concurrent tasks + +**Optimization Strategies:** +1. Right-size task CPU/memory for workload +2. Scale `desired_count` based on actual usage +3. Use workflow job timeouts to prevent runaway jobs +4. Enable VPC endpoints to reduce data transfer costs +5. Use caching strategies in workflows to reduce execution time + +## Related Documentation + +- [GitHub Actions Architecture](https://github.e.it.census.gov/CSVD/github-actions/blob/main/GITHUB_ACTIONS_ARCHITECTURE.md) - Overall GitHub Actions ecosystem +- [Composite Actions](https://github.e.it.census.gov/CSVD/github-actions/wiki) - Available reusable actions +- [AWS Permissions Documentation](./AWS_PERMISSIONS.md) - Complete IAM permission requirements and examples +- [Security Review Guide](./SECURITY_REVIEW.md) - Summary for security team reviews and approvals +- [GitHub App Setup](./GITHUB_APP_SETUP.md) - GitHub App authentication configuration +- [AWS ECS Fargate Documentation](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/AWS_Fargate.html) +- [GitHub Actions Self-Hosted Runners](https://docs.github.com/en/actions/hosting-your-own-runners) ## Support -For issues or questions: -1. Check existing GitHub issues -2. Review AWS ECS and GitHub Actions documentation -3. Contact repository maintainers +For assistance: +1. **Infrastructure Issues**: Contact the infrastructure team +2. **Workflow Issues**: Review workflow-specific documentation +3. **GitHub Issues**: Open an issue in this repository +4. **AWS Issues**: Check CloudWatch logs and ECS console ## Contributing +We welcome contributions! Please follow these steps: + 1. Fork the repository -2. Create a feature branch -3. Submit a pull request with detailed description -4. Ensure tests pass and documentation is updated +2. Create a feature branch (`git checkout -b feature/improvement`) +3. Make your changes +4. Test thoroughly in a dev environment +5. Submit a pull request with detailed description +6. Ensure Terraform formatting (`terraform fmt`) +7. Update documentation as needed + +## License + +Internal use only - Census Bureau CSVD organization. diff --git a/README.md.backup b/README.md.backup new file mode 100644 index 0000000..c2d7fc9 --- /dev/null +++ b/README.md.backup @@ -0,0 +1,107 @@ +# ghe-runners +Manage Repo Specific Runners + +# GitHub Actions Runner Setup Guide + +This repository contains infrastructure code to manage repository-specific GitHub Actions runners using AWS ECS Fargate. + +## Prerequisites + +- AWS Account with appropriate permissions +- Terraform installed +- GitHub Organization Admin access +- AWS CLI configured with appropriate credentials +- GitHub Personal Access Token with admin:org permissions + +## Quick Start + +1. Clone this repository: +```bash +git clone +cd ghe-runner +``` + +2. Create a `terraform.tfvars` file with your configuration: +```hcl +repo_org = "your-organization" +aws_account = "your-aws-account-name" +server_url = "https://github.your-domain.com" +vpc_id = "vpc-xxxxxx" +subnets = ["subnet-xxxxx", "subnet-yyyyy"] +security_groups = ["sg-xxxxxx"] +image_name = "github-runner" +image_version = "latest" +desired_count = 2 # Number of runners +``` + +3. Initialize and apply Terraform: +```bash +terraform init +terraform workspace new +terraform plan +terraform apply +``` + +## Configuration Details + +### Required Variables + +- `repo_org`: Your GitHub organization name +- `aws_account`: AWS account identifier +- `server_url`: GitHub Enterprise Server URL +- `vpc_id`: VPC ID where runners will be deployed +- `subnets`: List of subnet IDs for runner deployment +- `security_groups`: Security group IDs for runners +- `image_name`: GitHub runner container image name +- `image_version`: Container image version tag + +### Optional Variables + +- `create_vpc_endpoint`: Set to true to create VPC endpoints (default: false) +- `create_ecs_cluster`: Create new ECS cluster (default: false) +- `assign_public_ip`: Assign public IP to runners (default: false) +- `desired_count`: Number of runner instances (default: 2) + +## Runner Labels + +Runners are automatically configured with the following labels: +- AWS account identifier +- Organization name +- Region +- ECS identifier +- Ubuntu-latest + +## Security Considerations + +1. Runners are deployed in private subnets by default +2. VPC endpoints can be created for ECR/S3 access +3. GitHub token is stored securely in AWS Secrets Manager +4. Runners use IAM roles with least privilege + +## Monitoring + +- CloudWatch Log Group: `/ecs-ghe-runners/{workspace}-{account-id}-{region}` +- Runner logs are retained for 90 days +- ECS task metrics available in CloudWatch + +## Troubleshooting + +1. Check ECS task status in AWS Console +2. Review CloudWatch logs for runner issues +3. Verify GitHub organization permissions +4. Ensure VPC endpoints are accessible +5. Validate security group rules + +## Support + +For issues or questions: +1. Check existing GitHub issues +2. Review AWS ECS and GitHub Actions documentation +3. Contact repository maintainers + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Submit a pull request with detailed description +4. Ensure tests pass and documentation is updated diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..9114f2d --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,352 @@ +# Security Guidelines for ghe-runners + +## GitHub App Authentication Security + +### ⚠️ CRITICAL: Protect GitHub App Private Keys + +This repository uses **GitHub App authentication** instead of Personal Access Tokens. The GitHub App private key (PEM file) provides **administrative access** to your GitHub organization. Improper handling can lead to: +- Unauthorized access to private repositories +- Ability to modify organization settings +- Creation/deletion of runners and runner groups +- Access to organization secrets + +### GitHub App Private Key Security + +#### Storage Best Practices + +**✅ DO:** +- Store PEM files with restrictive permissions (`chmod 600`) +- Keep PEM files outside of version control +- Use absolute paths in Terraform variables +- Store in a secure directory (e.g., `~/.github-apps/`) +- Use AWS Secrets Manager for Lambda production deployments +- Rotate keys every 6-12 months + +**❌ DO NOT:** +- Commit PEM files to version control +- Share PEM files via email, Slack, or messaging +- Store PEM files in shared network drives +- Use the same key across multiple environments +- Give PEM files world-readable permissions + +#### Secure Setup + +```bash +# Create secure directory +mkdir -p ~/.github-apps +chmod 700 ~/.github-apps + +# Move downloaded PEM file +mv ~/Downloads/your-app.*.private-key.pem ~/.github-apps/runner-mgmt.pem + +# Set restrictive permissions +chmod 600 ~/.github-apps/runner-mgmt.pem + +# Verify permissions +ls -la ~/.github-apps/ +# Should show: -rw------- (only owner can read/write) +``` + +### How to Safely Provide GitHub App Credentials + +GitHub App authentication requires three values: +1. **App ID** - Can be stored in `.tfvars` (not sensitive) +2. **Installation ID** - Can be stored in `.tfvars` (not sensitive) +3. **PEM file path** - Reference to secure file location + +#### Option 1: Workspace-Specific .tfvars (Recommended) + +```hcl +# csvd-229685449397-us-gov-east-1.auto.tfvars +github_app_id = "123456" # Safe to commit +github_app_installation_id = "12345678" # Safe to commit +github_app_pem_file = "~/.github-apps/runner-mgmt.pem" # Path only, safe to commit + +# Note: The actual PEM file content is never in version control +``` + +#### Option 2: Environment Variables (For Automation) + +```bash +# Set environment variables +export TF_VAR_github_app_id="123456" +export TF_VAR_github_app_installation_id="12345678" +export TF_VAR_github_app_pem_file="~/.github-apps/runner-mgmt.pem" + +# Run Terraform +terraform plan +terraform apply + +# Unset when done (optional, as these are not sensitive) +unset TF_VAR_github_app_id +unset TF_VAR_github_app_installation_id +unset TF_VAR_github_app_pem_file +``` + +#### Option 3: CI/CD Pipeline + +**GitHub Actions:** +```yaml +- name: Terraform Apply + env: + TF_VAR_github_app_id: ${{ vars.GITHUB_APP_ID }} + TF_VAR_github_app_installation_id: ${{ vars.GITHUB_APP_INSTALLATION_ID }} + TF_VAR_github_app_pem_file: /tmp/app-key.pem + run: | + # Retrieve PEM from secrets and write to file + echo "${{ secrets.GITHUB_APP_PEM }}" > /tmp/app-key.pem + chmod 600 /tmp/app-key.pem + + terraform apply -auto-approve + + # Clean up + rm /tmp/app-key.pem +``` + +**AWS CodeBuild:** +```yaml +env: + variables: + TF_VAR_github_app_id: "123456" + TF_VAR_github_app_installation_id: "12345678" + parameter-store: + TF_VAR_github_app_pem_file: /github-apps/runner-management/pem-path + secrets-manager: + GITHUB_APP_PEM: github-apps/runner-management:pem-content +``` + +### ❌ What NOT to Do + +**DO NOT** commit PEM files: + +```hcl +# ❌ NEVER include PEM file content in Terraform +variable "github_app_pem_file" { + default = <<-EOT + -----BEGIN RSA PRIVATE KEY----- + MIIEpAIBAAKCAQEA... # NEVER DO THIS! + -----END RSA PRIVATE KEY----- + EOT +} + +# ❌ NEVER commit PEM files +# runner-mgmt.pem # NEVER DO THIS! +# *.private-key.pem # NEVER DO THIS! +``` + +### .gitignore Protection + +Ensure your `.gitignore` includes: + +```gitignore +# GitHub App Private Keys +*.pem +*.private-key.pem +.github-apps/ + +# Terraform +.terraform/ +.terraform.lock.hcl +terraform.tfstate +terraform.tfstate.backup + +# Lambda Artifacts +lambda/package/ +lambda/*.zip + +# Sensitive files (if any) +*secret*.tfvars +*credentials*.tfvars +``` + +## GitHub App Requirements + +### Required Permissions + +Your GitHub App must have these permissions: + +**Repository Permissions:** +- **Administration**: Read & write (for managing self-hosted runners) +- **Actions**: Read & write (for generating registration tokens) + +**Organization Permissions:** +- **Self-hosted runners**: Read & write + +### Setup Instructions + +See **[GITHUB_APP_SETUP.md](./GITHUB_APP_SETUP.md)** for complete GitHub App creation and configuration instructions. + +## Lambda Security Considerations + +The Lambda function authenticates using GitHub App credentials and: +1. Call GitHub API to generate fresh registration tokens +2. Update AWS Secrets Manager with new registration tokens + +**Lambda Security Features:** +- Environment variables encrypted at rest by AWS +- IAM role restricts Secrets Manager access to specific secrets +- CloudWatch logs do NOT log the GitHub token (only API responses) +- Function executes in VPC (if configured) + +**Lambda IAM Permissions:** +```json +{ + "Effect": "Allow", + "Action": [ + "secretsmanager:UpdateSecret", + "secretsmanager:GetSecretValue", + "secretsmanager:PutSecretValue" + ], + "Resource": "arn:aws:secretsmanager:region:account:secret:/github-runners/*" +} +``` + +## AWS Credentials + +### Terraform AWS Authentication + +Terraform authenticates to AWS using one of: + +1. **IAM Role (Recommended for EC2/ECS/Lambda)** + ```bash + # No configuration needed - automatic + ``` + +2. **AWS CLI Profile** + ```bash + export AWS_PROFILE=your-profile + terraform apply + ``` + +3. **Environment Variables** + ```bash + export AWS_ACCESS_KEY_ID="your-key" + export AWS_SECRET_ACCESS_KEY="your-secret" + export AWS_SESSION_TOKEN="your-token" # If using temporary credentials + terraform apply + ``` + +**Security Best Practices:** +- ✅ Use IAM roles whenever possible +- ✅ Use temporary credentials (STS) for human access +- ✅ Enable MFA for AWS CLI access +- ❌ Never commit AWS credentials to version control + +## Secrets Manager Security + +Runner registration tokens are stored in AWS Secrets Manager: + +**Secret Path:** `/github-runners/{namespace}/{hostname}-{random_id}` + +**Access Control:** +- Only ECS tasks with specific IAM role can read +- Lambda function can read and update +- Encrypted at rest using AWS KMS +- Encrypted in transit using TLS + +**Secret Rotation:** +- Lambda automatically refreshes registration tokens every 30 minutes +- No manual intervention required +- Old tokens are overwritten (no retention needed) + +## Audit and Monitoring + +### CloudWatch Logs + +Monitor for security events: + +```bash +# Lambda token refresh logs +aws logs tail /aws/lambda/github-runner-token-refresh-{account} --follow + +# ECS runner logs (check for auth failures) +aws logs tail /ecs-ghe-runners/{workspace}-{account-id}-{region} \ + --filter-pattern "401\|403\|authentication\|unauthorized" +``` + +### CloudTrail + +Monitor AWS API calls: + +```bash +# Check Secrets Manager access +aws cloudtrail lookup-events \ + --lookup-attributes AttributeKey=ResourceType,AttributeValue=AWS::SecretsManager::Secret \ + --max-results 50 + +# Check Lambda invocations +aws cloudtrail lookup-events \ + --lookup-attributes AttributeKey=ResourceType,AttributeValue=AWS::Lambda::Function \ + --max-results 50 +``` + +### GitHub Audit Log + +Monitor GitHub organization activity: +1. Navigate to **Organization Settings** → **Audit log** +2. Filter for: + - Runner registration/removal events + - OAuth application usage + - Token generation events + +## Incident Response + +### If GitHub Token is Compromised + +**Immediate Actions:** +1. Revoke the compromised token in GitHub settings +2. Generate a new token +3. Update CI/CD secrets or environment variables +4. Run `terraform apply` with new token +5. Review GitHub audit logs for unauthorized activity +6. Check ECS runner logs for suspicious job executions + +### If AWS Credentials are Compromised + +**Immediate Actions:** +1. Rotate AWS access keys immediately +2. Review CloudTrail logs for unauthorized API calls +3. Check for unauthorized resource creation +4. Update IAM policies if needed +5. Consider using AWS GuardDuty for threat detection + +### If Registration Token is Exposed + +**Risk Level:** Low (tokens expire in ~1 hour) + +**Actions:** +1. Wait for token to expire naturally +2. Lambda will refresh with new token automatically +3. Review runner logs for unauthorized registrations +4. Check GitHub for unexpected runners + +## Compliance + +### Data Classification + +- **GitHub PAT**: Highly Sensitive - Treat as password +- **AWS Credentials**: Highly Sensitive - Treat as password +- **Registration Token**: Sensitive - Short-lived (1 hour) +- **Runner Logs**: May contain sensitive build artifacts + +### Retention Policies + +- CloudWatch Logs: 7 days (Lambda), 90 days (ECS runners) +- Secrets Manager: Current version only +- Terraform State: Stored in S3 with versioning enabled + +### Encryption + +- ✅ GitHub tokens: Encrypted in Lambda environment variables (AWS KMS) +- ✅ Registration tokens: Encrypted in Secrets Manager (AWS KMS) +- ✅ Terraform state: Encrypted in S3 (SSE-S3 or SSE-KMS) +- ✅ CloudWatch logs: Encrypted at rest +- ✅ Data in transit: TLS 1.2+ for all communications + +## Questions? + +For security concerns or questions: +1. Review this document +2. Check AWS Security Best Practices +3. Review GitHub Security documentation +4. Contact the infrastructure security team diff --git a/SECURITY_REVIEW.md b/SECURITY_REVIEW.md new file mode 100644 index 0000000..5033e65 --- /dev/null +++ b/SECURITY_REVIEW.md @@ -0,0 +1,319 @@ +# Security Review Summary - GitHub Actions ECS Runners + +**Quick Reference for Security Teams** + +## Executive Summary + +This document provides a security-focused overview of the GitHub Actions runner infrastructure for security review and approval. + +**Key Security Points:** +- ✅ All infrastructure changes require code review and approval +- ✅ Repository access restricted to infrastructure team members +- ✅ Full audit trail via CloudTrail with workflow attribution +- ✅ Network isolation in private subnets with no public IP +- ✅ Account-based isolation (one runner group per AWS account) +- ✅ Ephemeral runners recreated frequently (no persistent state) +- ⚠️ Runners use administrative IAM permissions (required for IaC, see justification below) + +## Security Model + +### Primary Security Boundary: Repository Access + Code Review + +The security model is **not based on restrictive IAM permissions**, but rather on controlling who can deploy infrastructure changes: + +``` +Developer → Pull Request → Code Review → Approval → Merge → Runner Executes → CloudTrail Audit + ↑ ↑ ↑ ↑ + (Status Checks) (2+ Reviews) (Branch) (Logged) +``` + +**Why This Works:** +1. Only authorized personnel can push to infrastructure repositories (GitHub enforces with MFA) +2. All changes require pull request approval (typically 2+ reviewers) +3. Branch protection prevents bypassing review process +4. Runners only execute approved, reviewed code +5. All API calls logged to CloudTrail with attribution to repository/workflow/commit + +**Industry Standard:** +This is the same model used by: +- Terraform Cloud (HashiCorp) +- Spacelift +- Atlantis +- GitHub Actions (AWS, Google Cloud, Azure official actions) + +### Why Administrative IAM Permissions Are Required + +**Problem:** Infrastructure-as-Code workflows need to create/modify/delete any AWS resource type. + +**Requirements:** +- Terraform provisions EC2, RDS, Lambda, VPC, IAM, S3, CloudWatch, etc. dynamically +- Infrastructure needs evolve constantly; pre-defining permissions blocks deployments +- Creating restrictive policies leads to: + - Constant IAM policy updates before every deployment + - Blocked emergency infrastructure changes + - Operational burden on security team + - Workarounds that reduce security (shadow IT, long-lived credentials) + +**Security Reality:** +- A malicious actor with repository write access can modify Terraform to grant themselves IAM permissions +- Therefore, restricting runner IAM permissions doesn't prevent malicious activity +- The real security boundary is **repository access control**, not IAM permissions + +**Mitigation:** +- Repository access restricted to infrastructure team only +- All changes require peer review (2+ approvers) +- CloudTrail provides full audit trail +- Account isolation limits blast radius +- No developer AWS credentials (runners only) + +## Multi-Layer Defense Strategy + +### Layer 1: Access Control (Who Can Deploy) + +| Control | Implementation | Enforced By | +|---------|----------------|-------------| +| Repository Access | Only infrastructure team members | GitHub permissions | +| MFA Required | All user accounts | GitHub organization policy | +| SAML/SSO | Enterprise authentication | GitHub Enterprise | +| Regular Access Review | Quarterly audit of repository access | Security team | + +### Layer 2: Code Review (What Gets Deployed) + +| Control | Implementation | Enforced By | +|---------|----------------|-------------| +| Pull Request Required | No direct pushes to main | GitHub branch protection | +| Minimum Approvals | 2+ reviewers required | GitHub branch protection | +| CODEOWNERS | Senior engineers must review | `.github/CODEOWNERS` file | +| Status Checks | Tests, security scans must pass | GitHub required checks | +| Dismiss Stale Reviews | New commits invalidate approvals | GitHub branch protection | + +### Layer 3: Isolation (Blast Radius Limitation) + +| Control | Implementation | Benefit | +|---------|----------------|---------| +| Account Isolation | Separate runners per AWS account | Compromise limited to one account | +| Network Isolation | Private subnets, no public IPs | No inbound access to runners | +| Ephemeral Runners | Recreated frequently | No persistent backdoors | +| Runner Groups | One group per account | No cross-account access | + +### Layer 4: Audit & Detection (What Happened) + +| Control | Implementation | Purpose | +|---------|----------------|---------| +| CloudTrail Logging | All API calls logged | Complete audit trail | +| Workflow Attribution | Logs include repo/workflow/commit | Trace changes to source | +| CloudWatch Alarms | Alert on sensitive operations | Detect anomalies | +| GitHub Audit Log | Track repository access changes | Access control monitoring | +| Drift Detection | Automated out-of-band change detection | Identify manual changes | + +## Comparison to Traditional Models + +### Traditional: Developers with AWS Credentials + +**Security Issues:** +- ❌ Credentials can be used from anywhere +- ❌ No mandatory code review for changes +- ❌ Often long-lived (IAM access keys) +- ❌ Difficult to attribute changes +- ❌ Credentials can be leaked/committed +- ❌ Manual changes bypass audit/review + +### This Model: GitHub Actions Runners + +**Security Improvements:** +- ✅ Runners only accessible from private network +- ✅ All changes require code review +- ✅ Short-lived credentials (ECS task role) +- ✅ Full attribution to code changes +- ✅ No developer credentials to leak +- ✅ All changes tracked in git history + +## Compliance Support + +### SOC 2 Type II + +| Control | How We Meet It | +|---------|----------------| +| CC6.1: Logical Access | Repository permissions, MFA, SSO | +| CC6.2: Authentication | GitHub Enterprise with SAML | +| CC6.3: Authorization | Branch protection, CODEOWNERS | +| CC7.2: Monitoring | CloudTrail, CloudWatch alarms | +| CC7.3: Change Management | Pull request workflow with approval | + +### FedRAMP + +| Control | How We Meet It | +|---------|----------------| +| AC-2: Account Management | Quarterly repository access reviews | +| AC-3: Access Enforcement | Branch protection enforces approvals | +| AC-6: Least Privilege | Account isolation limits blast radius | +| AU-2: Audit Events | CloudTrail logs all API calls | +| CM-3: Change Control | All changes via approved pull requests | + +### NIST 800-53 + +- **AU (Audit)**: CloudTrail + GitHub audit log +- **AC (Access Control)**: Repository permissions + branch protection +- **CM (Change Management)**: Pull request workflow +- **SC (System Communications)**: Network isolation in private subnets + +## Red Flags vs Green Lights + +### 🚨 Red Flags (What to Worry About) + +These indicate security issues: +- ❌ Developers with direct AWS console/CLI access +- ❌ Infrastructure repositories with no branch protection +- ❌ Pull requests merged without review +- ❌ No CloudTrail logging or log forwarding +- ❌ Shared credentials between environments +- ❌ Long-lived IAM access keys + +### ✅ Green Lights (Good Security) + +This implementation provides: +- ✅ All infrastructure changes via pull request +- ✅ Required approvals from qualified reviewers +- ✅ Automated security scanning (tfsec, checkov) +- ✅ CloudTrail forwarded to SIEM with alerting +- ✅ Separate accounts per environment +- ✅ No developer AWS credentials + +## Common Security Review Questions + +### Q: Why not restrict IAM permissions? + +**A:** It doesn't improve security and harms operations: +1. Actors with repo access can grant IAM permissions via Terraform +2. Pre-defining permissions blocks deployments as infrastructure evolves +3. Creates false sense of security while harming productivity +4. Industry standard is admin permissions with repository controls + +**If restrictions are required, use:** +- Service Control Policies (SCPs) for organization-wide guardrails +- AWS Control Tower for account-level restrictions +- IAM conditions (e.g., restrict to specific regions) + +### Q: What prevents malicious code from being deployed? + +**A:** Multiple layers: +1. Repository access limited to infrastructure team +2. Branch protection requires 2+ approvals +3. CODEOWNERS enforces senior engineer review +4. Automated security scans in CI/CD +5. CloudTrail audit with alerting + +### Q: How do we audit runner activity? + +**A:** Full CloudTrail integration: +- Every API call logged with runner identity +- Logs include repository, workflow, commit SHA +- Can trace any infrastructure change to pull request +- Logs forwarded to SIEM for analysis +- CloudWatch alarms on sensitive operations + +### Q: What if a runner is compromised? + +**A:** Limited impact: +1. Runners are ephemeral (recreated frequently) +2. Network isolated (private subnet, no inbound access) +3. Account isolation (compromise limited to one account) +4. No persistent state (no place to hide) +5. CloudTrail logging (anomalies detected) + +### Q: How is this better than developers with AWS credentials? + +**A:** Significantly more secure: +- Mandatory code review (vs. optional) +- No credentials to leak +- Full git history of changes +- Automated security scanning +- Network isolation +- Short-lived credentials + +## Recommendations for Approval + +### Before Approving, Verify: + +1. **Repository Access Controls** + - [ ] Write access limited to infrastructure team + - [ ] GitHub organization requires MFA + - [ ] SSO/SAML configured (if required) + +2. **Branch Protection Rules** + - [ ] Main branches protected + - [ ] Minimum 2 approvals required + - [ ] CODEOWNERS file enforced + - [ ] Status checks required + +3. **Audit Logging** + - [ ] CloudTrail enabled in all accounts + - [ ] Logs forwarded to SIEM + - [ ] Retention policy configured (1+ years) + - [ ] Alarms on sensitive operations + +4. **Account Isolation** + - [ ] Separate accounts for dev/staging/prod + - [ ] Dedicated runner groups per account + - [ ] No cross-account role assumption + +5. **Network Security** + - [ ] Runners in private subnets + - [ ] No public IP addresses + - [ ] Egress via corporate proxy + - [ ] Security groups restrict traffic + +### Don't Require: + +- ❌ Restrictive IAM policies on task role (doesn't improve security for IaC) +- ❌ Manual approval for every deployment (defeats automation purpose) +- ❌ Pre-approval of every AWS service (blocks agile infrastructure development) + +### Do Require: + +- ✅ Quarterly access reviews +- ✅ Security scanning in CI/CD pipelines +- ✅ CloudTrail log monitoring and alerting +- ✅ Incident response plan for detected anomalies + +## Conclusion + +This implementation follows industry best practices for Infrastructure-as-Code automation: + +1. **Security through access control and code review** (not restrictive IAM) +2. **Full audit trail** (CloudTrail + git history) +3. **Defense in depth** (multiple layers of controls) +4. **Compliance support** (meets SOC 2, FedRAMP, NIST 800-53 requirements) +5. **Better than alternatives** (more secure than developer AWS credentials) + +**Approval Decision:** + +The administrative IAM permissions are: +- ✅ **Necessary** for Infrastructure-as-Code operations +- ✅ **Industry standard** (Terraform Cloud, Spacelift, Atlantis) +- ✅ **Properly controlled** through repository access and code review +- ✅ **Fully auditable** through CloudTrail and git history +- ✅ **Compliant** with security frameworks (SOC 2, FedRAMP) + +**Recommended Action:** Approve with conditions: +1. Enforce branch protection rules on all infrastructure repositories +2. Require minimum 2 approvals for all infrastructure changes +3. Enable CloudTrail in all accounts with log forwarding to SIEM +4. Conduct quarterly access reviews of repository permissions +5. Implement CloudWatch alarms for sensitive API calls + +--- + +## Additional Resources + +- **[AWS_PERMISSIONS.md](./AWS_PERMISSIONS.md)** - Complete permission documentation +- **[README.md](./README.md)** - Deployment and operations guide +- **[GITHUB_APP_SETUP.md](./GITHUB_APP_SETUP.md)** - GitHub App authentication setup +- **[SECURITY.md](./SECURITY.md)** - General security documentation + +## Contact + +For security review questions or concerns: +- Infrastructure Team: [infrastructure@example.com] +- Security Team: [security@example.com] +- Documentation: https://github.e.it.census.gov/CSVD/github-actions-docs diff --git a/backend.tf b/backend.tf index 7282c36..c399825 100644 --- a/backend.tf +++ b/backend.tf @@ -1,7 +1,7 @@ terraform { backend "s3" { - bucket = "inf-tfstate-229685449397" - key = "csvd-dev-gov/common/apps/ghe-runner" - region = "us-gov-east-1" + bucket = "inf-tfstate-229685449397" + key = "csvd-dev-gov/common/apps/ghe-runner" + region = "us-gov-east-1" } } diff --git a/ecs_cluster.tf b/ecs_cluster.tf index 140cefb..1025bfd 100644 --- a/ecs_cluster.tf +++ b/ecs_cluster.tf @@ -19,5 +19,6 @@ data "aws_ecs_cluster" "github-runner" { } locals { - ecs_cluster = var.create_ecs_cluster ? one(aws_ecs_cluster.github-runner) : merge(one(data.aws_ecs_cluster.github-runner), { name = one(data.aws_ecs_cluster.github-runner).cluster_name }) + ecs_cluster = var.create_ecs_cluster ? one(aws_ecs_cluster.github-runner) : merge(one(data.aws_ecs_cluster.github-runner), { name = one(data.aws_ecs_cluster.github-runner).cluster_name }) + ecs_cluster_name = var.create_ecs_cluster ? one(aws_ecs_cluster.github-runner[*].name) : one(data.aws_ecs_cluster.github-runner[*].cluster_name) } diff --git a/example.auto.tfvars b/example.auto.tfvars new file mode 100644 index 0000000..013d383 --- /dev/null +++ b/example.auto.tfvars @@ -0,0 +1,90 @@ +# Example Terraform Variables Configuration +# Copy this file to a workspace-specific .tfvars file and customize +# Example: csvd-229685449397-us-gov-east-1.auto.tfvars + +# ============================================================================= +# GitHub App Authentication (Required) +# ============================================================================= +# See GITHUB_APP_SETUP.md for setup instructions +# These values are organization-specific and must be configured per workspace + +github_app_id = "123456" # Your GitHub App ID +github_app_installation_id = "12345678" # Installation ID for your org +github_app_pem_file = "~/.github-apps/runner-mgmt.pem" # Path to private key + +# ============================================================================= +# GitHub Configuration (Required) +# ============================================================================= + +repo_org = "CSVD" # GitHub organization name +server_url = "https://github.e.it.census.gov" # GitHub Enterprise URL + +# ============================================================================= +# AWS Configuration (Required) +# ============================================================================= + +aws_account = "csvd-dev-ew" # AWS account identifier +region = "us-gov-east-1" # AWS region + +# Network Configuration +vpc_id = "vpc-0abc123def456789" # VPC ID for runner deployment +subnets = [ # Private subnet IDs + "subnet-0abc123", + "subnet-0def456" +] +security_groups = ["sg-0xyz789abc"] # Security group IDs + +# ============================================================================= +# Runner Configuration (Required) +# ============================================================================= + +image_name = "github-runner" # Container image name +image_version = "2.311.0" # GitHub Actions runner version +desired_count = 3 # Number of concurrent runners + +# ============================================================================= +# Task Configuration (Optional) +# ============================================================================= + +task_cpu = 1024 # Task CPU (1 vCPU = 1024) +task_memory = 2048 # Task memory in MB + +# ============================================================================= +# Labels Configuration (Optional) +# ============================================================================= +# Additional labels for runner identification in workflows +# Default labels are automatically added: account ID, account name, region + +labels = [ + "ecs", + "fargate", + "self-hosted" +] + +# ============================================================================= +# Network Configuration (Optional) +# ============================================================================= + +assign_public_ip = false # Assign public IP to tasks +proxy_enabled = true # Enable corporate proxy +proxy_url = "proxy.tco.census.gov:3128" # Proxy URL + +# VPC Endpoints (reduces NAT Gateway costs) +create_vpc_endpoint = false # Create VPC endpoints for AWS services + +# ============================================================================= +# Monitoring Configuration (Optional) +# ============================================================================= + +log_retention_days = 7 # CloudWatch log retention + +# ============================================================================= +# Tags (Optional) +# ============================================================================= + +tags = { + Environment = "development" + ManagedBy = "Terraform" + Project = "GitHub Actions Runners" + Owner = "DevOps Team" +} diff --git a/lambda/README.md b/lambda/README.md new file mode 100644 index 0000000..bcf371c --- /dev/null +++ b/lambda/README.md @@ -0,0 +1,256 @@ +# GitHub Runner Token Refresh Lambda + +This Lambda function automatically refreshes GitHub Actions registration tokens using **GitHub App authentication** to prevent expiration issues. + +## Overview + +GitHub Actions registration tokens expire after approximately 1 hour. This Lambda function: +- Runs every 30 minutes (triggered by EventBridge) +- Authenticates using **GitHub App** (JWT-based, more secure than PATs) +- Retrieves a fresh registration token from the GitHub API +- Updates the token in AWS Secrets Manager +- Ensures runners can always register successfully + +### Authentication Flow + +The function uses **GitHub App authentication** instead of Personal Access Tokens: + +1. **Generate JWT**: Creates a JSON Web Token from the GitHub App private key (PEM file) +2. **Get Installation Token**: Exchanges JWT for an installation access token +3. **Get Registration Token**: Uses installation token to generate a runner registration token +4. **Update Secrets Manager**: Stores the new registration token for ECS tasks to use + +This three-step flow follows GitHub's recommended security practices and provides better auditability. + +## Architecture + +``` +EventBridge (every 30 min) + ↓ + Lambda Function + ├─ 1. Generate JWT from PEM file + ├─ 2. Get GitHub App installation token + ├─ 3. Get runner registration token + └─ 4. Update Secrets Manager + ↓ + ECS Runners (read token on startup) +``` + +## Requirements + +### Python Dependencies + +The Lambda function requires these Python packages (see `requirements.txt`): + +- `PyJWT>=2.8.0` - JSON Web Token generation +- `cryptography>=41.0.0` - Cryptographic operations for JWT + +These are automatically packaged with the Lambda deployment via Terraform. + +### Environment Variables + +The Lambda function requires these environment variables (automatically set by Terraform): + +- `GITHUB_APP_ID`: GitHub App ID +- `GITHUB_APP_INSTALLATION_ID`: GitHub App Installation ID for the organization +- `GITHUB_APP_PEM_FILE`: Path to GitHub App private key (PEM file) +- `GITHUB_ORG`: GitHub organization name (e.g., "CSVD") +- `GITHUB_URL`: GitHub Enterprise URL (e.g., "https://github.e.it.census.gov") +- `SECRET_NAME`: AWS Secrets Manager secret name/ARN + +### IAM Permissions + +The Lambda execution role needs: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "secretsmanager:UpdateSecret", + "secretsmanager:GetSecretValue", + "secretsmanager:PutSecretValue" + ], + "Resource": "arn:aws:secretsmanager:region:account:secret:secret-name" + }, + { + "Effect": "Allow", + "Action": [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + "Resource": "arn:aws:logs:*:*:*" + } + ] +} +``` + +## Deployment + +The Lambda function is automatically deployed via Terraform when you apply the `ghe-runner` configuration: + +```bash +terraform apply +``` + +Terraform creates: +- Lambda function +- IAM role and policies +- EventBridge schedule rule (30-minute interval) +- CloudWatch log group +- CloudWatch alarm for failures + +## Monitoring + +### CloudWatch Logs + +View Lambda execution logs: + +```bash +aws logs tail /aws/lambda/github-runner-token-refresh-229685449397 --follow +``` + +### CloudWatch Metrics + +Monitor Lambda performance: + +```bash +# Invocation count +aws cloudwatch get-metric-statistics \ + --namespace AWS/Lambda \ + --metric-name Invocations \ + --dimensions Name=FunctionName,Value=github-runner-token-refresh-229685449397 \ + --start-time 2024-01-01T00:00:00Z \ + --end-time 2024-01-02T00:00:00Z \ + --period 3600 \ + --statistics Sum + +# Error count +aws cloudwatch get-metric-statistics \ + --namespace AWS/Lambda \ + --metric-name Errors \ + --dimensions Name=FunctionName,Value=github-runner-token-refresh-229685449397 \ + --start-time 2024-01-01T00:00:00Z \ + --end-time 2024-01-02T00:00:00Z \ + --period 3600 \ + --statistics Sum +``` + +### CloudWatch Alarms + +An alarm is automatically created to notify when Lambda errors occur: +- Alarm: `github-runner-token-refresh-{account}-errors` +- Threshold: > 1 error in 2 consecutive 5-minute periods + +## Testing + +### Manual Invocation + +Test the Lambda function manually: + +```bash +aws lambda invoke \ + --function-name github-runner-token-refresh-229685449397 \ + --payload '{}' \ + response.json + +cat response.json +``` + +Expected output: +```json +{ + "statusCode": 200, + "body": "{\"message\": \"Token refreshed successfully\", \"secret_name\": \"/github-runners/csvd/runner-xyz\", \"github_org\": \"CSVD\"}" +} +``` + +### Verify Token Update + +Check that Secrets Manager was updated: + +```bash +aws secretsmanager get-secret-value \ + --secret-id /github-runners/csvd/runner-xyz \ + --query 'SecretString' \ + --output text +``` + +### Test Runner Registration + +After token refresh, verify runners can register: + +```bash +# Force new ECS deployment to test with fresh token +aws ecs update-service \ + --cluster ecs-ghe-runners-us-gov-east-1 \ + --service github-runner-service \ + --force-new-deployment + +# Watch logs for successful registration +aws logs tail /ecs-ghe-runners/229685449397-229685449397-us-gov-east-1 --follow +``` + +## Troubleshooting + +### Lambda Fails with 401 Unauthorized + +**Cause**: GitHub token invalid or lacks permissions + +**Solution**: +1. Verify GitHub token has `admin:org` scope +2. Check token hasn't expired +3. Update Lambda environment variable: + +```bash +aws lambda update-function-configuration \ + --function-name github-runner-token-refresh-229685449397 \ + --environment "Variables={GITHUB_TOKEN=new-token,GITHUB_ORG=CSVD,GITHUB_URL=https://github.e.it.census.gov,SECRET_NAME=/github-runners/csvd/runner-xyz}" +``` + +### Lambda Fails to Update Secret + +**Cause**: IAM permissions insufficient + +**Solution**: Verify Lambda role has `secretsmanager:PutSecretValue` permission + +### Token Still Expires + +**Cause**: Lambda not running frequently enough + +**Solution**: Reduce EventBridge schedule interval: + +```bash +# Update to every 15 minutes +aws events put-rule \ + --name github-runner-token-refresh-229685449397-schedule \ + --schedule-expression "rate(15 minutes)" +``` + +## Benefits + +With automated token refresh: + +✅ **No manual intervention**: Tokens refresh automatically every 30 minutes +✅ **Reliable runner startup**: Tokens always valid when runners start +✅ **Simple operations**: Use `force-new-deployment` for quick refreshes +✅ **No Terraform taint needed**: Skip manual GitHub cleanup in most cases +✅ **Reduced downtime**: Runners can be restarted anytime + +## Cost + +Estimated monthly cost: +- Lambda invocations: 1,440 per month (every 30 min) +- Duration: ~2 seconds per invocation +- Memory: 128 MB +- **Total: < $0.01/month** (within AWS Free Tier) + +## Security Considerations + +- GitHub token stored in Lambda environment (encrypted at rest) +- Consider using AWS Systems Manager Parameter Store for token storage +- Lambda logs may contain sensitive data - ensure log retention is appropriate +- CloudWatch logs retained for 7 days by default diff --git a/lambda/requirements.txt b/lambda/requirements.txt new file mode 100644 index 0000000..b2a9e0f --- /dev/null +++ b/lambda/requirements.txt @@ -0,0 +1,2 @@ +PyJWT>=2.8.0 +cryptography>=41.0.0 diff --git a/lambda/token_refresh.py b/lambda/token_refresh.py new file mode 100644 index 0000000..f143710 --- /dev/null +++ b/lambda/token_refresh.py @@ -0,0 +1,292 @@ +""" +Lambda function to refresh GitHub Actions runner registration tokens. + +This function is triggered by EventBridge every 30 minutes to refresh +the GitHub Actions registration token stored in AWS Secrets Manager. +This prevents token expiration issues that cause runners to fail registration. + +Authentication: Uses GitHub App for secure, auditable API access. +""" + +import boto3 +import json +import os +import time +import urllib3 +from typing import Dict, Any +import jwt # PyJWT library for JWT token generation + +# Initialize AWS clients +secrets_manager = boto3.client('secretsmanager') +http = urllib3.PoolManager() + + +def generate_jwt_token(app_id: str, pem_file_content: str) -> str: + """ + Generate a JWT token for GitHub App authentication. + + Args: + app_id: GitHub App ID + pem_file_content: Contents of the GitHub App private key PEM file + + Returns: + JWT token string + """ + # Current time and expiration (10 minutes from now, max allowed by GitHub) + now = int(time.time()) + expiration = now + (10 * 60) + + payload = { + 'iat': now, + 'exp': expiration, + 'iss': app_id + } + + # Generate JWT using RS256 algorithm + token = jwt.encode(payload, pem_file_content, algorithm='RS256') + + return token + + +def get_installation_access_token(github_url: str, app_id: str, installation_id: str, pem_file_content: str) -> str: + """ + Get an installation access token for the GitHub App. + + Args: + github_url: Base GitHub Enterprise URL + app_id: GitHub App ID + installation_id: GitHub App Installation ID + pem_file_content: Contents of the GitHub App private key PEM file + + Returns: + Installation access token + + Raises: + Exception: If token generation fails + """ + # Step 1: Generate JWT + jwt_token = generate_jwt_token(app_id, pem_file_content) + + # Step 2: Get installation access token + api_url = f"{github_url}/api/v3/app/installations/{installation_id}/access_tokens" + + headers = { + 'Authorization': f'Bearer {jwt_token}', + 'Accept': 'application/vnd.github.v3+json', + 'User-Agent': 'AWS-Lambda-GitHub-Runner-Token-Refresh' + } + + print(f"Requesting installation access token from: {api_url}") + + response = http.request( + 'POST', + api_url, + headers=headers + ) + + if response.status == 201: + data = json.loads(response.data.decode('utf-8')) + access_token = data.get('token') + expires_at = data.get('expires_at') + + print(f"Successfully generated installation access token (expires: {expires_at})") + return access_token + else: + error_msg = f"Failed to get installation token. Status {response.status}: {response.data.decode('utf-8')}" + print(error_msg) + raise Exception(error_msg) + + +def get_github_registration_token(github_url: str, org: str, access_token: str) -> str: + """ + Retrieve a fresh GitHub Actions registration token from the GitHub API. + + Args: + github_url: Base GitHub Enterprise URL + org: GitHub organization name + access_token: GitHub App installation access token + + Returns: + Fresh registration token + + Raises: + Exception: If GitHub API request fails + """ + api_url = f"{github_url}/api/v3/orgs/{org}/actions/runners/registration-token" + + headers = { + 'Authorization': f'token {access_token}', + 'Accept': 'application/vnd.github.v3+json', + 'User-Agent': 'AWS-Lambda-GitHub-Runner-Token-Refresh' + } + + print(f"Requesting registration token from: {api_url}") + + response = http.request( + 'POST', + api_url, + headers=headers + ) + + if response.status == 201: + data = json.loads(response.data.decode('utf-8')) + token = data.get('token') + expires_at = data.get('expires_at') + + print(f"Successfully retrieved registration token (expires: {expires_at})") + return token + else: + error_msg = f"GitHub API request failed with status {response.status}: {response.data.decode('utf-8')}" + print(error_msg) + raise Exception(error_msg) + + +def update_secrets_manager(secret_name: str, new_token: str) -> None: + """ + Update the GitHub registration token in AWS Secrets Manager. + + Args: + secret_name: Name/ARN of the secret in Secrets Manager + new_token: New registration token to store + + Raises: + Exception: If Secrets Manager update fails + """ + try: + print(f"Updating secret: {secret_name}") + + secrets_manager.put_secret_value( + SecretId=secret_name, + SecretString=new_token + ) + + print(f"Successfully updated secret: {secret_name}") + + except Exception as e: + error_msg = f"Failed to update Secrets Manager: {str(e)}" + print(error_msg) + raise Exception(error_msg) + + +def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]: + """ + Lambda handler function triggered by EventBridge. + + Environment Variables Required: + GITHUB_APP_ID: GitHub App ID + GITHUB_APP_INSTALLATION_ID: GitHub App Installation ID + GITHUB_APP_PEM_FILE: Path to GitHub App private key PEM file + GITHUB_ORG: GitHub organization name + GITHUB_URL: GitHub Enterprise base URL + SECRET_NAME: Name/ARN of the secret in Secrets Manager + + Args: + event: EventBridge event (not used) + context: Lambda context object + + Returns: + Response dict with status code and message + """ + print("=== GitHub Runner Token Refresh Lambda ===") + print(f"Request ID: {context.request_id}") + print(f"Function: {context.function_name}") + + # Get environment variables + github_app_id = os.environ.get('GITHUB_APP_ID') + github_app_installation_id = os.environ.get('GITHUB_APP_INSTALLATION_ID') + github_app_pem_file = os.environ.get('GITHUB_APP_PEM_FILE') + github_org = os.environ.get('GITHUB_ORG') + github_url = os.environ.get('GITHUB_URL') + secret_name = os.environ.get('SECRET_NAME') + + # Validate environment variables + required_vars = { + 'GITHUB_APP_ID': github_app_id, + 'GITHUB_APP_INSTALLATION_ID': github_app_installation_id, + 'GITHUB_APP_PEM_FILE': github_app_pem_file, + 'GITHUB_ORG': github_org, + 'GITHUB_URL': github_url, + 'SECRET_NAME': secret_name + } + + missing = [var for var, val in required_vars.items() if not val] + if missing: + error_msg = f"Missing required environment variables: {', '.join(missing)}" + print(error_msg) + return { + 'statusCode': 500, + 'body': json.dumps({'error': error_msg}) + } + + # All required environment variables are present, assert non-None for type safety + assert github_app_id is not None + assert github_app_installation_id is not None + assert github_app_pem_file is not None + assert github_org is not None + assert github_url is not None + assert secret_name is not None + + github_app_id_str: str = str(github_app_id) + github_app_installation_id_str: str = str(github_app_installation_id) + github_app_pem_file_str: str = str(github_app_pem_file) + github_org_str: str = str(github_org) + github_url_str: str = str(github_url) + secret_name_str: str = str(secret_name) + + print(f"GitHub App ID: {github_app_id_str}") + print(f"GitHub Installation ID: {github_app_installation_id_str}") + print(f"GitHub Org: {github_org_str}") + print(f"GitHub URL: {github_url_str}") + print(f"Secret Name: {secret_name_str}") + + try: + # Step 1: Read GitHub App private key from file + print(f"Reading GitHub App PEM file: {github_app_pem_file_str}") + try: + with open(github_app_pem_file_str, 'r') as pem_file: + pem_file_content = pem_file.read() + except Exception as e: + error_msg = f"Failed to read PEM file: {str(e)}" + print(error_msg) + return { + 'statusCode': 500, + 'body': json.dumps({'error': error_msg}) + } + + # Step 2: Get GitHub App installation access token + print("Generating GitHub App installation access token...") + access_token = get_installation_access_token( + github_url_str, + github_app_id_str, + github_app_installation_id_str, + pem_file_content + ) + + # Step 3: Get fresh registration token from GitHub + print("Requesting runner registration token...") + registration_token = get_github_registration_token(github_url_str, github_org_str, access_token) + + # Step 4: Update Secrets Manager with new token + update_secrets_manager(secret_name_str, registration_token) + + success_msg = "Token refreshed successfully" + print(f"=== {success_msg} ===") + + return { + 'statusCode': 200, + 'body': json.dumps({ + 'message': success_msg, + 'secret_name': secret_name_str, + 'github_org': github_org_str, + 'github_app_id': github_app_id_str + }) + } + + except Exception as e: + error_msg = f"Token refresh failed: {str(e)}" + print(f"=== ERROR: {error_msg} ===") + + return { + 'statusCode': 500, + 'body': json.dumps({'error': error_msg}) + } diff --git a/lambda_token_refresh.tf b/lambda_token_refresh.tf new file mode 100644 index 0000000..ad8b466 --- /dev/null +++ b/lambda_token_refresh.tf @@ -0,0 +1,185 @@ +# Lambda function to automatically refresh GitHub Actions registration tokens +# This prevents token expiration issues by refreshing the token every 30 minutes + +locals { + lambda_function_name = "github-runner-token-refresh-${var.aws_account}" +} + +# Install Python dependencies locally for Lambda packaging +resource "null_resource" "lambda_dependencies" { + triggers = { + requirements = filemd5("${path.module}/lambda/requirements.txt") + source_code = filemd5("${path.module}/lambda/token_refresh.py") + } + + provisioner "local-exec" { + command = <<-EOT + cd ${path.module}/lambda + rm -rf package + mkdir -p package + pip3 install --target package -r requirements.txt --platform manylinux2014_x86_64 --only-binary=:all: + cp token_refresh.py package/ + EOT + } +} + +# Create ZIP file for Lambda deployment with dependencies +data "archive_file" "token_refresh_lambda" { + type = "zip" + source_dir = "${path.module}/lambda/package" + output_path = "${path.module}/lambda/token_refresh.zip" + + depends_on = [null_resource.lambda_dependencies] +} + +# Lambda function +resource "aws_lambda_function" "token_refresh" { + filename = data.archive_file.token_refresh_lambda.output_path + function_name = local.lambda_function_name + role = aws_iam_role.lambda_refresh_role.arn + handler = "token_refresh.lambda_handler" + source_code_hash = data.archive_file.token_refresh_lambda.output_base64sha256 + runtime = "python3.11" + timeout = 60 + + environment { + variables = { + GITHUB_APP_ID = var.github_app_id + GITHUB_APP_INSTALLATION_ID = var.github_app_installation_id + GITHUB_APP_PEM_FILE = var.github_app_pem_file + GITHUB_ORG = var.repo_org + GITHUB_URL = var.server_url + SECRET_NAME = aws_secretsmanager_secret.secret.name + } + } + + tags = { + Name = local.lambda_function_name + Environment = var.aws_account + Purpose = "GitHub Runner Token Refresh" + } +} + +# CloudWatch Event Rule - trigger every 30 minutes +resource "aws_cloudwatch_event_rule" "token_refresh_schedule" { + name = "${local.lambda_function_name}-schedule" + description = "Refresh GitHub runner registration token every 30 minutes" + schedule_expression = "rate(30 minutes)" + + tags = { + Name = "${local.lambda_function_name}-schedule" + Environment = var.aws_account + } +} + +# CloudWatch Event Target +resource "aws_cloudwatch_event_target" "token_refresh_target" { + rule = aws_cloudwatch_event_rule.token_refresh_schedule.name + target_id = "RefreshTokenLambda" + arn = aws_lambda_function.token_refresh.arn +} + +# Allow EventBridge to invoke Lambda +resource "aws_lambda_permission" "allow_eventbridge" { + statement_id = "AllowExecutionFromEventBridge" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.token_refresh.function_name + principal = "events.amazonaws.com" + source_arn = aws_cloudwatch_event_rule.token_refresh_schedule.arn +} + +# IAM Role for Lambda +resource "aws_iam_role" "lambda_refresh_role" { + name = "${local.lambda_function_name}-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "lambda.amazonaws.com" + } + }] + }) + + tags = { + Name = "${local.lambda_function_name}-role" + Environment = var.aws_account + } +} + +# IAM Policy for Lambda +resource "aws_iam_role_policy" "lambda_refresh_policy" { + name = "token-refresh-policy" + role = aws_iam_role.lambda_refresh_role.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [ + { + Effect = "Allow" + Action = [ + "secretsmanager:UpdateSecret", + "secretsmanager:GetSecretValue", + "secretsmanager:PutSecretValue" + ] + Resource = aws_secretsmanager_secret.secret.arn + }, + { + Effect = "Allow" + Action = [ + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents" + ] + Resource = "arn:aws:logs:*:*:log-group:/aws/lambda/${local.lambda_function_name}:*" + } + ] + }) +} + +# CloudWatch Log Group for Lambda +resource "aws_cloudwatch_log_group" "lambda_logs" { + name = "/aws/lambda/${local.lambda_function_name}" + retention_in_days = 7 + + tags = { + Name = "${local.lambda_function_name}-logs" + Environment = var.aws_account + } +} + +# CloudWatch Alarm for Lambda failures +resource "aws_cloudwatch_metric_alarm" "lambda_errors" { + alarm_name = "${local.lambda_function_name}-errors" + comparison_operator = "GreaterThanThreshold" + evaluation_periods = 2 + metric_name = "Errors" + namespace = "AWS/Lambda" + period = 300 + statistic = "Sum" + threshold = 1 + alarm_description = "Alert when Lambda token refresh fails" + treat_missing_data = "notBreaching" + + dimensions = { + FunctionName = aws_lambda_function.token_refresh.function_name + } + + tags = { + Name = "${local.lambda_function_name}-errors" + Environment = var.aws_account + } +} + +# Output Lambda function details +output "lambda_token_refresh_function_name" { + description = "Name of the Lambda function that refreshes GitHub tokens" + value = aws_lambda_function.token_refresh.function_name +} + +output "lambda_token_refresh_schedule" { + description = "Schedule for automatic token refresh" + value = aws_cloudwatch_event_rule.token_refresh_schedule.schedule_expression +} diff --git a/main.tf b/main.tf index 31c447c..7fdc3d6 100644 --- a/main.tf +++ b/main.tf @@ -8,10 +8,6 @@ # name = "${var.ecs_cluster_name}-${data.aws_region.current.name}" # } -# locals { -# ecs_cluster = var.create_ecs_cluster ? one(aws_ecs_cluster.github-runner).name : one(data.aws_ecs_cluster.github-runner).cluster_name -# } - data "aws_ip_ranges" "ip_ranges" { regions = ["us-gov-west-1", "us-gov-east-1"] services = ["s3", "dynamodb"] @@ -78,7 +74,7 @@ resource "aws_vpc_endpoint" "ecr" { } resource "aws_ecs_cluster_capacity_providers" "fargate" { - cluster_name = local.ecs_cluster.name + cluster_name = local.ecs_cluster_name capacity_providers = ["FARGATE"] @@ -99,33 +95,41 @@ locals { module "ecr-clone" { - source = "HappyPathway/ecr-clone/aws" + count = var.enable_ecr_clone ? 1 : 0 + source = "HappyPathway/ecr-clone/aws" registry_name = "github-runners" + # public.ecr.aws/h1g9x7n8/github-runner:1.69.0 image_config = [ { - enabled = true - dest_path = null - name = var.image_name - source_image = "h1g9x7n8/${var.image_name}" - source_registry = "public.ecr.aws" - source_tag = var.image_version - tag = var.image_version + enabled = true + dest_path = null + name = var.image_name + source_image = "h1g9x7n8/${var.image_name}" + source_registry = "public.ecr.aws" + source_tag = var.image_version + tag = var.image_version } ] - tags = {} + tags = {} } +locals { + # Define image URL based on whether ecr-clone is enabled + runner_image = var.enable_ecr_clone ? ( + "${data.aws_caller_identity.current.account_id}.dkr.ecr.${data.aws_region.current.name}.amazonaws.com/github-runners/${var.image_name}:${var.image_version}" + ) : ( + "public.ecr.aws/h1g9x7n8/${var.image_name}:${var.image_version}" + ) +} module "github-runner" { - # for_each = toset([for repo in local.all_repos : repo]) source = "HappyPathway/github-runner/ecs" - ecs_cluster = local.ecs_cluster.name + ecs_cluster = local.ecs_cluster_name hostname = var.repo_org - image = "${data.aws_caller_identity.current.account_id}.dkr.ecr.${data.aws_region.current.name}.amazonaws.com/github-runners/${var.image_name}:${var.image_version}" + image = local.runner_image repo_org = var.repo_org - # repo_name = each.value - namespace = "${lower(var.repo_org)}-${data.aws_caller_identity.current.account_id}-${data.aws_region.current.name}" - log_group = aws_cloudwatch_log_group.function_log_group.name + namespace = "${lower(var.repo_org)}-${data.aws_caller_identity.current.account_id}-${data.aws_region.current.name}" + log_group = aws_cloudwatch_log_group.function_log_group.name runner_group = { create = true name = data.aws_caller_identity.current.account_id diff --git a/outputs.tf b/outputs.tf new file mode 100644 index 0000000..2028cfc --- /dev/null +++ b/outputs.tf @@ -0,0 +1,99 @@ +# ECS Cluster Information +output "ecs_cluster_name" { + description = "The name of the ECS cluster" + value = local.ecs_cluster_name +} + +output "ecs_cluster_arn" { + description = "The ARN of the ECS cluster" + value = var.create_ecs_cluster ? aws_ecs_cluster.github-runner[0].arn : data.aws_ecs_cluster.github-runner[0].arn +} + +# GitHub Runner Service Information +output "github_runner_service_name" { + description = "The name of the GitHub runner ECS service" + value = var.repo_org # Using the repo_org since that's set as the hostname in the module call +} + +# Task Definition Information +output "task_definition_family" { + description = "The family of the Task Definition" + value = module.github-runner.runner_task_definition.family +} + +output "task_definition_arn" { + description = "The full ARN of the Task Definition" + value = module.github-runner.runner_task_definition.arn +} + +output "task_definition_revision" { + description = "The revision of the task in a particular family" + value = module.github-runner.runner_task_definition.revision +} + +# CloudWatch Logs Information +output "log_group_name" { + description = "The name of the CloudWatch Log Group for the runner" + value = aws_cloudwatch_log_group.function_log_group.name +} + +output "log_group_arn" { + description = "The ARN of the CloudWatch Log Group" + value = aws_cloudwatch_log_group.function_log_group.arn +} + +# GitHub Runner Information +output "runner_group_name" { + description = "The name of the GitHub runner group" + value = module.github-runner.runner_group +} + +output "runner_group_id" { + description = "The ID of the GitHub runner group" + value = module.github-runner.runner_group +} + +output "runner_labels" { + description = "The labels assigned to the GitHub runner" + value = local.labels # Using local.labels defined in main.tf +} + +# AWS Environment Information +output "aws_region" { + description = "The AWS region where resources are deployed" + value = data.aws_region.current.name +} + +output "aws_account_id" { + description = "The AWS account ID where resources are deployed" + value = data.aws_caller_identity.current.account_id +} + +# Resource Configuration +output "namespace" { + description = "The namespace used for the runner resources" + value = var.namespace +} + +output "repo_org" { + description = "The GitHub organization name used for the runners" + value = var.repo_org +} + +output "image_info" { + description = "Information about the container image used for runners" + value = { + name = var.image_name + version = var.image_version + uri = local.runner_image + } +} + +output "vpc_config" { + description = "VPC configuration for the ECS tasks" + value = { + vpc_id = var.vpc_id + subnets = var.subnets + security_groups = var.security_groups + } +} \ No newline at end of file diff --git a/providers.tf b/providers.tf index 5657994..694938d 100644 --- a/providers.tf +++ b/providers.tf @@ -4,12 +4,24 @@ terraform { source = "hashicorp/aws" version = "~> 5.70.0" } + github = { + source = "integrations/github" + version = "~> 6.2" + } } } +# Generate GitHub App token for authentication +data "github_app_token" "app" { + app_id = var.github_app_id + installation_id = var.github_app_installation_id + pem_file = var.github_app_pem_file +} + provider "github" { - owner = var.repo_org - base_url = var.base_url + organization = var.repo_org + base_url = var.base_url + token = data.github_app_token.app.token } provider "aws" { @@ -18,7 +30,7 @@ provider "aws" { finops_project_name = "csvd_github_actions" finops_project_number = "fs0000000078" finops_project_role = "csvd_github_actions" - organization = "census:ocio:csvd" + organization = "census:ocio:csvd" } } -} \ No newline at end of file +}x \ No newline at end of file diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000..fe0d807 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,97 @@ +# GitHub Runner Monitoring Tools + +This directory contains scripts to monitor your GitHub Runner ECS services and CloudWatch logs. + +## Features + +- Real-time monitoring of ECS cluster and service health +- Detailed task and container status information +- CloudWatch log streaming with error filtering +- Resource utilization metrics (CPU, memory) +- Automatic detection of configuration from Terraform outputs + +## Prerequisites + +- Python 3.6+ +- AWS CLI configured with appropriate permissions +- Terraform initialized in the parent directory + +## Installation + +Install the required Python packages: + +```bash +pip install -r requirements.txt +``` + +## Usage + +### Using the shell wrapper script + +The easiest way to run the monitor is with the shell wrapper: + +```bash +./monitor.sh [options] +``` + +### Available options + +- `--region REGION`: Override AWS region (default: uses Terraform output) +- `--profile PROFILE`: Specify AWS profile name +- `--interval SECONDS`: Set refresh interval in seconds (default: 30) +- `--errors-only`: Show only error logs + +### Examples + +Monitor with default settings (refreshes every 30 seconds): +```bash +./monitor.sh +``` + +Show only error logs: +```bash +./monitor.sh --errors-only +``` + +Custom refresh interval (10 seconds): +```bash +./monitor.sh --interval 10 +``` + +Use specific AWS profile: +```bash +./monitor.sh --profile dev-account +``` + +## Using with Specific GitHub Organizations + +To monitor runners for a specific GitHub organization, make sure you've applied the appropriate Terraform configuration: + +```bash +cd .. # Navigate to the parent directory +terraform workspace select +terraform apply -var-file=varfiles/.tfvars +./scripts/monitor.sh +``` + +## Troubleshooting + +If you encounter missing Terraform outputs: + +1. Ensure you're in the correct directory +2. Make sure Terraform has been initialized (`terraform init`) +3. Verify that `terraform apply` has been run successfully +4. Check that you're in the correct Terraform workspace + +If you encounter AWS permission errors: + +1. Verify your AWS credentials are correctly configured +2. Ensure your IAM user/role has permissions for: + - `ecs:DescribeClusters` + - `ecs:DescribeServices` + - `ecs:ListTasks` + - `ecs:DescribeTasks` + - `logs:DescribeLogStreams` + - `logs:GetLogEvents` + - `logs:FilterLogEvents` + - `cloudwatch:GetMetricData` \ No newline at end of file diff --git a/scripts/monitor.sh b/scripts/monitor.sh new file mode 100755 index 0000000..1825977 --- /dev/null +++ b/scripts/monitor.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# GitHub Runner ECS Service Monitor + +# Navigate to the Terraform project root directory +cd "$(dirname "$0")/.." + +# Check if pip and Python requirements are installed +if ! command -v pip &> /dev/null; then + echo "ERROR: pip is not installed. Please install Python and pip." + exit 1 +fi + +# Check if rich package is installed, install if needed +if ! python3 -c "import rich" &> /dev/null; then + echo "Installing required Python packages..." + pip install rich boto3 --user +fi + +# Make the monitoring script executable +chmod +x "$(dirname "$0")/monitor_runners.py" + +# Run the monitoring script with the provided arguments +python3 "$(dirname "$0")/monitor_runners.py" "$@" \ No newline at end of file diff --git a/scripts/monitor_runners.py b/scripts/monitor_runners.py new file mode 100755 index 0000000..59e2245 --- /dev/null +++ b/scripts/monitor_runners.py @@ -0,0 +1,488 @@ +#!/usr/bin/env python3 +import argparse +import boto3 +import json +import os +import subprocess +import sys +import time +from datetime import datetime, timezone, timedelta +from rich.console import Console +from rich.panel import Panel +from rich.table import Table +from rich.live import Live + +class GitHubRunnerMonitor: + """Monitor GitHub Runner ECS services and their CloudWatch logs""" + + def __init__(self, region=None, profile=None): + """Initialize AWS clients""" + session = boto3.Session(region_name=region, profile_name=profile) + self.region = region or session.region_name + self.ecs = session.client('ecs') + self.logs = session.client('logs') + self.cloudwatch = session.client('cloudwatch') + self.console = Console() + self.last_seen_time = {} + self.last_event_id = {} + + def get_terraform_output(self, output_name): + """Get Terraform output value by name""" + try: + result = subprocess.run( + ['terraform', 'output', '-json', output_name], + capture_output=True, + text=True, + check=True + ) + return json.loads(result.stdout) + except subprocess.CalledProcessError as e: + self.console.print(f"[bold red]Error getting Terraform output {output_name}:[/bold red] {e}") + return None + except json.JSONDecodeError as e: + self.console.print(f"[bold red]Error parsing Terraform output {output_name}:[/bold red] {e}") + return None + + def get_all_tf_outputs(self): + """Get all required Terraform outputs at once""" + try: + result = subprocess.run( + ['terraform', 'output', '-json'], + capture_output=True, + text=True, + check=True + ) + return json.loads(result.stdout) + except subprocess.CalledProcessError as e: + self.console.print(f"[bold red]Error getting Terraform outputs:[/bold red] {e}") + return {} + except json.JSONDecodeError as e: + self.console.print(f"[bold red]Error parsing Terraform outputs:[/bold red] {e}") + return {} + + def get_cluster_health(self, cluster_name): + """Get ECS cluster health metrics""" + try: + response = self.ecs.describe_clusters(clusters=[cluster_name]) + if not response['clusters']: + return None + + cluster = response['clusters'][0] + return { + 'status': cluster.get('status'), + 'registered_container_instances': cluster.get('registeredContainerInstancesCount', 0), + 'running_tasks': cluster.get('runningTasksCount', 0), + 'pending_tasks': cluster.get('pendingTasksCount', 0), + 'active_services': cluster.get('activeServicesCount', 0) + } + except Exception as e: + self.console.print(f"[bold red]Error getting cluster health:[/bold red] {e}") + return None + + def get_service_health(self, cluster_name, service_name): + """Get ECS service health metrics""" + try: + response = self.ecs.describe_services( + cluster=cluster_name, + services=[service_name] + ) + if not response['services']: + return None + + service = response['services'][0] + return { + 'status': service.get('status'), + 'desired_count': service.get('desiredCount', 0), + 'running_count': service.get('runningCount', 0), + 'pending_count': service.get('pendingCount', 0), + 'events': [ + { + 'timestamp': event.get('createdAt'), + 'message': event.get('message') + } + for event in service.get('events', [])[:5] + ] + } + except Exception as e: + self.console.print(f"[bold red]Error getting service health:[/bold red] {e}") + return None + + def get_running_tasks(self, cluster_name, service_name): + """Get all running tasks for a service""" + try: + response = self.ecs.list_tasks( + cluster=cluster_name, + serviceName=service_name, + desiredStatus='RUNNING' + ) + + if not response['taskArns']: + return [] + + task_details = self.ecs.describe_tasks( + cluster=cluster_name, + tasks=response['taskArns'] + ) + + return [ + { + 'task_id': task['taskArn'].split('/')[-1], + 'last_status': task.get('lastStatus'), + 'desired_status': task.get('desiredStatus'), + 'health_status': task.get('healthStatus'), + 'containers': [ + { + 'name': container.get('name'), + 'status': container.get('lastStatus'), + 'exit_code': container.get('exitCode'), + 'reason': container.get('reason', 'N/A') + } + for container in task.get('containers', []) + ] + } + for task in task_details['tasks'] + ] + except Exception as e: + self.console.print(f"[bold red]Error getting running tasks:[/bold red] {e}") + return [] + + def get_task_logs(self, log_group_name, task_id, minutes=5, filter_pattern=None): + """Get logs for a specific task""" + try: + log_stream_prefix = f"{task_id}/" + + # Get log streams for this task + log_streams = self.logs.describe_log_streams( + logGroupName=log_group_name, + logStreamNamePrefix=log_stream_prefix, + orderBy='LogStreamName', + descending=True, + limit=5 + ).get('logStreams', []) + + if not log_streams: + return [] + + # For each stream, get the latest log events + all_events = [] + start_time = int((datetime.now(timezone.utc) - timedelta(minutes=minutes)).timestamp() * 1000) + + for stream in log_streams: + stream_name = stream['logStreamName'] + key = f"{log_group_name}/{stream_name}" + + if key not in self.last_seen_time: + self.last_seen_time[key] = start_time + + if key not in self.last_event_id: + self.last_event_id[key] = None + + kwargs = { + 'logGroupName': log_group_name, + 'logStreamName': stream_name, + 'startTime': self.last_seen_time[key], + 'limit': 100 + } + + if filter_pattern: + kwargs['filterPattern'] = filter_pattern + + if self.last_event_id[key]: + kwargs['nextToken'] = self.last_event_id[key] + + response = self.logs.get_log_events(**kwargs) + events = response.get('events', []) + + if events: + self.last_seen_time[key] = events[-1]['timestamp'] + 1 + self.last_event_id[key] = response.get('nextForwardToken') + + all_events.extend([ + { + 'timestamp': datetime.fromtimestamp(event['timestamp']/1000).strftime('%Y-%m-%d %H:%M:%S'), + 'message': event['message'], + 'stream': stream_name + } + for event in events + ]) + + return all_events + + except Exception as e: + self.console.print(f"[bold red]Error getting task logs:[/bold red] {e}") + return [] + + def get_latest_logs(self, log_group_name, minutes=5, filter_pattern=None, limit=100): + """Get the latest logs from a log group""" + try: + # Start from 5 minutes ago by default + start_time = int((datetime.now(timezone.utc) - timedelta(minutes=minutes)).timestamp() * 1000) + + kwargs = { + 'logGroupName': log_group_name, + 'limit': limit, + 'startTime': start_time, + 'endTime': int(datetime.now(timezone.utc).timestamp() * 1000) + } + + if filter_pattern: + kwargs['filterPattern'] = filter_pattern + + response = self.logs.filter_log_events(**kwargs) + + return [ + { + 'timestamp': datetime.fromtimestamp(event['timestamp']/1000).strftime('%Y-%m-%d %H:%M:%S'), + 'message': event['message'], + 'stream': event.get('logStreamName', 'unknown') + } + for event in response.get('events', []) + ] + + except Exception as e: + self.console.print(f"[bold red]Error getting logs:[/bold red] {e}") + return [] + + def get_metrics(self, cluster_name, service_name): + """Get CloudWatch metrics for the service""" + try: + now = datetime.now(timezone.utc) + response = self.cloudwatch.get_metric_data( + MetricDataQueries=[ + { + 'Id': 'cpu', + 'MetricStat': { + 'Metric': { + 'Namespace': 'AWS/ECS', + 'MetricName': 'CPUUtilization', + 'Dimensions': [ + {'Name': 'ClusterName', 'Value': cluster_name}, + {'Name': 'ServiceName', 'Value': service_name} + ] + }, + 'Period': 300, + 'Stat': 'Average' + } + }, + { + 'Id': 'memory', + 'MetricStat': { + 'Metric': { + 'Namespace': 'AWS/ECS', + 'MetricName': 'MemoryUtilization', + 'Dimensions': [ + {'Name': 'ClusterName', 'Value': cluster_name}, + {'Name': 'ServiceName', 'Value': service_name} + ] + }, + 'Period': 300, + 'Stat': 'Average' + } + } + ], + StartTime=now - timedelta(hours=1), + EndTime=now + ) + + results = { + 'cpu': None, + 'memory': None + } + + for result in response['MetricDataResults']: + if result['Id'] == 'cpu' and result['Values']: + results['cpu'] = result['Values'][-1] + elif result['Id'] == 'memory' and result['Values']: + results['memory'] = result['Values'][-1] + + return results + + except Exception as e: + self.console.print(f"[bold red]Error getting metrics:[/bold red] {e}") + return {'cpu': None, 'memory': None} + + def display_cluster_info(self, cluster_health): + """Display cluster health information""" + if not cluster_health: + self.console.print("[bold red]No cluster information available[/bold red]") + return + + table = Table(title="ECS Cluster Health") + table.add_column("Metric", style="cyan") + table.add_column("Value", style="green") + + table.add_row("Status", cluster_health['status']) + table.add_row("Running Tasks", str(cluster_health['running_tasks'])) + table.add_row("Pending Tasks", str(cluster_health['pending_tasks'])) + table.add_row("Active Services", str(cluster_health['active_services'])) + + self.console.print(table) + + def display_service_info(self, service_health, metrics): + """Display service health information""" + if not service_health: + self.console.print("[bold red]No service information available[/bold red]") + return + + table = Table(title="ECS Service Health") + table.add_column("Metric", style="cyan") + table.add_column("Value", style="green") + + table.add_row("Status", service_health['status']) + table.add_row("Desired Tasks", str(service_health['desired_count'])) + table.add_row("Running Tasks", str(service_health['running_count'])) + table.add_row("Pending Tasks", str(service_health['pending_count'])) + + if metrics['cpu'] is not None: + table.add_row("CPU Utilization", f"{metrics['cpu']:.2f}%") + if metrics['memory'] is not None: + table.add_row("Memory Utilization", f"{metrics['memory']:.2f}%") + + self.console.print(table) + + if service_health['events']: + events_table = Table(title="Recent Service Events") + events_table.add_column("Timestamp", style="blue") + events_table.add_column("Message", style="yellow") + + for event in service_health['events']: + events_table.add_row( + event['timestamp'].strftime('%Y-%m-%d %H:%M:%S') if isinstance(event['timestamp'], datetime) else str(event['timestamp']), + event['message'] + ) + + self.console.print(events_table) + + def display_task_info(self, tasks): + """Display task information""" + if not tasks: + self.console.print("[bold red]No tasks running[/bold red]") + return + + for i, task in enumerate(tasks): + table = Table(title=f"Task {i+1}: {task['task_id']}") + table.add_column("Attribute", style="cyan") + table.add_column("Value", style="green") + + table.add_row("Last Status", task['last_status']) + table.add_row("Desired Status", task['desired_status']) + table.add_row("Health Status", task['health_status'] or "N/A") + + self.console.print(table) + + if task['containers']: + container_table = Table(title="Containers") + container_table.add_column("Name", style="blue") + container_table.add_column("Status", style="yellow") + container_table.add_column("Exit Code", style="magenta") + container_table.add_column("Reason", style="red") + + for container in task['containers']: + container_table.add_row( + container['name'], + container['status'], + str(container['exit_code']) if container['exit_code'] is not None else "N/A", + container['reason'] + ) + + self.console.print(container_table) + + def display_logs(self, logs, error_only=False): + """Display CloudWatch logs""" + if not logs: + self.console.print("[bold yellow]No new log entries found[/bold yellow]") + return + + log_table = Table(title="CloudWatch Logs") + log_table.add_column("Timestamp", style="blue") + log_table.add_column("Stream", style="cyan") + log_table.add_column("Message", style="white") + + for log in logs: + # If error_only is True, only show logs containing error-related keywords + if error_only and not any(keyword in log['message'].lower() for keyword in ['error', 'exception', 'fail', 'fatal']): + continue + + log_table.add_row( + log['timestamp'], + log['stream'], + log['message'] + ) + + self.console.print(log_table) + + def continuous_monitor(self, config, interval=60, error_only=False): + """Continuously monitor and display information""" + try: + while True: + os.system('clear') + self.console.print(f"\n[bold green]GitHub Runner Monitor[/bold green] - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + self.console.print(f"[cyan]Region:[/cyan] {config['region']} | [cyan]Organization:[/cyan] {config['repo_org']}") + self.console.print("-" * 80) + + # Get health information + cluster_health = self.get_cluster_health(config['cluster_name']) + service_health = self.get_service_health(config['cluster_name'], config['service_name']) + metrics = self.get_metrics(config['cluster_name'], config['service_name']) + + # Display cluster and service information + self.display_cluster_info(cluster_health) + self.console.print() + self.display_service_info(service_health, metrics) + + # Get and display task information + tasks = self.get_running_tasks(config['cluster_name'], config['service_name']) + self.console.print() + self.display_task_info(tasks) + + # Get and display logs + self.console.print() + self.console.print(f"[bold green]Latest CloudWatch Logs[/bold green] {f'(Errors Only)' if error_only else ''}") + logs = self.get_latest_logs(config['log_group_name'], minutes=5, filter_pattern="ERROR" if error_only else None) + self.display_logs(logs, error_only) + + self.console.print(f"\n[dim]Refreshing in {interval} seconds. Press Ctrl+C to exit.[/dim]") + time.sleep(interval) + + except KeyboardInterrupt: + self.console.print("\n[bold green]Monitoring stopped. Goodbye![/bold green]") + return + +def main(): + """Main entry point for the script""" + parser = argparse.ArgumentParser(description='Monitor GitHub Runner ECS Services') + parser.add_argument('--region', help='AWS region (overrides Terraform output)') + parser.add_argument('--profile', help='AWS profile name') + parser.add_argument('--interval', type=int, default=30, help='Refresh interval in seconds') + parser.add_argument('--errors-only', action='store_true', help='Show only error logs') + args = parser.parse_args() + + monitor = GitHubRunnerMonitor(region=args.region, profile=args.profile) + + # Get configuration from Terraform outputs + outputs = monitor.get_all_tf_outputs() + if not outputs: + monitor.console.print("[bold red]Failed to get Terraform outputs. Make sure you're in the correct directory and terraform init has been run.[/bold red]") + return 1 + + # Extract necessary configuration + config = { + 'cluster_name': outputs.get('ecs_cluster_name', {}).get('value'), + 'service_name': outputs.get('github_runner_service_name', {}).get('value'), + 'log_group_name': outputs.get('log_group_name', {}).get('value'), + 'region': outputs.get('aws_region', {}).get('value') or args.region, + 'repo_org': outputs.get('repo_org', {}).get('value') + } + + # Validate configuration + missing_fields = [field for field, value in config.items() if not value] + if missing_fields: + monitor.console.print(f"[bold red]Missing required Terraform outputs:[/bold red] {', '.join(missing_fields)}") + return 1 + + # Start continuous monitoring + monitor.continuous_monitor(config, interval=args.interval, error_only=args.errors_only) + return 0 + +if __name__ == '__main__': + sys.exit(main()) \ No newline at end of file diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 0000000..e3d7c03 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,3 @@ +boto3>=1.28.0 +botocore>=1.31.0 +rich>=13.0.0 \ No newline at end of file diff --git a/scripts/watch_runners.sh b/scripts/watch_runners.sh new file mode 100755 index 0000000..ef8faad --- /dev/null +++ b/scripts/watch_runners.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Get Terraform outputs +CLUSTER_NAME=$(terraform output -raw ecs_cluster_name) +SERVICE_NAME=$(terraform output -raw ecs_service_name) +LOG_GROUP=$(terraform output -raw log_group_name) +REGION=$(terraform output -raw aws_region) + +# Make the Python script executable +chmod +x "$(dirname "$0")/monitor_runners.py" + +# Run the monitoring script with the terraform outputs +"$(dirname "$0")/monitor_runners.py" \ + --cluster "$CLUSTER_NAME" \ + --service "$SERVICE_NAME" \ + --log-group "$LOG_GROUP" \ + --region "$REGION" \ + "$@" \ No newline at end of file diff --git a/terraform_data_dirs/csvd/environment b/terraform_data_dirs/csvd/environment index 456fbda..58bcd92 100644 --- a/terraform_data_dirs/csvd/environment +++ b/terraform_data_dirs/csvd/environment @@ -1 +1 @@ -sct-engineering \ No newline at end of file +csvd \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64 new file mode 120000 index 0000000..a6fbdd6 --- /dev/null +++ b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64 @@ -0,0 +1 @@ +/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/dns/3.4.3/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64 new file mode 120000 index 0000000..095d815 --- /dev/null +++ b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64 @@ -0,0 +1 @@ +/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/github/6.6.0/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64 new file mode 120000 index 0000000..75282e6 --- /dev/null +++ b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64 @@ -0,0 +1 @@ +/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/null/3.2.4/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64 new file mode 120000 index 0000000..f8eee1f --- /dev/null +++ b/terraform_data_dirs/csvd/providers/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64 @@ -0,0 +1 @@ +/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/hashicorp/random/3.7.2/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/5.45.0/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/5.45.0/linux_amd64 new file mode 120000 index 0000000..15c0b66 --- /dev/null +++ b/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/5.45.0/linux_amd64 @@ -0,0 +1 @@ +/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/integrations/github/5.45.0/linux_amd64 \ No newline at end of file diff --git a/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/6.6.0/linux_amd64 b/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/6.6.0/linux_amd64 new file mode 120000 index 0000000..26dfde5 --- /dev/null +++ b/terraform_data_dirs/csvd/providers/registry.terraform.io/integrations/github/6.6.0/linux_amd64 @@ -0,0 +1 @@ +/data/terraform/workspaces/arnol377/terraform-plugin-cache/registry.terraform.io/integrations/github/6.6.0/linux_amd64 \ No newline at end of file diff --git a/varfiles/sct-engineering.tfvars b/varfiles/sct-engineering.tfvars index 841fa9e..5a1f93b 100644 --- a/varfiles/sct-engineering.tfvars +++ b/varfiles/sct-engineering.tfvars @@ -1,6 +1,6 @@ -namespace = "sct-eng-ghe-runner" -repo_org = "SCT-Engineering" -desired_count = 1 +namespace = "sct-eng-ghe-runner" +repo_org = "SCT-Engineering" +desired_count = 1 create_ecs_cluster = false -image_name = "github-runner" -image_version = "1.68.0" \ No newline at end of file +image_name = "github-runner" +image_version = "1.69.0" \ No newline at end of file diff --git a/varfiles/terraform-modules-eks.json b/varfiles/terraform-modules-eks.json new file mode 100644 index 0000000..98993bd --- /dev/null +++ b/varfiles/terraform-modules-eks.json @@ -0,0 +1,4 @@ +{ + "TF_WORKSPACE_COLOR": 92, + "GITHUB_OWNER": "terraform-modules-eks" +} \ No newline at end of file diff --git a/varfiles/terraform-modules-eks.tfvars b/varfiles/terraform-modules-eks.tfvars new file mode 100644 index 0000000..98d4e80 --- /dev/null +++ b/varfiles/terraform-modules-eks.tfvars @@ -0,0 +1,24 @@ +# Available Terminal Colors for TF_WORKSPACE_COLOR: +# 30 - Black +# 31 - Red +# 32 - Green +# 33 - Yellow +# 34 - Blue +# 35 - Magenta +# 36 - Cyan +# 37 - White (default) +# 90 - Bright Black (Gray) +# 91 - Bright Red +# 92 - Bright Green +# 93 - Bright Yellow +# 94 - Bright Blue +# 95 - Bright Magenta +# 96 - Bright Cyan +# 97 - Bright White +namespace = "tf-mod-ek-ghe-runner" +repo_org = "terraform-modules-eks" +desired_count = 3 +create_ecs_cluster = false +image_name = "github-runner" +image_version = "1.68.0" +enable_ecr_clone = false \ No newline at end of file diff --git a/variables.tf b/variables.tf index 6289ebd..3ac4317 100644 --- a/variables.tf +++ b/variables.tf @@ -89,9 +89,95 @@ variable "desired_count" { } variable "base_url" { - default = "https://github.e.it.census.gov/" + default = "https://github.e.it.census.gov" } variable "aws_account" { type = string } + +variable "enable_ecr_clone" { + description = "Whether to enable the ECR clone module" + type = bool + default = true +} + +variable "github_app_id" { + description = <<-EOT + GitHub App ID for authentication. + + This is used to generate tokens via the GitHub App authentication method, + which is more secure than Personal Access Tokens. + + To find your GitHub App ID: + 1. Navigate to Organization Settings → Developer settings → GitHub Apps + 2. Click on your GitHub App (e.g., "GitHub Actions Runner Management") + 3. The App ID is shown at the top of the page + + Example: "123456" + + Note: Different organizations may have different GitHub App IDs. + Set this value in workspace-specific .tfvars files. + EOT + type = string + + validation { + condition = can(regex("^[0-9]+$", var.github_app_id)) + error_message = "GitHub App ID must be a numeric string (e.g., '123456')" + } +} + +variable "github_app_installation_id" { + description = <<-EOT + GitHub App Installation ID for your organization. + + Each organization that installs the GitHub App gets a unique installation ID. + + To find your Installation ID: + 1. Navigate to Organization Settings → GitHub Apps + 2. Click "Configure" on your installed app + 3. The Installation ID is in the URL: github.com/organizations/ORG/settings/installations/XXXXX + + Example: "12345678" + + Note: This value is organization-specific. + Set this value in workspace-specific .tfvars files. + EOT + type = string + + validation { + condition = can(regex("^[0-9]+$", var.github_app_installation_id)) + error_message = "GitHub App Installation ID must be a numeric string (e.g., '12345678')" + } +} + +variable "github_app_pem_file" { + description = <<-EOT + Path to the GitHub App private key PEM file. + + SECURITY WARNING: + - This file contains the private key for your GitHub App + - NEVER commit this file to version control + - Store securely and restrict file permissions (chmod 600) + - Consider using AWS Secrets Manager or similar for production + + To generate the private key: + 1. Navigate to Organization Settings → Developer settings → GitHub Apps + 2. Click on your GitHub App + 3. Scroll to "Private keys" section + 4. Click "Generate a private key" + 5. Download the .pem file and store it securely + + Usage: + - Set via variable: -var="github_app_pem_file=/path/to/private-key.pem" + - Or in .tfvars: github_app_pem_file = "/path/to/private-key.pem" + + The PEM file should be accessible from where Terraform runs. + EOT + type = string + + validation { + condition = can(regex("\\.pem$", var.github_app_pem_file)) + error_message = "GitHub App PEM file path must end with .pem" + } +}