From 0ca30d8515b65a219ed4eabce2b796fa15963afc Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Mon, 11 May 2026 16:23:12 -0400 Subject: [PATCH 01/27] Add template repo rendering, Bedrock discussion notes, and HOW-IT-WORKS docs - lambda/app.py: add template_repo + template_vars fields to TfRunRequest; merge field_validator to cover both extra_files and template_vars; pass both new fields in CodeBuild environmentVariablesOverride - buildspec.yml: add TEMPLATE_REPO/TEMPLATE_VARS env defaults; new build step clones template repo, renders .j2 files via Jinja2 StrictUndefined, copies non-.j2 files verbatim; EXTRA_FILES step runs after and overrides - service-catalog/product-template.yaml: add TemplateRepo + TemplateVars parameters and parameter group; wire to Lambda Custom Resource - docs/HOW-IT-WORKS.md: full end-to-end documentation of the system - .gitignore: exclude *.tfstate, *.tfvars, .terraform/, terraform_data_dirs/ --- .gitignore | 12 + buildspec.yml | 47 +++ docs/HOW-IT-WORKS.md | 579 ++++++++++++++++++++++++++ lambda/app.py | 34 +- service-catalog/product-template.yaml | 33 +- 5 files changed, 689 insertions(+), 16 deletions(-) create mode 100644 .gitignore create mode 100644 docs/HOW-IT-WORKS.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..87bef24 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +# Packer pipeline zip files +tf-run-executor-builder.zip + +# Terraform state and local overrides +*.tfstate +*.tfstate.backup +*.tfvars +.terraform/ +.terraform.lock.hcl +.terraform_commits +terraform_data_dirs/ +varfiles/ diff --git a/buildspec.yml b/buildspec.yml index 23e9778..f651f50 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -28,6 +28,8 @@ env: GIT_BRANCH: "repo-init" DRY_RUN: "false" TF_RUN_START_TAG: "" + TEMPLATE_REPO: "" + TEMPLATE_VARS: "{}" EXTRA_FILES: "{}" phases: @@ -69,7 +71,52 @@ phases: - cd repo - git checkout -B "${GIT_BRANCH}" + # --- Render template repo (if specified) into account repo --- + # Clone 
TEMPLATE_REPO, render .j2 files with TEMPLATE_VARS via Jinja2, + # copy non-template files as-is. Results land in the account repo tree + # at the same relative paths. EXTRA_FILES applied afterwards can override. + - | + if [ -n "${TEMPLATE_REPO}" ]; then + git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${TEMPLATE_REPO}.git" /tmp/template-repo + python3 - <<'PYEOF' + import json, os, pathlib, shutil + from jinja2 import Environment, FileSystemLoader, StrictUndefined + + template_vars = json.loads(os.environ.get('TEMPLATE_VARS', '{}')) + src_root = pathlib.Path('/tmp/template-repo') + dst_root = pathlib.Path('.') # already inside cloned account repo + + rendered = 0 + copied = 0 + for src in src_root.rglob('*'): + if src.is_dir() or any(part.startswith('.git') for part in src.parts): + continue + rel = src.relative_to(src_root) + if src.suffix == '.j2': + # Render Jinja2 template; strip .j2 extension in destination + dst = dst_root / rel.with_suffix('') + dst.parent.mkdir(parents=True, exist_ok=True) + env = Environment( + loader=FileSystemLoader(str(src.parent)), + undefined=StrictUndefined, + keep_trailing_newline=True, + ) + content = env.get_template(src.name).render(**template_vars) + dst.write_text(content) + rendered += 1 + else: + dst = dst_root / rel + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + copied += 1 + print(f'Template repo: rendered {rendered} .j2 file(s), copied {copied} file(s)') + PYEOF + else + echo 'No TEMPLATE_REPO specified — skipping template rendering' + fi + # --- Write extra config files passed in from Lambda (JSON map path -> content) --- + # Applied after template rendering; keys here override template output. 
- | python3 -c " import json, os, pathlib diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md new file mode 100644 index 0000000..b4308c5 --- /dev/null +++ b/docs/HOW-IT-WORKS.md @@ -0,0 +1,579 @@ +# How sc-lambda-ghactions Works + +This document explains the complete end-to-end flow of the SC Lambda + CodeBuild +automation system — from a user filling out a Service Catalog form to Terraform +running inside an AWS account repository. + +--- + +## What This System Does + +This system provides a **managed execution environment for arbitrary Terraform +configurations**. Teams maintain reusable template repositories on GitHub +Enterprise (GHE). When a user provisions a product through AWS Service Catalog, +this system: + +1. Accepts configuration data from the user via a product form +2. Clones a team-owned template repository +3. Renders the templates using the user's configuration data +4. Injects the rendered files into the correct location in the target AWS account + repository +5. Runs `tf-run` (the Census Terraform toolchain) to apply the changes +6. Opens a pull request for review +7. Reports success or failure back to CloudFormation + +The goal is a single, centrally-operated execution platform that any team can +drive by writing a template repo and defining a Service Catalog product form — +without needing to operate their own build infrastructure. 
+ +--- + +## Component Overview + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ User fills AWS Service Catalog form (any AWS account in the org) │ +└─────────────────────────────┬───────────────────────────────────────┘ + │ CloudFormation Custom Resource event + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ Lambda: tf-run-executor-trigger │ +│ (centralized in csvd-dev, account 229685449397, us-gov-west-1) │ +│ │ +│ • Validates all inputs (Pydantic v2) │ +│ • Fetches the GHE PAT from Secrets Manager │ +│ • Starts a CodeBuild build with per-run environment variables │ +│ • Polls CodeBuild every 20 seconds │ +│ • Returns PR URL + repo URL to CloudFormation on completion │ +└─────────────────────────────┬───────────────────────────────────────┘ + │ AWS CodeBuild StartBuild API + ▼ +┌─────────────────────────────────────────────────────────────────────┐ +│ CodeBuild project: tf-run-executor │ +│ (Amazon Linux 2, 60-minute timeout) │ +│ │ +│ INSTALL phase │ +│ • Downloads Terraform binary from S3 (registry.terraform.io is │ +│ blocked on the Census network) │ +│ • Installs the Census CA cert so GHE TLS works │ +│ • Installs tf-run, tf-control.sh, tf-directory-setup.py from │ +│ this repo's scripts/ directory │ +│ • Installs the gh CLI from S3 │ +│ │ +│ BUILD phase │ +│ 1. Clone the target account repo over HTTPS │ +│ 2. Check out (or create) the work branch │ +│ 3. Clone the template repo and render Jinja2 templates │ +│ 4. Write any explicit extra files (override layer) │ +│ 5. git commit + push │ +│ 6. cd into // and run tf-run or tf-plan │ +│ 7. 
Open or update a pull request via the gh CLI │ +│ │ +│ POST_BUILD phase │ +│ • Emit PR_URL= line to the build log (informational only) │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Step-by-Step Walkthrough + +### Step 1 — User Fills the Service Catalog Form + +A user opens the AWS Service Catalog console in any account that belongs to the +Census AWS Organizations org. They find the **tf-run automation** product and fill +in the form: + +| Field | What to put here | +|---|---| +| **Account Repo Name** | The GHE repo that contains the target account's Terraform. Example: `229685449397-csvd-dev-platform-dev-gov` | +| **Terraform Layer** | Which layer of the account repo to operate on: `common`, `infrastructure`, or `vpc` | +| **Region Directory** | `east` or `west` — the subdirectory inside the layer | +| **Template Repository** | Name of a GHE repo containing Jinja2 or raw file templates (optional) | +| **Template Variables** | A JSON object of key/value pairs passed to Jinja2 when rendering templates (optional) | +| **Extra Config Files** | A JSON object of `{"relative/path": "file content"}` written directly into the repo, bypassing templates (optional) | +| **Git Branch** | The branch to commit to and open the PR from (default: `repo-init`) | +| **tf-run Start Tag** | A `TAG` label from `tf-run.data` to start execution from; leave blank to run all steps | +| **Dry Run** | Set `true` to run `tf plan` only — no apply, no PR | + +The user clicks **Launch Product**. CloudFormation creates a stack with a +`Custom::TerraformRun` resource. This triggers the Lambda. + +> **Note**: `aws_account_id` and `aws_region` are **not** on the form. CloudFormation +> resolves them automatically with `!Sub "${AWS::AccountId}"` and `!Sub "${AWS::Region}"`. +> Do not add them as user-facing parameters. 
+ +--- + +### Step 2 — Lambda Validates and Dispatches + +The Lambda (`tf-run-executor-trigger`) receives a CloudFormation Custom Resource +event of type `Create` (first provision) or `Update` (stack update). + +**What the Lambda does:** + +1. **Normalizes property names** — CloudFormation sends PascalCase parameter names + (`AccountRepo`). The Lambda converts them to `snake_case` (`account_repo`) before + validation. Properties already in `snake_case` are left as-is. + +2. **Validates all inputs** via a Pydantic v2 model (`TfRunRequest`): + - `account_repo` — required, must be a non-empty string + - `layer` — must be one of `common`, `infrastructure`, `vpc` + - `region_dir` — must be one of `east`, `west` + - `template_repo` — optional string; empty string means no template rendering + - `template_vars` — optional; accepts a JSON string (as CFN sends it) or a dict + - `extra_files` — optional; accepts a JSON string or a dict + - `git_branch` — optional, defaults to `repo-init` + - `tf_run_start_tag` — optional, defaults to empty (run all steps) + - `dry_run` — optional, defaults to `false` + + If validation fails, the Lambda immediately signals CloudFormation `FAILED` + with the validation error as the reason — the build is never started. + +3. **Fetches the GHE PAT** from AWS Secrets Manager (`ghe-runner/github-token`). + +4. 
**Starts a CodeBuild build** on the `tf-run-executor` project with all the + parameters injected as per-build environment variable overrides: + + | Env var | Source | + |---|---| + | `ACCOUNT_REPO` | `account_repo` field | + | `LAYER` | `layer` field | + | `REGION_DIR` | `region_dir` field | + | `GIT_BRANCH` | `git_branch` field | + | `TF_RUN_START_TAG` | `tf_run_start_tag` field | + | `TEMPLATE_REPO` | `template_repo` field | + | `TEMPLATE_VARS` | `template_vars` serialized to JSON string | + | `EXTRA_FILES` | `extra_files` serialized to JSON string | + | `DRY_RUN` | `dry_run` as lowercase string (`"true"` / `"false"`) | + | `GITHUB_TOKEN` | PAT from Secrets Manager (plaintext, not a reference) | + +5. **Polls CodeBuild** by calling `BatchGetBuilds` every 20 seconds until the + build status is no longer `IN_PROGRESS`, or until 60 seconds before the + Lambda timeout (900 seconds total), whichever comes first. + +6. On **success**: Calls the GHE API to fetch the open PR URL for the branch and + signals CloudFormation `SUCCESS` with the PR URL, repo URL, and branch name + as outputs. + + On **failure**: Signals CloudFormation `FAILED` with the CodeBuild build ID + and logs URL so the user knows where to look. + +**Delete events** are no-ops — the Lambda signals `SUCCESS` immediately and takes +no action. Terraform changes are not automatically reversed. + +--- + +### Step 3 — CodeBuild: Install Phase + +The `tf-run-executor` CodeBuild project runs on Amazon Linux 2. The install +phase sets up every tool the build needs: + +- **Terraform binary** — downloaded from + `s3://csvd-packer-pipeline-assets/terraform/terraform_1.9.1_linux_amd64.zip`. + `registry.terraform.io` is blocked on the Census network; direct provider + downloads from S3 bypass this. After extraction, `tf` is symlinked to the + binary. +- **tf-{action} symlinks** — `tf-plan`, `tf-apply`, `tf-init`, etc. 
are all + symlinks to `tf-control.sh`, which wraps each Terraform operation with logging, + proxy settings, and version pinning from `.tf-control`. +- **Census CA cert** — installed from S3 into the system trust store via + `update-ca-trust`. Required for TLS connections to `github.e.it.census.gov`. +- **tf-run toolchain** — `tf-run`, `tf-control.sh`, and `tf-directory-setup.py` + are copied from the `scripts/` directory of this repo (which CodeBuild has + already checked out as its source). +- **Python packages** — `jinja2`, `python-dateutil`, `pyyaml` installed for + template rendering and `tf-directory-setup.py`. +- **gh CLI** — downloaded from S3, used to open pull requests. + +--- + +### Step 4 — CodeBuild: Clone Account Repo + +```bash +git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" repo +cd repo +git checkout -B "${GIT_BRANCH}" +``` + +The account repo is cloned using HTTPS with the GHE PAT embedded in the URL. +SSH is not used — the Census proxy blocks SSH host key exchange. + +`git checkout -B` creates the branch if it does not exist, or resets it to the +current `HEAD` if it does. Subsequent pushes will force-update this branch. + +--- + +### Step 5 — Template Rendering (if a template repo is specified) + +If `TEMPLATE_REPO` is set, the build clones that repo and renders its contents +into the account repo: + +```bash +git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${TEMPLATE_REPO}.git" /tmp/template-repo +``` + +The Python rendering script then walks every file in the template repo: + +- **`.j2` files** are rendered as Jinja2 templates with `TEMPLATE_VARS` as the + variable context. `StrictUndefined` is used, meaning the build **fails** if + a template references a variable that was not provided. The `.j2` extension is + stripped from the output filename. +- **All other files** are copied as-is, preserving directory structure. 
+ +All output files land in the account repo at the same relative paths they had +in the template repo. + +**Example**: A template repo with this layout: + +``` +vpc/west/vpc.tf.j2 +vpc/west/README.md +``` + +...and `TEMPLATE_VARS = {"vpc_cidr": "10.0.0.0/16"}` would write: + +``` +repo/vpc/west/vpc.tf ← rendered from vpc.tf.j2 +repo/vpc/west/README.md ← copied verbatim +``` + +If `TEMPLATE_REPO` is empty or not set, this step is skipped entirely. + +--- + +### Step 6 — Extra Files (direct file injection) + +After template rendering, any key/value pairs in `EXTRA_FILES` are written +directly to the account repo: + +```python +files = json.loads(os.environ.get('EXTRA_FILES', '{}')) +for path, content in files.items(): + pathlib.Path(path).parent.mkdir(parents=True, exist_ok=True) + pathlib.Path(path).write_text(content) +``` + +Because this runs **after** template rendering, `EXTRA_FILES` can override +any file that was produced by the template repo. Use this for one-off +customizations that do not belong in the reusable template. + +--- + +### Step 7 — Commit and Push + +```bash +git add -A +git commit -m "SC automation: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ + --allow-empty +git push origin "${GIT_BRANCH}" +``` + +`--allow-empty` handles the case where the rendered files are identical to what +was already on the branch, preventing the build from failing on a no-change run. 
+ +--- + +### Step 8 — Run Terraform + +The build changes into the target layer/region directory and runs the Census +`tf-run` toolchain: + +```bash +cd "${LAYER}/${REGION_DIR}" + +# Dry run: plan only, no apply, no PR +if [ "${DRY_RUN}" = "true" ]; then + tf-plan -no-color + +# Start from a specific tf-run.data step +elif [ -n "${TF_RUN_START_TAG}" ]; then + TFARGS="-auto-approve" tf-run apply "tag:${TF_RUN_START_TAG}" + +# Full run from the beginning +else + TFARGS="-auto-approve" tf-run apply +fi +``` + +**`tf-run` in non-interactive mode**: `tf-run` normally prompts `continue [y/n]` +between steps. In CodeBuild there is no TTY, so `read -t $TIMEOUT` returns +immediately with a non-zero exit code and the default `y` is used — `tf-run` +auto-proceeds through all steps without any manual intervention needed. + +**For the target directory to work**, the account repo must already have been +bootstrapped. That means `<layer>/<region_dir>/` must contain: + +- `remote_state.backend.tf` — S3 backend configuration +- `remote_state..tf` — symlink to the correct `.s3` or `.local` variant +- `tf-run.data` — step definitions for this layer/region +- `.tf-control` at the repo root — Terraform version pin + +These files are present in any properly initialized Census account repo. + +--- + +### Step 9 — Open Pull Request + +```bash +GH_HOST=github.e.it.census.gov \ +GH_TOKEN="${GITHUB_TOKEN}" \ +gh pr create \ + --title "SC automation: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ + --body "Triggered by Service Catalog provisioning of ${ACCOUNT_REPO}." \ + --base main \ + --head "${GIT_BRANCH}" \ +|| echo "PR already exists or create failed, continuing" +``` + +If the PR already exists for this branch (e.g., from a previous provision of the +same stack), the `gh pr create` command exits non-zero but the `|| echo` prevents +the build from failing. The `post_build` phase then fetches the existing PR URL +with `gh pr view`. 
+ +--- + +### Step 10 — Lambda Returns Results to CloudFormation + +Once CodeBuild reports `SUCCEEDED`, the Lambda calls the GHE REST API +(`/repos/{org}/{repo}/pulls?state=open&head={org}:{branch}`) to fetch the open PR +URL for the branch. The `PR_URL=` line emitted in `post_build` is informational +only (visible in CodeBuild logs) — it is not parsed by the Lambda. + +The Lambda then signals CloudFormation with: + +| CloudFormation Output | Value | +|---|---| +| `PullRequestUrl` | URL of the open PR | +| `RepositoryUrl` | URL of the account repo | +| `BranchName` | Branch that was committed to | +| `CodeBuildBuildId` | Full CodeBuild build ID (useful for looking up logs) | + +The CloudFormation stack transitions to `CREATE_COMPLETE` and the Service Catalog +provisioned product shows as **AVAILABLE**. + +--- + +## Infrastructure Overview + +All infrastructure lives in `deploy/` and is managed by Terraform. It is deployed +once to `csvd-dev` (`229685449397`, `us-gov-west-1`) and shared to all other +accounts in the org via Service Catalog portfolio sharing. 
+ +| Resource | Purpose | +|---|---| +| `aws_ecr_repository.lambda` | Container image registry for the Lambda | +| `aws_lambda_function.tf_run_trigger` | The Lambda function (`tf-run-executor-trigger`) | +| `aws_lambda_permission.cfn_invoke` | Allows CloudFormation in any org account to invoke the Lambda cross-account | +| `aws_codebuild_project.tf_run_executor` | The CodeBuild project that runs builds | +| `aws_codebuild_source_credential.ghe` | GHE PAT credential for CodeBuild to clone from GHE | +| `aws_iam_role.lambda_exec` | Lambda execution role: Secrets Manager read, CodeBuild start/poll, CloudWatch Logs write | +| `aws_iam_role.codebuild_exec` | CodeBuild service role: S3 read, Secrets Manager read, CloudWatch Logs write | +| `aws_s3_object.product_template` | The CFN product template uploaded to S3 for Service Catalog | +| `aws_servicecatalog_portfolio.this` | The SC portfolio | +| `aws_servicecatalog_product.tf_run` | The SC product (CloudFormation template type) | +| `aws_servicecatalog_constraint.launch` | Launch constraint tying the product to the SC launch IAM role | +| `aws_iam_role.sc_launch` | SC launch role: invoke Lambda + CloudFormation operations | + +### Why Everything Is Centralized in csvd-dev + +The Lambda only calls GHE and CodeBuild — it makes zero AWS API calls in the +account where the SC product is provisioned. There is no reason to deploy a copy +of the Lambda, CodeBuild project, ECR repo, or Secrets Manager secret to every +account. Centralizing in `csvd-dev` means there is one place to update and one +set of credentials to rotate. + +--- + +## Template Repository Design Guide + +A template repo is any GHE repository (in the same org) that contains the files +you want written into the account repo. There are no required conventions — the +directory structure you use in the template repo becomes the directory structure +in the account repo. 
+ +### Jinja2 Templates (`.j2` files) + +Files ending in `.j2` are rendered with Jinja2 before being written. The +`TemplateVars` JSON object passed through the SC form provides the variable +context. + +**Example template** (`vpc/west/main.tf.j2`): +```hcl +# Generated by SC automation +locals { + vpc_name = "{{ vpc_name }}" + cidr_block = "{{ cidr_block }}" + env = "{{ environment }}" +} +``` + +**Example TemplateVars JSON**: +```json +{ + "vpc_name": "my-vpc", + "cidr_block": "10.20.0.0/16", + "environment": "dev" +} +``` + +**Output** written to `vpc/west/main.tf` in the account repo: +```hcl +# Generated by SC automation +locals { + vpc_name = "my-vpc" + cidr_block = "10.20.0.0/16" + env = "dev" +} +``` + +`StrictUndefined` is enabled — if a template references `{{ some_variable }}` and +`some_variable` is not in `TemplateVars`, the build fails immediately with a clear +error message. + +### Non-template Files + +Files without the `.j2` extension are copied verbatim. Use these for static +Terraform files, `tf-run.data` step definitions, `.tf-control` version pins, and +anything else that does not vary per deployment. + +### Extra Files Override + +If you need to inject a file that is not in the template repo — or override a +specific file from the template repo for a particular deployment — use +`ExtraFiles`. It runs after template rendering and wins on any path conflict. + +--- + +## Key Constraints and Limitations + +### Idempotency: Stack Updates + +The Lambda uses `{account_repo}-{layer}-{region_dir}` as the CloudFormation +`PhysicalResourceId`. This means that if you update a stack but those three +fields stay the same, CloudFormation treats it as an in-place `Update` of the +existing resource (and triggers another build). If any of those three fields +change, CloudFormation treats it as a replacement — it will call `Delete` on the +old resource (a no-op) and `Create` on the new one (triggering a fresh build in +the new target location). 
This is intentional behavior. + +### Inputs the Lambda Enforces + +- `layer` must be exactly `common`, `infrastructure`, or `vpc` +- `region_dir` must be exactly `east` or `west` +- `template_vars` and `extra_files` must be valid JSON objects (or empty strings) + +The `layer` and `region_dir` constraints exist because `tf-run` is directory-structured and expects to +`cd` into a path like `infrastructure/west/`. + +### Account Repo Must Be Pre-bootstrapped + +`tf-run` does not create `remote_state.backend.tf` or set up the S3 backend +automatically. The target `<layer>/<region_dir>/` directory must already have been +bootstrapped (via `tf-directory-setup.py`) before this automation can apply +Terraform to it. A future "init mode" will handle first-time setup. + +### Build Timeout + +The Lambda timeout is 900 seconds (15 minutes). The CodeBuild project timeout is +60 minutes. The Lambda polls until 60 seconds before its own deadline and then +gives up, returning `LAMBDA_TIMEOUT`. The CodeBuild build continues running after +a Lambda timeout — check the CodeBuild console for full logs. + +### No SSH Git + +The Census HTTPS proxy blocks SSH `git clone`. All cloning uses HTTPS with the +GHE PAT embedded in the URL. Do not use SSH-based Terraform module sources +(`git::ssh://`) in any Terraform code that will run through this system. + +### Delete Is a No-op + +CloudFormation `Delete` events (stack deletion or product termination) are +acknowledged with `SUCCESS` immediately. No Terraform destroy is run. This is +intentional — automated infra teardown is too risky without explicit human review. + +--- + +## Adding Support for a New Terraform Configuration + +To onboard a new type of Terraform work to this platform: + +1. **Create a template repo** in the `SCT-Engineering` GHE org. Add your `.tf` + files, using `.j2` extension for any file that needs variable substitution. +2. **Document the required `TemplateVars`** so SC product users know what JSON + to supply. +3. 
**Create a new SC product** (a new CloudFormation template in + `service-catalog/`) that pre-fills `TemplateRepo` with your repo name and + guides users toward filling in the right `TemplateVars`. +4. **Add the product to the portfolio** by referencing it from `deploy/service_catalog.tf`. + +The Lambda and CodeBuild infrastructure itself does not need to change. + +--- + +## Rebuilding the Lambda Image + +When `lambda/app.py`, `lambda/Dockerfile`, or `lambda/requirements.txt` change, +rebuild and push the container image: + +```bash +cd /home/a/arnol377/git/sc-lambda-ghactions +source ~/aws-creds +packer-pipeline --config csvd_config_packer.hcl +``` + +After the `tf-run-executor-builder` CodeBuild build succeeds, force the Lambda +to pull the new image: + +```bash +export AWS_DEFAULT_REGION=us-gov-west-1 +aws lambda update-function-code \ + --function-name tf-run-executor-trigger \ + --image-uri "229685449397.dkr.ecr.us-gov-west-1.amazonaws.com/tf-run-executor/lambda:latest" \ + --region us-gov-west-1 +``` + +## Deploying Infrastructure Changes + +```bash +export AWS_DEFAULT_REGION=us-gov-west-1 +source ~/aws-creds +cd /home/a/arnol377/git/sc-lambda-ghactions/deploy +tf init # only needed after provider changes +tf plan +tf apply +``` + +--- + +## Troubleshooting + +### CloudFormation stack stuck in CREATE_IN_PROGRESS + +The Lambda is polling CodeBuild. Check the Lambda logs in CloudWatch +(`/aws/lambda/tf-run-executor-trigger`) and find the CodeBuild build ID in the +log output, then open that build in the CodeBuild console. + +### FAILED: validation error + +The Lambda rejected the inputs before the build started. The CFN stack event +reason field will contain the Pydantic validation error. Fix the parameter and +update the stack. + +### FAILED: CodeBuild build FAILED + +Open the CodeBuild build using the build ID in the failure reason. 
The most +common causes are: +- Template rendered with missing variables (`StrictUndefined` error) — add the + missing key to `TemplateVars` +- `tf-run` failed on a Terraform plan/apply error — check the full build log for + the Terraform error output +- Account repo not bootstrapped — the target `//` directory + is missing `remote_state.backend.tf` + +### LAMBDA_TIMEOUT + +The build took longer than the Lambda poll window (~14 minutes). The CodeBuild +build is still running. Wait for it to complete, check the PR on GHE, and if +needed do a stack Update (which will start a new build) to re-sync CFN with the +result. diff --git a/lambda/app.py b/lambda/app.py index 79c674f..9d4cf47 100644 --- a/lambda/app.py +++ b/lambda/app.py @@ -47,23 +47,27 @@ class TfRunRequest(BaseModel): layer: Literal["common", "infrastructure", "vpc"] = Field(..., description="Terraform layer") region_dir: Literal["east", "west"] = Field(..., description="Region directory") tf_run_start_tag: str = Field(default="", description="tf-run.data TAG label to start from; empty = from beginning") - extra_files: dict = Field(default_factory=dict, description='JSON map {"relative/path": "content"} written before tf-run') + template_repo: str = Field(default="", description="GHE repo name containing Jinja2/raw template files to render into the account repo") + template_vars: dict = Field(default_factory=dict, description='JSON map of variables passed to Jinja2 when rendering template_repo files') + extra_files: dict = Field(default_factory=dict, description='JSON map {"relative/path": "content"} written into account repo (after template rendering; overrides template output)') git_branch: str = Field(default="repo-init", description="Branch to commit and open PR from") dry_run: bool = Field(default=False, description="true = tf plan only, no apply") - @field_validator("extra_files", mode="before") + @field_validator("extra_files", "template_vars", mode="before") @classmethod - def 
parse_extra_files(cls, v: Any) -> Any: - """Accept a JSON string or a dict for extra_files. + def parse_json_dict_fields(cls, v: Any) -> Any: + """Accept a JSON string or a dict for dict-typed fields. - CFN parameters are always strings, so '{}' or '{"path": "content"}' + CFN parameters are always strings, so '{}' or '{"key": "val"}' must be parsed before Pydantic validates the dict type. """ if isinstance(v, str): + if v.strip() == "": + return {} try: return json.loads(v) except json.JSONDecodeError as exc: - raise ValueError(f"extra_files must be a valid JSON object string; got: {v!r}") from exc + raise ValueError(f"Field must be a valid JSON object string; got: {v!r}") from exc return v class Config: @@ -148,14 +152,16 @@ def start_codebuild_build( response = cb.start_build( projectName=project_name, environmentVariablesOverride=[ - {"name": "ACCOUNT_REPO", "value": tf_req.account_repo, "type": "PLAINTEXT"}, - {"name": "LAYER", "value": tf_req.layer, "type": "PLAINTEXT"}, - {"name": "REGION_DIR", "value": tf_req.region_dir, "type": "PLAINTEXT"}, - {"name": "GIT_BRANCH", "value": tf_req.git_branch, "type": "PLAINTEXT"}, - {"name": "TF_RUN_START_TAG", "value": tf_req.tf_run_start_tag, "type": "PLAINTEXT"}, - {"name": "EXTRA_FILES", "value": json.dumps(tf_req.extra_files), "type": "PLAINTEXT"}, - {"name": "DRY_RUN", "value": str(tf_req.dry_run).lower(), "type": "PLAINTEXT"}, - {"name": "GITHUB_TOKEN", "value": github_token, "type": "PLAINTEXT"}, + {"name": "ACCOUNT_REPO", "value": tf_req.account_repo, "type": "PLAINTEXT"}, + {"name": "LAYER", "value": tf_req.layer, "type": "PLAINTEXT"}, + {"name": "REGION_DIR", "value": tf_req.region_dir, "type": "PLAINTEXT"}, + {"name": "GIT_BRANCH", "value": tf_req.git_branch, "type": "PLAINTEXT"}, + {"name": "TF_RUN_START_TAG", "value": tf_req.tf_run_start_tag, "type": "PLAINTEXT"}, + {"name": "TEMPLATE_REPO", "value": tf_req.template_repo, "type": "PLAINTEXT"}, + {"name": "TEMPLATE_VARS", "value": 
json.dumps(tf_req.template_vars), "type": "PLAINTEXT"}, + {"name": "EXTRA_FILES", "value": json.dumps(tf_req.extra_files), "type": "PLAINTEXT"}, + {"name": "DRY_RUN", "value": str(tf_req.dry_run).lower(), "type": "PLAINTEXT"}, + {"name": "GITHUB_TOKEN", "value": github_token, "type": "PLAINTEXT"}, ], ) build_id = response["build"]["id"] diff --git a/service-catalog/product-template.yaml b/service-catalog/product-template.yaml index a137a5f..7699101 100644 --- a/service-catalog/product-template.yaml +++ b/service-catalog/product-template.yaml @@ -16,6 +16,11 @@ Metadata: - GitBranch - TfRunStartTag - DryRun + - Label: + default: "Template Repository (optional)" + Parameters: + - TemplateRepo + - TemplateVars - Label: default: "Extra Files (optional)" Parameters: @@ -34,6 +39,10 @@ Metadata: default: "tf-run Start Tag (optional)" DryRun: default: "Dry Run (plan only)" + TemplateRepo: + default: "Template Repository Name (optional)" + TemplateVars: + default: "Template Variables (JSON)" ExtraFiles: default: "Extra Config Files (JSON)" @@ -88,11 +97,29 @@ Parameters: - "false" Default: "false" + TemplateRepo: + Type: String + Description: >- + Name of a GHE repo (in the same org) containing template files. + Files ending in .j2 are rendered as Jinja2 templates using TemplateVars. + All other files are copied as-is. Results land in the account repo at + the same relative paths before tf-run executes. + Leave blank to skip template rendering. + Default: "" + MaxLength: 100 + + TemplateVars: + Type: String + Description: >- + JSON object of variables passed to Jinja2 when rendering .j2 files + from the TemplateRepo. Example: {"cluster_name": "my-eks", "env": "dev"} + Default: "{}" + ExtraFiles: Type: String Description: >- - JSON object mapping relative repo paths to file contents. - These files are written into the account repo before tf-run executes. + JSON object mapping relative repo paths to raw file contents. 
+ Applied after TemplateRepo rendering; keys here override template output. Example: {"vpc/west/my-config.tf": "# placeholder"} Default: "{}" @@ -109,6 +136,8 @@ Resources: region_dir: !Ref RegionDir git_branch: !Ref GitBranch tf_run_start_tag: !Ref TfRunStartTag + template_repo: !Ref TemplateRepo + template_vars: !Ref TemplateVars dry_run: !Ref DryRun extra_files: !Ref ExtraFiles From 3ab2abed735da7db99ff97c4219b893ea3f05189 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Mon, 11 May 2026 16:47:24 -0400 Subject: [PATCH 02/27] Incorporate morga471 review feedback and address Teams follow-up questions - buildspec.yml: - Clone terraform/support at build time for version governance (no more hardcoded 1.9.1/2.49.0 version strings; reads VERSION files from support repo) - S3 env vars are now prefixes (TF_BINARY_S3_PREFIX, GH_CLI_S3_PREFIX); filenames constructed dynamically from support repo VERSION files - Add SSH->HTTPS git URL rewrite so Terraform module SSH sources work via PAT - Add conditional cross-account assume-role step (TARGET_ACCOUNT_ID) - Add 169.254.170.2 to NO_PROXY (AL2023 ECS credential provider) - deploy/codebuild.tf: upgrade image to amazonlinux2023-x86_64-standard:4.0 - deploy/variables.tf: rename tf_binary_s3/gh_cli_s3 to *_prefix with updated descriptions and defaults - lambda/app.py: add optional target_account_id field; pass TARGET_ACCOUNT_ID to CodeBuild environmentVariablesOverride - service-catalog/product-template.yaml: add optional TargetAccountId parameter with AllowedPattern validation - docs/HOW-IT-WORKS.md: - Document version governance via terraform/support - Note AL2023 - Replace 'No SSH Git' constraint with SSH->HTTPS rewrite explanation - Add cross-account section explaining TARGET_ACCOUNT_ID and required role - Add 'Moving This System to a Different Account' section (Teams Q) --- buildspec.yml | 86 +++++++++++++++----- deploy/codebuild.tf | 12 +-- deploy/variables.tf | 12 +-- docs/HOW-IT-WORKS.md | 111 ++++++++++++++++++++++---- 
lambda/app.py | 12 +-- service-catalog/product-template.yaml | 19 ++++- 6 files changed, 201 insertions(+), 51 deletions(-) diff --git a/buildspec.yml b/buildspec.yml index f651f50..b5b7767 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -4,26 +4,35 @@ version: 0.2 # tf-run-executor buildspec # # Required env-var overrides per build (supplied by Lambda or manual CLI): -# ACCOUNT_REPO - account repo name, e.g. 229685449397-csvd-dev-platform-dev-gov -# LAYER - terraform layer: common | infrastructure | vpc -# REGION_DIR - region directory: east | west -# GITHUB_TOKEN - GHE PAT (type PLAINTEXT, value from Secrets Manager) +# ACCOUNT_REPO - account repo name, e.g. 229685449397-csvd-dev-platform-dev-gov +# LAYER - terraform layer: common | infrastructure | vpc +# REGION_DIR - region directory: east | west +# GITHUB_TOKEN - GHE PAT (type PLAINTEXT, value from Secrets Manager) # # Optional env-var overrides: -# GIT_BRANCH - branch to commit/PR from (default: repo-init) -# TF_RUN_START_TAG - tf-run.data TAG label to start from (default: empty = from top) -# EXTRA_FILES - JSON map {"relative/path": "content"} written before tf-run -# DRY_RUN - "true" = tf plan only, no apply (default: "false") +# GIT_BRANCH - branch to commit/PR from (default: repo-init) +# TF_RUN_START_TAG - tf-run.data TAG label to start from (default: empty = from top) +# TEMPLATE_REPO - GHE repo containing Jinja2/.tf template files (default: empty) +# TEMPLATE_VARS - JSON map of Jinja2 variables for template rendering (default: {}) +# EXTRA_FILES - JSON map {"relative/path": "content"} written after template rendering +# DRY_RUN - "true" = tf plan only, no apply (default: "false") +# TARGET_ACCOUNT_ID - AWS account ID to assume role in before running tf-run +# (default: empty = run with CodeBuild's own credentials, +# i.e. csvd-dev. Set this when targeting a different account.) 
# --------------------------------------------------------------------------- env: variables: GITHUB_ORG: "SCT-Engineering" - TF_BINARY_S3: "s3://csvd-packer-pipeline-assets/terraform/terraform_1.9.1_linux_amd64.zip" + # S3 prefixes — filenames are resolved at build time from terraform/support VERSION files. + # The S3 bucket must contain the version pinned in terraform/support (keep in sync). + TF_BINARY_S3_PREFIX: "s3://csvd-packer-pipeline-assets/terraform" + GH_CLI_S3_PREFIX: "s3://csvd-packer-pipeline-assets/tools" CENSUS_CA_S3: "s3://csvd-packer-pipeline-assets/certs/census-ca.pem" - GH_CLI_S3: "s3://csvd-packer-pipeline-assets/tools/gh_2.49.0_linux_amd64.tar.gz" + # Org-canonical version governance: clone this repo to read VERSION files + TERRAFORM_SUPPORT_REPO: "terraform/support" HTTPS_PROXY: "http://proxy.tco.census.gov:3128" - NO_PROXY: "github.e.it.census.gov,169.254.169.254" + NO_PROXY: "github.e.it.census.gov,169.254.169.254,169.254.170.2" # Per-build defaults (overridden via environmentVariablesOverride in Lambda) GIT_BRANCH: "repo-init" DRY_RUN: "false" @@ -31,16 +40,27 @@ env: TEMPLATE_REPO: "" TEMPLATE_VARS: "{}" EXTRA_FILES: "{}" + TARGET_ACCOUNT_ID: "" phases: install: commands: - # --- Terraform binary (registry.terraform.io is blocked; pull from S3) --- - - aws s3 cp "$TF_BINARY_S3" /tmp/terraform.zip + # --- Version governance: clone terraform/support to read org-canonical versions --- + # This repo (github.e.it.census.gov/terraform/support) is the single source of truth + # for which Terraform and gh CLI versions the org has blessed. We read VERSION files + # from it rather than hardcoding versions here. 
+ - git clone --depth 1 "https://${GITHUB_TOKEN}@github.e.it.census.gov/${TERRAFORM_SUPPORT_REPO}.git" /tmp/tf-support + - export TF_VERSION=$(cat /tmp/tf-support/terraform/VERSION) + - export GH_VERSION=$(cat /tmp/tf-support/github-cli-releases/VERSION) + - echo "Using Terraform ${TF_VERSION}, gh CLI ${GH_VERSION}" + + # --- Terraform binary (registry.terraform.io is blocked on Census network; use S3) --- + # S3 bucket must contain the version pinned in terraform/support/terraform/VERSION. + - aws s3 cp "${TF_BINARY_S3_PREFIX}/terraform_${TF_VERSION}_linux_amd64.zip" /tmp/terraform.zip - unzip -o /tmp/terraform.zip -d /usr/local/bin/ && chmod +x /usr/local/bin/terraform - ln -sf /usr/local/bin/terraform /usr/local/bin/tf - # --- Census CA certificate (GHE TLS) --- + # --- Census CA certificate (required for TLS to github.e.it.census.gov) --- - aws s3 cp "$CENSUS_CA_S3" /etc/pki/ca-trust/source/anchors/census-ca.pem - update-ca-trust extract @@ -55,18 +75,25 @@ phases: ln -sf /usr/local/bin/tf-control.sh /usr/local/bin/tf-${action}; done - # --- Python deps for tf-directory-setup.py --- + # --- Python deps for tf-directory-setup.py and template rendering --- - pip3 install --quiet jinja2 python-dateutil pyyaml - # --- gh CLI --- - - aws s3 cp "$GH_CLI_S3" /tmp/gh.tar.gz + # --- gh CLI (S3 bucket must contain the version pinned in terraform/support) --- + - aws s3 cp "${GH_CLI_S3_PREFIX}/gh_${GH_VERSION}_linux_amd64.tar.gz" /tmp/gh.tar.gz - mkdir -p /tmp/gh-cli - tar -xzf /tmp/gh.tar.gz -C /tmp/gh-cli --strip-components=1 - cp /tmp/gh-cli/bin/gh /usr/local/bin/gh && chmod +x /usr/local/bin/gh build: commands: - # --- Clone account repo over HTTPS (SSH is blocked by Census proxy) --- + # --- Configure git to rewrite SSH URLs to HTTPS --- + # Account repos reference Terraform modules via ssh://git@github.e.it.census.gov/... + # This rewrite makes those module fetches work transparently via HTTPS + PAT, + # avoiding the need for a per-repo deploy key. 
+ - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "ssh://git@github.e.it.census.gov/" + - git config --global --add url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "git@github.e.it.census.gov:" # --add appends; without it this replaces the ssh:// rule above + + # --- Clone account repo --- - git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" repo - cd repo - git checkout -B "${GIT_BRANCH}" @@ -137,6 +164,29 @@ phases: --allow-empty - git push origin "${GIT_BRANCH}" + # --- Assume cross-account role (if TARGET_ACCOUNT_ID is set) --- + # CodeBuild runs in csvd-dev by default. To run tf-run apply against resources + # in a different AWS account, set TARGET_ACCOUNT_ID. The role + # sc-automation-codebuild-role must exist in that account and trust the + # CodeBuild IAM role from csvd-dev. + - | + if [ -n "${TARGET_ACCOUNT_ID}" ]; then + PARTITION=$(aws sts get-caller-identity --query Arn --output text | cut -d: -f2) + ROLE_ARN="arn:${PARTITION}:iam::${TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role" + echo "Assuming cross-account role: ${ROLE_ARN}" + CREDS=$(aws sts assume-role \ + --role-arn "${ROLE_ARN}" \ + --role-session-name "sc-automation-${ACCOUNT_REPO}" \ + --query Credentials \ + --output json) + export AWS_ACCESS_KEY_ID=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['AccessKeyId'])") + export AWS_SECRET_ACCESS_KEY=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['SecretAccessKey'])") + export AWS_SESSION_TOKEN=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['SessionToken'])") + echo "Successfully assumed role in account ${TARGET_ACCOUNT_ID}" + else + echo "No TARGET_ACCOUNT_ID set — running with CodeBuild role (csvd-dev)" + fi + # --- Run Terraform in target layer/region directory --- # tf-run auto-proceeds on non-TTY stdin (read -t timeout defaults to "y") - cd "${LAYER}/${REGION_DIR}" diff --git a/deploy/codebuild.tf b/deploy/codebuild.tf index
d65d38a..1c198e7 100644 --- a/deploy/codebuild.tf +++ b/deploy/codebuild.tf @@ -27,7 +27,7 @@ resource "aws_codebuild_project" "tf_run_executor" { environment { compute_type = "BUILD_GENERAL1_SMALL" - image = "aws/codebuild/amazonlinux2-x86_64-standard:3.0" + image = "aws/codebuild/amazonlinux2023-x86_64-standard:4.0" type = "LINUX_CONTAINER" privileged_mode = false @@ -37,16 +37,16 @@ resource "aws_codebuild_project" "tf_run_executor" { value = var.github_org } environment_variable { - name = "TF_BINARY_S3" - value = var.tf_binary_s3 + name = "TF_BINARY_S3_PREFIX" + value = var.tf_binary_s3_prefix } environment_variable { name = "CENSUS_CA_S3" value = var.census_ca_s3 } environment_variable { - name = "GH_CLI_S3" - value = var.gh_cli_s3 + name = "GH_CLI_S3_PREFIX" + value = var.gh_cli_s3_prefix } environment_variable { name = "HTTPS_PROXY" @@ -54,7 +54,7 @@ resource "aws_codebuild_project" "tf_run_executor" { } environment_variable { name = "NO_PROXY" - value = "github.e.it.census.gov,169.254.169.254" + value = "github.e.it.census.gov,169.254.169.254,169.254.170.2" } # Placeholder values — always overridden by Lambda per-build environment_variable { diff --git a/deploy/variables.tf b/deploy/variables.tf index ea0c197..4558239 100644 --- a/deploy/variables.tf +++ b/deploy/variables.tf @@ -10,10 +10,10 @@ variable "source_repo_url" { # e.g. "https://github.e.it.census.gov/SCT-Engineering/sc-lambda-ghactions" } -variable "tf_binary_s3" { - description = "S3 URI for the Terraform Linux AMD64 zip (registry.terraform.io is blocked)" +variable "tf_binary_s3_prefix" { + description = "S3 URI prefix for Terraform Linux AMD64 zips (registry.terraform.io is blocked). Filename is resolved at build time from the terraform/support VERSION file." 
type = string - default = "s3://csvd-packer-pipeline-assets/terraform/terraform_1.9.1_linux_amd64.zip" + default = "s3://csvd-packer-pipeline-assets/terraform" } variable "census_ca_s3" { @@ -22,10 +22,10 @@ variable "census_ca_s3" { default = "s3://csvd-packer-pipeline-assets/certs/census-ca.pem" } -variable "gh_cli_s3" { - description = "S3 URI for the gh CLI Linux AMD64 tarball" +variable "gh_cli_s3_prefix" { + description = "S3 URI prefix for gh CLI Linux AMD64 tarballs. Filename is resolved at build time from the terraform/support github-cli-releases/VERSION file." type = string - default = "s3://csvd-packer-pipeline-assets/tools/gh_2.49.0_linux_amd64.tar.gz" + default = "s3://csvd-packer-pipeline-assets/tools" } variable "https_proxy" { diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md index b4308c5..2d1659e 100644 --- a/docs/HOW-IT-WORKS.md +++ b/docs/HOW-IT-WORKS.md @@ -50,7 +50,7 @@ without needing to operate their own build infrastructure. ▼ ┌─────────────────────────────────────────────────────────────────────┐ │ CodeBuild project: tf-run-executor │ -│ (Amazon Linux 2, 60-minute timeout) │ +│ (Amazon Linux 2023, 60-minute timeout) │ │ │ │ INSTALL phase │ │ • Downloads Terraform binary from S3 (registry.terraform.io is │ @@ -89,6 +89,7 @@ in the form: | **Account Repo Name** | The GHE repo that contains the target account's Terraform. Example: `229685449397-csvd-dev-platform-dev-gov` | | **Terraform Layer** | Which layer of the account repo to operate on: `common`, `infrastructure`, or `vpc` | | **Region Directory** | `east` or `west` — the subdirectory inside the layer | +| **Target AWS Account ID** | Optional. The AWS account ID where `tf-run apply` should make changes. When set, CodeBuild assumes `sc-automation-codebuild-role` in that account. Leave blank to run with the CodeBuild role's credentials (only works for resources in csvd-dev). 
| | **Template Repository** | Name of a GHE repo containing Jinja2 or raw file templates (optional) | | **Template Variables** | A JSON object of key/value pairs passed to Jinja2 when rendering templates (optional) | | **Extra Config Files** | A JSON object of `{"relative/path": "file content"}` written directly into the repo, bypassing templates (optional) | @@ -120,6 +121,7 @@ event of type `Create` (first provision) or `Update` (stack update). - `account_repo` — required, must be a non-empty string - `layer` — must be one of `common`, `infrastructure`, `vpc` - `region_dir` — must be one of `east`, `west` + - `target_account_id` — optional; 12-digit AWS account ID or empty string - `template_repo` — optional string; empty string means no template rendering - `template_vars` — optional; accepts a JSON string (as CFN sends it) or a dict - `extra_files` — optional; accepts a JSON string or a dict @@ -142,6 +144,7 @@ event of type `Create` (first provision) or `Update` (stack update). | `REGION_DIR` | `region_dir` field | | `GIT_BRANCH` | `git_branch` field | | `TF_RUN_START_TAG` | `tf_run_start_tag` field | + | `TARGET_ACCOUNT_ID` | `target_account_id` field | | `TEMPLATE_REPO` | `template_repo` field | | `TEMPLATE_VARS` | `template_vars` serialized to JSON string | | `EXTRA_FILES` | `extra_files` serialized to JSON string | @@ -166,14 +169,24 @@ no action. Terraform changes are not automatically reversed. ### Step 3 — CodeBuild: Install Phase -The `tf-run-executor` CodeBuild project runs on Amazon Linux 2. The install +The `tf-run-executor` CodeBuild project runs on **Amazon Linux 2023**. The install phase sets up every tool the build needs: -- **Terraform binary** — downloaded from - `s3://csvd-packer-pipeline-assets/terraform/terraform_1.9.1_linux_amd64.zip`. - `registry.terraform.io` is blocked on the Census network; direct provider - downloads from S3 bypass this. After extraction, `tf` is symlinked to the - binary. 
+- **Version governance** — the build first clones + `github.e.it.census.gov/terraform/support` (the org-canonical version registry) + and reads two `VERSION` files from it: + - `terraform/VERSION` → the Terraform version to install + - `github-cli-releases/VERSION` → the gh CLI version to install + + This means Terraform and gh CLI versions are governed centrally in that repo — + updating a VERSION file there automatically affects all future builds here + without any change to this repo. + +- **Terraform binary** — downloaded from S3 at + `s3://csvd-packer-pipeline-assets/terraform/terraform_{VERSION}_linux_amd64.zip`. + `registry.terraform.io` is blocked on the Census network. The S3 bucket must + contain the version pinned in `terraform/support` (kept in sync as an ops task). + After extraction, `tf` is symlinked to the binary. - **tf-{action} symlinks** — `tf-plan`, `tf-apply`, `tf-init`, etc. are all symlinks to `tf-control.sh`, which wraps each Terraform operation with logging, proxy settings, and version pinning from `.tf-control`. @@ -184,20 +197,33 @@ phase sets up every tool the build needs: already checked out as its source). - **Python packages** — `jinja2`, `python-dateutil`, `pyyaml` installed for template rendering and `tf-directory-setup.py`. -- **gh CLI** — downloaded from S3, used to open pull requests. +- **gh CLI** — downloaded from S3 at + `s3://csvd-packer-pipeline-assets/tools/gh_{VERSION}_linux_amd64.tar.gz`, + using the version read from `terraform/support`. 
--- ### Step 4 — CodeBuild: Clone Account Repo ```bash +# Rewrite SSH → HTTPS so Terraform module sources work without deploy keys +git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "ssh://git@github.e.it.census.gov/" +git config --global --add url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "git@github.e.it.census.gov:" + git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" repo cd repo git checkout -B "${GIT_BRANCH}" ``` -The account repo is cloned using HTTPS with the GHE PAT embedded in the URL. -SSH is not used — the Census proxy blocks SSH host key exchange. +The account repo is cloned over HTTPS with the GHE PAT embedded in the URL. +GHE is on the Census internal network; no proxy is required (it is in `NO_PROXY`). + +The **SSH→HTTPS git URL rewrite** handles a key problem: account repos reference +Terraform modules via SSH (`git::ssh://git@github.e.it.census.gov/...` or +`git@github.e.it.census.gov:...`). When `tf-run apply` executes, Terraform fetches +those modules — and without an SSH key in CodeBuild, those fetches would fail. +The global git rewrite transparently redirects all SSH-form GHE URLs to HTTPS + +PAT, so all module sources resolve without needing a per-repo deploy key. `git checkout -B` creates the branch if it does not exist, or resets it to the current `HEAD` if it does. Subsequent pushes will force-update this branch. @@ -301,6 +327,14 @@ between steps. In CodeBuild there is no TTY, so `read -t $TIMEOUT` returns immediately with a non-zero exit code and the default `y` is used — `tf-run` auto-proceeds through all steps without any manual intervention needed. +**Cross-account credentials**: If `TARGET_ACCOUNT_ID` is set, the build assumes +`arn:{partition}:iam::{TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role` via +`aws sts assume-role` immediately before `cd`-ing into the layer directory and +running `tf-run`.
The assumed credentials are exported as `AWS_ACCESS_KEY_ID`, +`AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN`, which Terraform and the AWS +provider pick up automatically. If `TARGET_ACCOUNT_ID` is empty, CodeBuild runs +with its own IAM role (scoped to csvd-dev). + **For the target directory to work**, the account repo must already have been bootstrapped. That means `//` must contain: @@ -447,7 +481,48 @@ specific file from the template repo for a particular deployment — use ## Key Constraints and Limitations -### Idempotency: Stack Updates +### Moving This System to a Different AWS Account + +The infrastructure is straightforward to relocate. All account-specific values +are Terraform variables — there are no hardcoded account IDs in the code. To +move the Lambda and CodeBuild project to a different account: + +1. Update `deploy/terraform.tfvars` with the new account's values +2. Run `tf init && tf apply` in the new account +3. Rebuild the Lambda image with `packer-pipeline` in the new account context +4. Update the Service Catalog `ServiceToken` ARN in each SC product template to + point to the new account's Lambda ARN (`deploy/service_catalog.tf` handles + this automatically via `!Sub` with the new account ID) +5. Update portfolio sharing in `deploy/service_catalog.tf` to share to the org + +No code changes are required — only variable and deployment target changes. + +### How CodeBuild Runs Terraform in a Different Account + +CodeBuild itself always runs in the account where it is deployed (initially +csvd-dev). When `TARGET_ACCOUNT_ID` is set on a product launch, the buildspec +assumes a cross-account IAM role in that account before running `tf-run apply`. 
+ +**For this to work, the target account must have a role named +`sc-automation-codebuild-role`** with: +- A trust policy allowing the CodeBuild IAM role from csvd-dev to assume it: + ```json + { + "Principal": { + "AWS": "arn:aws-us-gov:iam::229685449397:role/tf-run-executor-codebuild" + }, + "Action": "sts:AssumeRole" + } + ``` +- Permissions to read/write the S3 Terraform state bucket, create/modify the + resources the Terraform config manages, and write CloudWatch Logs + +This role is **not** created by this repo — it must be provisioned separately in +each target account before the SC product can successfully apply there. +Creating that role is a future work item (potentially via another SC product). + +If `TARGET_ACCOUNT_ID` is left blank, CodeBuild uses its own IAM role and can +only apply Terraform that targets resources within csvd-dev. The Lambda uses `{account_repo}-{layer}-{region_dir}` as the CloudFormation `PhysicalResourceId`. This means that if you update a stack but those three @@ -480,11 +555,17 @@ The Lambda timeout is 900 seconds (15 minutes). The CodeBuild project timeout is gives up, returning `LAMBDA_TIMEOUT`. The CodeBuild build continues running after a Lambda timeout — check the CodeBuild console for full logs. -### No SSH Git +### SSH Module Sources Are Supported (via URL Rewrite) + +Account repos reference Terraform modules using SSH-style URLs +(`git::ssh://git@github.e.it.census.gov/...` or `git@github.e.it.census.gov:...`). +These work transparently in CodeBuild because the build configures a global git +URL rewrite at the start of the BUILD phase that redirects all SSH-form GHE URLs +to HTTPS + PAT. No deploy keys or SSH configuration are required.
+Direct `git clone` of the account repo and template repo also uses HTTPS + PAT. +GHE is on the Census internal network and is in `NO_PROXY`, so no proxy is +involved in any GHE git operation. ### Delete Is a No-op diff --git a/lambda/app.py b/lambda/app.py index 9d4cf47..80e88ae 100644 --- a/lambda/app.py +++ b/lambda/app.py @@ -52,6 +52,7 @@ class TfRunRequest(BaseModel): extra_files: dict = Field(default_factory=dict, description='JSON map {"relative/path": "content"} written into account repo (after template rendering; overrides template output)') git_branch: str = Field(default="repo-init", description="Branch to commit and open PR from") dry_run: bool = Field(default=False, description="true = tf plan only, no apply") + target_account_id: str = Field(default="", description="AWS account ID to assume sc-automation-codebuild-role in before running tf-run; empty = run with CodeBuild role (csvd-dev)") @field_validator("extra_files", "template_vars", mode="before") @classmethod @@ -157,11 +158,12 @@ def start_codebuild_build( {"name": "REGION_DIR", "value": tf_req.region_dir, "type": "PLAINTEXT"}, {"name": "GIT_BRANCH", "value": tf_req.git_branch, "type": "PLAINTEXT"}, {"name": "TF_RUN_START_TAG", "value": tf_req.tf_run_start_tag, "type": "PLAINTEXT"}, - {"name": "TEMPLATE_REPO", "value": tf_req.template_repo, "type": "PLAINTEXT"}, - {"name": "TEMPLATE_VARS", "value": json.dumps(tf_req.template_vars), "type": "PLAINTEXT"}, - {"name": "EXTRA_FILES", "value": json.dumps(tf_req.extra_files), "type": "PLAINTEXT"}, - {"name": "DRY_RUN", "value": str(tf_req.dry_run).lower(), "type": "PLAINTEXT"}, - {"name": "GITHUB_TOKEN", "value": github_token, "type": "PLAINTEXT"}, + {"name": "TEMPLATE_REPO", "value": tf_req.template_repo, "type": "PLAINTEXT"}, + {"name": "TEMPLATE_VARS", "value": json.dumps(tf_req.template_vars), "type": "PLAINTEXT"}, + {"name": "EXTRA_FILES", "value": json.dumps(tf_req.extra_files), "type": "PLAINTEXT"}, + {"name": "DRY_RUN", "value": 
str(tf_req.dry_run).lower(), "type": "PLAINTEXT"}, + {"name": "TARGET_ACCOUNT_ID", "value": tf_req.target_account_id, "type": "PLAINTEXT"}, + {"name": "GITHUB_TOKEN", "value": github_token, "type": "PLAINTEXT"}, ], ) build_id = response["build"]["id"] diff --git a/service-catalog/product-template.yaml b/service-catalog/product-template.yaml index 7699101..7f0b1c7 100644 --- a/service-catalog/product-template.yaml +++ b/service-catalog/product-template.yaml @@ -11,11 +11,12 @@ Metadata: - Layer - RegionDir - Label: - default: "Execution Options" + default: "Execution Environment" Parameters: - GitBranch - TfRunStartTag - DryRun + - TargetAccountId - Label: default: "Template Repository (optional)" Parameters: @@ -39,6 +40,8 @@ Metadata: default: "tf-run Start Tag (optional)" DryRun: default: "Dry Run (plan only)" + TargetAccountId: + default: "Target AWS Account ID (optional)" TemplateRepo: default: "Template Repository Name (optional)" TemplateVars: @@ -97,6 +100,19 @@ Parameters: - "false" Default: "false" + TargetAccountId: + Type: String + Description: >- + AWS account ID that CodeBuild should run terraform apply against. + When set, CodeBuild assumes arn:{partition}:iam::{TargetAccountId}:role/sc-automation-codebuild-role + before executing tf-run. That role must exist in the target account with a + trust policy allowing the CodeBuild IAM role from csvd-dev (229685449397). + Leave blank to run with the CodeBuild role's own credentials (csvd-dev only). 
+ Default: "" + MaxLength: 12 + AllowedPattern: '^[0-9]{12}$|^$' + ConstraintDescription: Must be a 12-digit AWS account ID or empty + TemplateRepo: Type: String Description: >- @@ -138,6 +154,7 @@ Resources: tf_run_start_tag: !Ref TfRunStartTag template_repo: !Ref TemplateRepo template_vars: !Ref TemplateVars + target_account_id: !Ref TargetAccountId dry_run: !Ref DryRun extra_files: !Ref ExtraFiles From 42bd370389037be0b1e86d71bafd7b45c34a3d86 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Mon, 11 May 2026 16:54:22 -0400 Subject: [PATCH 03/27] docs: fix outdated sections in HOW-IT-WORKS.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Component overview BUILD phase: add missing step 6 (assume cross-account role) and renumber steps 7-8 accordingly - Component overview POST_BUILD: correct description — Lambda calls GHE API directly; PR_URL= line in logs is informational only (not parsed by Lambda) - Step 7 (Commit and Push): add git -c user.email/user.name config that is actually present in buildspec.yml - Step 10 (Lambda Returns Results): note the informational-only nature of PR_URL= more clearly - CFN outputs table: add snake_case aliases (pull_request_url, repository_url, branch_name) that Lambda actually emits alongside PascalCase variants --- docs/HOW-IT-WORKS.md | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md index 2d1659e..51d0dfe 100644 --- a/docs/HOW-IT-WORKS.md +++ b/docs/HOW-IT-WORKS.md @@ -66,11 +66,13 @@ without needing to operate their own build infrastructure. │ 3. Clone the template repo and render Jinja2 templates │ │ 4. Write any explicit extra files (override layer) │ │ 5. git commit + push │ -│ 6. cd into // and run tf-run or tf-plan │ -│ 7. Open or update a pull request via the gh CLI │ +│ 6. Assume cross-account role (if TARGET_ACCOUNT_ID is set) │ +│ 7. cd into // and run tf-run or tf-plan │ +│ 8. 
Open or update a pull request via the gh CLI │ │ │ │ POST_BUILD phase │ -│ • Emit PR_URL= line so Lambda can parse and return it to CFN │ +│ • Fetch PR URL via gh CLI and emit PR_URL= to build logs │ +│ (Lambda independently calls the GHE API to retrieve the PR URL) │ └─────────────────────────────────────────────────────────────────────┘ ``` @@ -290,7 +292,9 @@ customizations that do not belong in the reusable template. ```bash git add -A -git commit -m "SC automation: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ +git -c user.email="sc-automation@census.gov" \ + -c user.name="SC Automation" \ + commit -m "SC automation: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ --allow-empty git push origin "${GIT_BRANCH}" ``` @@ -372,17 +376,21 @@ with `gh pr view`. Once CodeBuild reports `SUCCEEDED`, the Lambda calls the GHE REST API (`/repos/{org}/{repo}/pulls?state=open&head={org}:{branch}`) to fetch the open PR URL for the branch. The `PR_URL=` line emitted in `post_build` is informational -only (visible in CodeBuild logs) — it is not parsed by the Lambda. +only (visible in CodeBuild logs as a convenience) — the Lambda does not parse +build output; it calls the GHE API independently. The Lambda then signals CloudFormation with: | CloudFormation Output | Value | |---|---| -| `PullRequestUrl` | URL of the open PR | -| `RepositoryUrl` | URL of the account repo | -| `BranchName` | Branch that was committed to | +| `PullRequestUrl` / `pull_request_url` | URL of the open PR | +| `RepositoryUrl` / `repository_url` | URL of the account repo | +| `BranchName` / `branch_name` | Branch that was committed to | | `CodeBuildBuildId` | Full CodeBuild build ID (useful for looking up logs) | +Both PascalCase and snake_case variants are returned for the first three outputs so +that `!GetAtt` works regardless of which form the consuming CFN template uses. + The CloudFormation stack transitions to `CREATE_COMPLETE` and the Service Catalog provisioned product shows as **AVAILABLE**. 
From 28aa9488469f36f9dc70f716241328c24ce8f066 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Mon, 11 May 2026 17:02:57 -0400 Subject: [PATCH 04/27] review: address morga471 round-2 feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code changes: - lambda/app.py: add 'global' to valid region_dir values (line 546 comment) - service-catalog/product-template.yaml: add 'global' to RegionDir AllowedValues Doc changes (HOW-IT-WORKS.md): - region_dir: document 'global' as valid value for non-regional resources (SSO, IAM) - Step 8: add tf-run plan vs tf-plan distinction; tf-plan skips symlink setup - Bootstrapping: rewrite section — remote_state.backend.tf is created by the REMOTE_STATE directive in tf-run.data; what must pre-exist is remote_state.yml. Running 'tf-run init' is the correct first-time setup path. - git-secret: add note that CodeBuild cannot run 'git secret reveal' (no GPG key) - Build Timeout: add EventBridge as future improvement over Lambda polling - SSH section: add note about service-user SSH key as alternative approach - Why csvd-dev: add note about future move to operations accounts - Delete: add recommended decommission path (PR-based removal + manual apply) - Rebuild Lambda: replace personal home path with generic, use tf apply instead of manual aws lambda update-function-code CLI call - Deploy: remove personal ~/aws-creds and /home/a/arnol377 paths --- docs/HOW-IT-WORKS.md | 95 ++++++++++++++++++++------- lambda/app.py | 2 +- service-catalog/product-template.yaml | 3 +- 3 files changed, 73 insertions(+), 27 deletions(-) diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md index 51d0dfe..77abe0f 100644 --- a/docs/HOW-IT-WORKS.md +++ b/docs/HOW-IT-WORKS.md @@ -90,7 +90,7 @@ in the form: |---|---| | **Account Repo Name** | The GHE repo that contains the target account's Terraform. 
Example: `229685449397-csvd-dev-platform-dev-gov` | | **Terraform Layer** | Which layer of the account repo to operate on: `common`, `infrastructure`, or `vpc` | -| **Region Directory** | `east` or `west` — the subdirectory inside the layer | +| **Region Directory** | `east`, `west`, or `global` — the subdirectory inside the layer. Use `global` for non-regional resources (SSO, IAM, org-level). | | **Target AWS Account ID** | Optional. The AWS account ID where `tf-run apply` should make changes. When set, CodeBuild assumes `sc-automation-codebuild-role` in that account. Leave blank to run with the CodeBuild role's credentials (only works for resources in csvd-dev). | | **Template Repository** | Name of a GHE repo containing Jinja2 or raw file templates (optional) | | **Template Variables** | A JSON object of key/value pairs passed to Jinja2 when rendering templates (optional) | @@ -331,6 +331,14 @@ between steps. In CodeBuild there is no TTY, so `read -t $TIMEOUT` returns immediately with a non-zero exit code and the default `y` is used — `tf-run` auto-proceeds through all steps without any manual intervention needed. +**`tf-run plan` vs `tf-plan`**: Note that the dry-run path invokes `tf-plan` +directly (the bare `tf-control.sh` wrapper around `terraform plan`). This skips +the `tf-run` orchestration layer, which means it will **not** create the initial +symlinks that a first-time `tf-run` pass would normally set up. If the layer has +never been run before, prefer `tf-run plan` (via `TF_RUN_START_TAG` pointing to +an early step, or by running `tf-run init` manually first) rather than dry-run +mode. + **Cross-account credentials**: If `TARGET_ACCOUNT_ID` is set, the build assumes `arn:{partition}:iam::{TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role` via `aws sts assume-role` immediately before `cd`-ing into the layer directory and @@ -340,14 +348,21 @@ provider pick up automatically. 
If `TARGET_ACCOUNT_ID` is empty, CodeBuild runs with its own IAM role (scoped to csvd-dev). **For the target directory to work**, the account repo must already have been -bootstrapped. That means `//` must contain: +initialised for that layer. That means `//` must contain: -- `remote_state.backend.tf` — S3 backend configuration -- `remote_state..tf` — symlink to the correct `.s3` or `.local` variant +- `remote_state.yml` — metadata used by `tf-directory-setup.py` to generate backend config - `tf-run.data` — step definitions for this layer/region - `.tf-control` at the repo root — Terraform version pin -These files are present in any properly initialized Census account repo. +`remote_state.backend.tf` is **not** required to pre-exist — the `REMOTE_STATE` +directive in `tf-run.data` generates it on first run by reading `remote_state.yml` +in the parent directories. What does need to exist is `remote_state.yml` and the +containing directory structure. + +**git-secret**: Account repos use `git-secret` for GPG-encrypted secrets. The +CodeBuild environment does not have the team's GPG key and cannot decrypt those +files. Builds that require decrypted secrets must ensure those values are supplied +via another mechanism (e.g. Secrets Manager) rather than relying on `git secret reveal`. --- @@ -425,6 +440,11 @@ of the Lambda, CodeBuild project, ECR repo, or Secrets Manager secret to every account. Centralizing in `csvd-dev` means there is one place to update and one set of credentials to rotate. +> **Future**: when the org has designated **operations accounts** (standard +> Census infrastructure tier), this system should move there rather than living +> in a team-owned account like `csvd-dev`. The move is straightforward — +> see *Moving This System to a Different AWS Account* below. + --- ## Template Repository Design Guide @@ -543,18 +563,25 @@ the new target location). This is intentional behavior. 
### Inputs the Lambda Enforces - `layer` must be exactly `common`, `infrastructure`, or `vpc` -- `region_dir` must be exactly `east` or `west` +- `region_dir` must be exactly `east`, `west`, or `global` - `template_vars` and `extra_files` must be valid JSON objects (or empty strings) These constraints exist because `tf-run` is directory-structured and expects to -`cd` into a path like `infrastructure/west/`. +`cd` into a path like `infrastructure/west/` or `common/global/`. -### Account Repo Must Be Pre-bootstrapped +### Account Repo Must Have `remote_state.yml` -`tf-run` does not create `remote_state.backend.tf` or set up the S3 backend -automatically. The target `//` directory must already have been -bootstrapped (via `tf-directory-setup.py`) before this automation can apply -Terraform to it. A future "init mode" will handle first-time setup. +The `REMOTE_STATE` directive in `tf-run.data` generates `remote_state.backend.tf` +automatically on first run — you do **not** need to manually create it beforehand. +However, it does require a `remote_state.yml` to already exist in the +`{layer}/{region_dir}/` directory (and its parents) with the correct account ID, +bucket name, and region metadata. This file is part of the standard account repo +structure and must be committed to the repo before this automation can run there. + +Running `tf-run init` against the layer is the correct way to set up a new +layer — it creates `remote_state.yml`, generates the backend file, and creates +the initial symlinks. This automation assumes `tf-run init` has already been run +at least once on the target layer/region. ### Build Timeout @@ -563,6 +590,13 @@ The Lambda timeout is 900 seconds (15 minutes). The CodeBuild project timeout is gives up, returning `LAMBDA_TIMEOUT`. The CodeBuild build continues running after a Lambda timeout — check the CodeBuild console for full logs.
+> **Future improvement**: polling synchronously inside the Lambda for up to 15 +> minutes is wasteful from a cost and concurrency standpoint. A better pattern +> is to have CodeBuild emit an EventBridge event on build completion, and have +> a separate Lambda (or Step Functions state machine) listen for it and signal +> CloudFormation. This would let the trigger Lambda exit immediately after +> starting the build. This is a known design limitation, tracked as future work. + ### SSH Module Sources Are Supported (via URL Rewrite) Account repos reference Terraform modules using SSH-style URLs @@ -575,12 +609,22 @@ Direct `git clone` of the account repo and template repo also uses HTTPS + PAT. GHE is on the Census internal network and is in `NO_PROXY`, so no proxy is involved in any GHE git operation. +> **Alternative**: A service-account SSH key (rather than a PAT URL rewrite) +> could also support SSH module sources. The URL rewrite approach was chosen +> because it requires no key management and works with the existing GHE PAT +> already used for repository access. A service-user SSH key is a viable +> alternative if the URL rewrite causes issues. + ### Delete Is a No-op CloudFormation `Delete` events (stack deletion or product termination) are acknowledged with `SUCCESS` immediately. No Terraform destroy is run. This is intentional — automated infra teardown is too risky without explicit human review. +The recommended decommission path is to use this same system to open a PR that +removes the resource definitions from the account repo, review and merge the PR +manually, then run `tf apply` outside of automation to destroy the resources. + --- ## Adding Support for a New Terraform Configuration @@ -603,32 +647,33 @@ The Lambda and CodeBuild infrastructure itself does not need to change. 
## Rebuilding the Lambda Image When `lambda/app.py`, `lambda/Dockerfile`, or `lambda/requirements.txt` change, -rebuild and push the container image: +rebuild and push the container image. From the directory containing your +`sc-lambda-ghactions` checkout (after sourcing your AWS credentials): ```bash -cd /home/a/arnol377/git/sc-lambda-ghactions -source ~/aws-creds +cd sc-lambda-ghactions packer-pipeline --config csvd_config_packer.hcl ``` -After the `tf-run-executor-builder` CodeBuild build succeeds, force the Lambda -to pull the new image: +After the `tf-run-executor-builder` CodeBuild build succeeds, run `tf apply` in +`deploy/` to update the Lambda to the new image digest — the Terraform resource +for the Lambda already references `latest` and a `tf apply` will detect and +push the update: ```bash -export AWS_DEFAULT_REGION=us-gov-west-1 -aws lambda update-function-code \ - --function-name tf-run-executor-trigger \ - --image-uri "229685449397.dkr.ecr.us-gov-west-1.amazonaws.com/tf-run-executor/lambda:latest" \ - --region us-gov-west-1 +cd deploy/ +tf plan # confirm only the Lambda image digest changes +tf apply ``` ## Deploying Infrastructure Changes +From the directory containing your `sc-lambda-ghactions` checkout (after sourcing your AWS credentials): + ```bash export AWS_DEFAULT_REGION=us-gov-west-1 -source ~/aws-creds -cd /home/a/arnol377/git/sc-lambda-ghactions/deploy -tf init # only needed after provider changes +cd sc-lambda-ghactions/deploy +tf init # only needed after provider or backend changes tf plan tf apply ``` diff --git a/lambda/app.py b/lambda/app.py index 80e88ae..eeb20dd 100644 --- a/lambda/app.py +++ b/lambda/app.py @@ -45,7 +45,7 @@ class TfRunRequest(BaseModel): account_repo: str = Field(..., description="Account repo name, e.g.
229685449397-csvd-dev-platform-dev-gov") layer: Literal["common", "infrastructure", "vpc"] = Field(..., description="Terraform layer") - region_dir: Literal["east", "west"] = Field(..., description="Region directory") + region_dir: Literal["east", "west", "global"] = Field(..., description="Region directory (east, west, or global for non-regional resources like SSO/IAM)") tf_run_start_tag: str = Field(default="", description="tf-run.data TAG label to start from; empty = from beginning") template_repo: str = Field(default="", description="GHE repo name containing Jinja2/raw template files to render into the account repo") template_vars: dict = Field(default_factory=dict, description='JSON map of variables passed to Jinja2 when rendering template_repo files') diff --git a/service-catalog/product-template.yaml b/service-catalog/product-template.yaml index 7f0b1c7..92c014d 100644 --- a/service-catalog/product-template.yaml +++ b/service-catalog/product-template.yaml @@ -70,10 +70,11 @@ Parameters: RegionDir: Type: String - Description: Region directory within the layer + Description: Region directory within the layer (use 'global' for non-regional resources like SSO/IAM) AllowedValues: - east - west + - global GitBranch: Type: String From 8cfa5f591308b2ff1632af2dc73bc33407fbcb52 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Mon, 11 May 2026 17:09:52 -0400 Subject: [PATCH 05/27] fix: source tf-run toolchain from terraform/support, not scripts/ The canonical versions of tf-run, tf-control.sh, and tf-directory-setup.py live in github.e.it.census.gov/terraform/support (local-app/ subtree). We already clone that repo during the INSTALL phase for VERSION governance, so sourcing the scripts from there costs nothing extra. 
- buildspec.yml: cp from /tmp/tf-support/local-app/ instead of CODEBUILD_SRC_DIR/scripts/ - scripts/: remove tf-run, tf-control.sh, tf-directory-setup.py (bundled copies) tf-run.py and tf-run.data remain (project-specific) - docs/HOW-IT-WORKS.md: update both references (component overview + Step 3) --- buildspec.yml | 9 +- docs/HOW-IT-WORKS.md | 7 +- scripts/tf-control.sh | 392 --------------- scripts/tf-directory-setup.py | 225 --------- scripts/tf-run | 921 ---------------------------------- 5 files changed, 9 insertions(+), 1545 deletions(-) delete mode 100755 scripts/tf-control.sh delete mode 100755 scripts/tf-directory-setup.py delete mode 100755 scripts/tf-run diff --git a/buildspec.yml b/buildspec.yml index b5b7767..f3029ba 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -64,10 +64,11 @@ phases: - aws s3 cp "$CENSUS_CA_S3" /etc/pki/ca-trust/source/anchors/census-ca.pem - update-ca-trust extract - # --- tf-run toolchain (sourced from this repo's scripts/) --- - - cp "$CODEBUILD_SRC_DIR/scripts/tf-run" /usr/local/bin/tf-run - - cp "$CODEBUILD_SRC_DIR/scripts/tf-control.sh" /usr/local/bin/tf-control.sh - - cp "$CODEBUILD_SRC_DIR/scripts/tf-directory-setup.py" /usr/local/bin/tf-directory-setup.py + # --- tf-run toolchain (sourced from terraform/support, already cloned above) --- + # Canonical versions live in terraform/support local-app/ — no copies kept in this repo. 
+ - cp /tmp/tf-support/local-app/tf-run/tf-run.sh /usr/local/bin/tf-run + - cp /tmp/tf-support/local-app/tf-control/tf-control.sh /usr/local/bin/tf-control.sh + - cp /tmp/tf-support/local-app/tf-directory-setup/tf-directory-setup.py /usr/local/bin/tf-directory-setup.py - chmod +x /usr/local/bin/tf-run /usr/local/bin/tf-control.sh /usr/local/bin/tf-directory-setup.py # Create tf-{action} symlinks expected by tf-run and account repo steps - > diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md index 77abe0f..284148e 100644 --- a/docs/HOW-IT-WORKS.md +++ b/docs/HOW-IT-WORKS.md @@ -57,7 +57,7 @@ without needing to operate their own build infrastructure. │ blocked on the Census network) │ │ • Installs the Census CA cert so GHE TLS works │ │ • Installs tf-run, tf-control.sh, tf-directory-setup.py from │ -│ this repo's scripts/ directory │ +│ terraform/support (already cloned for version governance) │ │ • Installs the gh CLI from S3 │ │ │ │ BUILD phase │ @@ -195,8 +195,9 @@ phase sets up every tool the build needs: - **Census CA cert** — installed from S3 into the system trust store via `update-ca-trust`. Required for TLS connections to `github.e.it.census.gov`. - **tf-run toolchain** — `tf-run`, `tf-control.sh`, and `tf-directory-setup.py` - are copied from the `scripts/` directory of this repo (which CodeBuild has - already checked out as its source). + are copied from `terraform/support` (`local-app/` subtree), which is already + cloned during the version governance step above. There are no script copies + bundled in this repo — `terraform/support` is the single source of truth. - **Python packages** — `jinja2`, `python-dateutil`, `pyyaml` installed for template rendering and `tf-directory-setup.py`. 
- **gh CLI** — downloaded from S3 at diff --git a/scripts/tf-control.sh b/scripts/tf-control.sh deleted file mode 100755 index ef1b36f..0000000 --- a/scripts/tf-control.sh +++ /dev/null @@ -1,392 +0,0 @@ -#!/bin/bash - -get_git_root() -{ - TOP=$(git rev-parse --show-toplevel 2>/dev/null) - if [ -z "$TOP" ] - then - TOP=$HOME - fi -} - -do_help() -{ - local ACTIONS=$@ - echo "* help: $THIS $VERSION" - echo " tf-{action}: runs command 'terraform {action}' with specific arguments" - echo " tf-{action} less [arguments]: shows the latest log file for the specific {action}" - echo " TFLESS=1 tf-{action} [arguments]: shows the latest log file for the specific {action}" - echo " tf-log {string} [list|{filename}: tails the last log matching the patern logs/{string}*, or provides a list of file, or uses a specific file" - echo " tf-cli {string}: runs the approriate terraform binary with whaterver {string} as arguments" - echo "" - echo "* environment variables" - echo " TFCONTROL: point to alternate .tf-control; default looks lin git-root, then \$HOME" - echo " TF_CLI_CONFIG_FILE: point to alternate .tf-control.tfrc; default looks lin git-root, then \$HOME, and default \$HOME/.terraformrc" - echo " TFARGS: extra args to pass to terraform command. Only applies to actions apply and destroy" - echo " TFNOCOLOR: color disabled by default -no-color, to enable, set this any value" - echo " TFNOLOG: setting this at all will disable logging through 'tee' to a file. Needed to pull tf-state properly" - echo " TFNOPROXY: do not auto-set proxy. With changes to the firewalls, proxy is needed, so we set it for init options" - echo "" - echo "* Available Actions:" - for a in $ACTIONS - do - echo " tf-${a}" - done - echo "" - echo "* Special Actions:" - echo " tf-plan summary: produces a list of items to create, destroy, replace, or update. Requires having run 'tf-plan' first" - echo " tf-apply summary: produces a list of items like plan but under an apply action. 
Requires having run 'tf-apply' first and then answering 'no'" - echo " tf-destroy summary: produces a list of items to destroy. Requires having run 'tf-destroy' first and then answering 'no'" - return 0 -} - -# pass things like -target= -# make aliases -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-init -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-plan -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-apply -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-destroy -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-refresh -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-output -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-validate -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-import -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-state -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-fmt -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-taint -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-console -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-log -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-cli - -THIS=$(basename $0) -VERSION="1.11.0" -ACTION=$(basename $THIS .sh | sed -e 's/^tf-//') -LOGDIR="logs" - -umask 002 - -# path or name of terraform binary -# get from top of git repo or $HOME/.tf-control -CURRENTDIR=$(pwd) -get_git_root -if [ -z "$TFCONTROL" ] -then - if [ -r $TOP/.tf-control ] - then - TFCONTROL=$TOP/.tf-control - elif [ -r $CURRENTDIR/.tf-control ] - then - TFCONTROL=$CURRENTDIR/.tf-control - elif [ -r $HOME/.tf-control ] - then - TFCONTROL=$HOME/.tf-control - fi -fi -# if .tf-control in git-root, override it with a local .tf-control.override -if [ -r $CURRENTDIR/.tf-control.override ] -then - TFCONTROL=$CURRENTDIR/.tf-control.override -fi -if [ ! 
-z "$TFCONTROL" ] -then - source $TFCONTROL -fi - -if [ -z $TFCOMMAND ] -then - TFCOMMAND="terraform" -fi - -# look for config file -if [ -z "$TF_CLI_CONFIG_FILE" ] -then - if [ -r $TOP/.tf-control.tfrc ] - then - export TF_CLI_CONFIG_FILE=$TOP/.tf-control.tfrc - elif [ -r $CURRENTDIR/.tf-control.tfrc ] - then - export TF_CLI_CONFIG_FILE=$CURRENTDIR/.tf-control.tfrc - elif [ -r $HOME/.tf-control.tfrc ] - then - export TF_CLI_CONFIG_FILE=$HOME/.tf-control.tfrc - else - unset TF_CLI_CONFIG_FILE - fi -fi - -# based on issue https://github.com/hashicorp/terraform/issues/32901 -# where shared provider cache doesn't work right for multiple users in 1.4. This is needed to get beyond 1.3. -# it really does belong in the .tf-control file but that's a lot of files to change -export TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1 - -ACTIONS="init plan apply destroy refresh output validate import state fmt taint console log cli" -declare -A actions -for action in $ACTIONS -do - actions["$action"]=$action -done - -if [[ ! -z "$1" ]] && [[ "$1" == "help" ]] -then - do_help $ACTIONS - exit 0 -fi - -# set soem TF_VAR variables ifnot already set -if [ -z "$TF_VAR_os_environment" ] -then - export TF_VAR_os_environment="{\"pwd\":\"$(pwd)\"}" -fi -if [ -z "$TF_VAR_os_username" ] -then - export TF_VAR_os_username=$USER -fi - -# calling the script directly installs into $BINDIR | /apps/terraform/bin -if [ $ACTION == "control" ] -then - if [ -z $BINDIR ] - then - location=$(which $THIS 2> /dev/null) - if [ -z $location ] - then - BINDIR="/apps/terraform/bin" - else - BINDIR=$(dirname $location) - fi - fi - umask 022 - echo "* installing $THIS v$VERSION in $BINDIR" - cp $THIS $BINDIR/ - chmod 755 $BINDIR/$THIS - for action in $ACTIONS - do - echo "+ enabling tf-$action to $BINDIR/$THIS" - ln -sf $BINDIR/$THIS $BINDIR/tf-$action - done - exit 0 -fi - -# pass TFCOLOR=-color if wanting colorized output -if [ -z $TFCOLOR ] -then - if [ ! 
-z "$TFNOCOLOR" ] - then - TFCOLOR="" - else - TFCOLOR="-no-color" - fi -fi - -if [ ! -d $LOGDIR ] -then - mkdir -p $LOGDIR -fi -YMDSTAMP=$(date +%Y%m%d) -start=$(date +%s) -STAMP="$YMDSTAMP.$start" -LOGFILE="$LOGDIR/$ACTION.$STAMP.log" - -if [[ ! -z $1 ]] && [[ $1 == "less" ]] || [[ ! -z "$TFLESS" ]] -then - file=$(ls $LOGDIR/$ACTION*.log 2> /dev/null | tail -n 1) - if [ -z "$file" ] - then - echo "* No log file for action=$ACTION. Please run 'tf-$ACTION' first" - exit 1 - else - echo "# results from file $file" - less $file - exit 0 - fi -fi -if ( [[ $ACTION == "plan" ]] || [[ $ACTION == "destroy" ]] || [[ $ACTION == "apply" ]] ) && [[ ! -z $1 ]] && [[ $1 == "summary" ]] -then - file=$(ls $LOGDIR/$ACTION*.log 2> /dev/null | tail -n 1) - if [ -z "$file" ] - then - echo "* Previous $ACTION file does not exist. Please run 'tf-$ACTION'" - exit 1 - fi - echo "* tf-$ACTION summary from log $file" - for op in created updated replaced destroyed - do - cc=$(grep -c " $op" $file) - echo "> to-be $op ($cc)" - grep " $op" $file - echo "" - done - for op in changed moved - do - cc=$(grep -cE "has \b$op\b" $file) - echo "> has $op ($cc)" - grep -E " has \b$op\b" $file - echo "" - done - grep ^Plan $file - exit 0 -fi -if [ $ACTION == "log" ] -then - logtype="$1" - logaction="$2" - if [[ ! -z "$logaction" ]] && [[ $logaction == "list" ]] - then - echo "* available files for pattern '${logtype}*':" - ls logs/${logtype}* -C1 | sed -e 's/^/ /' - echo "" - exit 0 - elif [[ ! -z "$logaction" ]] - then - TFLOGFILE=$logaction - else - TFLOGFILE=$(ls logs/${logtype}* 2> /dev/null | tail -n 1) - fi - if [ ! -z "$TFLOGFILE" ] - then - if [[ -z "$logaction" ]] - then - echo "* available files for pattern '${logtype}*':" - ls logs/${logtype}* -C1 | sed -e 's/^/ /' - echo "" - fi - echo "* showing logfile=$TFLOGFILE" - less $TFLOGFILE - exit $? 
- else - echo "* No log files exist matching pattern '${logtype}*'" - exit 1 - fi -fi - -if [ ${actions[$ACTION]} == $ACTION ] -then - ( echo "# starting v$VERSION action $ACTION file $LOGFILE stamp $STAMP time $start"; \ - echo "# current_directory=$(pwd)"; \ - echo "# git_repository=$(git remote -v show | grep fetch | awk '{print $2}')"; \ - echo "# git_current_branch=$(git branch | grep ^* | awk '{print $2}')"; \ - echo "# terraform_version=$($TFCOMMAND -v|grep ^Terraform)"; \ - echo "# TFCONTROL=$TFCONTROL"; \ - echo "# TF_CLI_CONFIG_FILE=$TF_CLI_CONFIG_FILE"; \ - echo "# TFARGS=\"$TFARGS\" TFNOCLOR=$TFNOCOLOR TFNOLOG=$TFNOLOG TFNOPROXY=$TFNOPROXY"; \ - echo "# env TF_VAR_ variables"; \ - printenv | grep TF_VAR_ | sed -e 's/^/# /'; \ - echo "" ) |& tee $LOGFILE -else - echo "* action:${actions[$ACTION]}" - echo "* invalid action $ACTION, exiting" - exit 1 -fi - -# TFARGS="" -r=0 -if [ $ACTION == "init" ] -then -# $TFCOMMAND init $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - if [[ -z "$TFNOPROXY" ]] && [[ -z "$HTTPS_PROXY" ]] && [[ -r "/apps/terraform/etc/set-proxy.sh" ]] - then - source /apps/terraform/etc/set-proxy.sh - fi - $TFCOMMAND init $TFCOLOR $@ |& tee -a $LOGFILE - r=$? -fi - -if [ $ACTION == "plan" ] -then -# $TFCOMMAND plan $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - $TFCOMMAND plan $TFCOLOR $@ |& tee -a $LOGFILE - r=$? -fi - -if [ $ACTION == "apply" ] -then - $TFCOMMAND apply $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - r=$? -fi - -if [ $ACTION == "destroy" ] -then - $TFCOMMAND destroy $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - r=$? -fi - -if [ $ACTION == "refresh" ] -then -# $TFCOMMAND refresh $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - $TFCOMMAND refresh $TFCOLOR $@ |& tee -a $LOGFILE - r=$? -fi - -if [ $ACTION == "validate" ] -then -# $TFCOMMAND validate $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - $TFCOMMAND validate $TFCOLOR $@ |& tee -a $LOGFILE - r=$? 
-fi - -if [ $ACTION == "output" ] -then -# $TFCOMMAND output $TFARGS $@ -# $TFCOMMAND output $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - $TFCOMMAND output $TFCOLOR $@ |& tee -a $LOGFILE - r=$? - exit $r -fi - -if [ $ACTION == "import" ] -then -# $TFCOMMAND import $TFARGS $@ |& tee -a $LOGFILE - $TFCOMMAND import $@ |& tee -a $LOGFILE - r=$? -fi - -# not recommended for pull -if [ $ACTION == "state" ] -then -# $TFCOMMAND state $TFARGS $@ |& tee -a $LOGFILE - if [ ! -z "$TFNOLOG" ] - then - $TFCOMMAND state $@ - else - $TFCOMMAND state $@ |& tee -a $LOGFILE - fi - r=$? -fi - -if [ $ACTION == "fmt" ] -then -# $TFCOMMAND fmt $TFARGS $@ |& tee -a $LOGFILE - $TFCOMMAND fmt $@ |& tee -a $LOGFILE - r=$? -fi - -if [ $ACTION == "taint" ] -then -# $TFCOMMAND taint $TFARGS $@ |& tee -a $LOGFILE - $TFCOMMAND taint $@ |& tee -a $LOGFILE - r=$? -fi - -# This doesnt work to 'tee' because we can't leave stdin on the terminal. -if [ $ACTION == "console" ] -then -# $TFCOMMAND console $TFARGS $@ |& tee -a $LOGFILE -# $TFCOMMAND console $@ |& tee -a $LOGFILE - echo "* TFNOLOG in effect for $ACTION" - $TFCOMMAND console $@ - r=$? -fi - -if [ $ACTION == "cli" ] -then -# $TFCOMMAND $TFARGS $@ |& tee -a $LOGFILE - $TFCOMMAND $@ |& tee -a $LOGFILE - r=$? 
-fi - -end=$(date +%s) -elapsed=$(( $end - $start )) -(echo "# ending v$VERSION action $ACTION file $LOGFILE stamp $STAMP start $start end $end elapsed $elapsed"; echo "") |& tee -a $LOGFILE - -echo "" -echo "# results in file $LOGFILE stamp $STAMP status=$r" -# echo $r diff --git a/scripts/tf-directory-setup.py b/scripts/tf-directory-setup.py deleted file mode 100755 index 7c7b5e1..0000000 --- a/scripts/tf-directory-setup.py +++ /dev/null @@ -1,225 +0,0 @@ -#!/apps/terraform/python/bin/python -# /bin/env python - -from jinja2 import Environment,FileSystemLoader -import os -#import csv -#import re -import sys -from pprint import pprint -from datetime import datetime,date,time -from dateutil import tz -from dateutil.parser import parse as date_parse -import yaml -import hashlib -import argparse -from pathlib import Path - -def parse_arguments(version): - parser = argparse.ArgumentParser(description="Setup directory for Terraform (remote state, links)",add_help=True) - parser.add_argument('filename', action='store', help="Configuration filename to read (remote_state.yml)", default='remote_state.yml', nargs='?') - parser.add_argument('--version', action='version', version='%(prog)s '+version) - parser.add_argument("-n","--dry-run", action="store_true", dest="dry_run", help="Dry run, do not create links or remote state configuration", default=False) - parser.add_argument("-d","--debug", action="store_true", dest="debug", help="debugging", default=False) - parser.add_argument("-v","--verbose", action="store_true", dest="verbose", help="verbose output", default=False) - parser.add_argument("-f","--force", action="store_true", dest="force", help="Force", default=False) - parser.add_argument("-l","--link", action="store", dest="link", help="Make link to .tf", choices=['none', 'local', 's3']) - args = parser.parse_args() - return args - -def touch_file(file): - if os.path.exists(file): - os.utime(file,None) - else: - open(file,'a').close() - -def read_yaml(file): - data={} 
- with open(file, 'r') as stream: - try: - data=yaml.full_load(stream) - except yaml.YAMLError as e: - print(e) - return None - return data - -def create_backend(args,version): - data=read_yaml(args.filename) -# initialize missing fields - data['make_links']=data.get('make_links',True) - - if args.debug: - print('* data =') - pprint(data) - print('* args =',args) - print("") - - dry_s="[dry-run] " if args.dry_run else "" - this_dir=os.getcwd() - -# print('args',args) -# sys.exit(0) - - file_loader=FileSystemLoader('/apps/terraform/template') - env=Environment( - loader=file_loader, - trim_blocks=True, - lstrip_blocks=True - ) - - data['directory']=data.get('directory','') - if data['directory'] == "": - print("* error, 'directory' cannot be empty") - sys.exit(1) - - tf_backend=env.get_template('remote_state.backend.tf.j2') - tf_backend_data_local=env.get_template('remote_state.data.tf.local.j2') - tf_backend_data_s3=env.get_template('remote_state.data.tf.s3.j2') - - tf_output=tf_backend.render(data=data) - tf_filename='remote_state.backend.tf' - if os.path.exists(tf_filename): - do_create=args.force - else: - do_create=True - if do_create: - print("* {}creating file {}".format(dry_s,tf_filename)) - if not args.dry_run: - with open(tf_filename, 'w') as tf_file: - tf_file.write(tf_output) - else: - if args.debug or args.verbose: - print("* {}not creating file {}".format(dry_s,tf_filename)) - - d=data['directory'].replace('/','_').replace('.','_') - data['directory_replaced']=d - - base_dir=this_dir.replace(data['directory'],'') - dir_paths=data['directory'].split(os.path.sep) - rp=['..'] * len(dir_paths) - rel_path=os.path.join(*rp) - if args.debug: - print("* this_dir={}\n base_dir={}\n directory={}".format(this_dir,base_dir,data['directory'])) - print(" path_length={}\n relative_path_to_top={}/".format(len(dir_paths),rel_path)) - - - tf_output=tf_backend_data_s3.render(data=data) - tf_filename='remote_state.%s.tf.s3' % d - if do_create: - print("* {}creating file 
{}".format(dry_s,tf_filename)) - if not args.dry_run: - with open(tf_filename, 'w') as tf_file: - tf_file.write(tf_output) - else: - if args.debug or args.verbose: - print("* {}not creating file {}".format(dry_s,tf_filename)) - - tf_output=tf_backend_data_local.render(data=data) - tf_filename='remote_state.%s.tf.local' % d - if do_create: - print("* {}creating file {}".format(dry_s,tf_filename)) - if not args.dry_run: - with open(tf_filename, 'w') as tf_file: - tf_file.write(tf_output) - else: - if args.debug or args.verbose: - print("* {}not creating file {}".format(dry_s,tf_filename)) - - tf_filename='remote_state.%s.tf.none' % d - if do_create: - print("* {}touching file {}".format(dry_s,tf_filename)) - if not args.dry_run: - touch_file(tf_filename) - else: - if args.debug or args.verbose: - print("* {}not touching file {}".format(dry_s,tf_filename)) - - tf_filename='remote_state.%s.tf' % d - if args.link is not None: - source_file='{}.{}'.format(tf_filename,args.link) - if os.path.exists(tf_filename): - if not os.path.islink(tf_filename): - print("* {}target file {} is not a link, fixing".format(dry_s,tf_filename)) - if not args.dry_run: - os.remove(tf_filename) - if args.verbose: - print("* {}removing file {}".format(dry_s,tf_filename)) - if not args.dry_run: - os.symlink(source_file, tf_filename) - print("* {}link {} to {}".format(dry_s,source_file, tf_filename)) - else: - if do_create: - print("* sample ln commands to run\n") - print("# ln -sf {}.none {}".format(tf_filename,tf_filename)) - print("# ln -sf {}.local {}".format(tf_filename,tf_filename)) - print("# ln -sf {}.s3 {}".format(tf_filename,tf_filename)) - - return - -#--- -# main -#--- -def main(): - version='2.2.2' - args=parse_arguments(version) - - create_backend(args,version) - return - -#--- -# main -#--- -if __name__ == '__main__': - main() - -# if make_links, then read all links in the parent directory and make a link to them -# if link_files [] exists, then make only links to the parent 
directory for those files -# first time through (or --init, --step1), remote state doesn't exist), make the link to -> none -# after remote state has stuff (--step2), or remote state link does exist, change the link -> s3 -# optional use remote_state as local (local: true | false) -# bring template files into this script (like with rotate-keys.py) -# add logging - -## directory: "common" -## profile: "123456789012-mycloud" -## bucket: "inf-tfstate-123456789012" -## bucket_region: "us-gov-west-1" -## region: "us-gov-west-1" -## regions: ["us-gov-west-1"] -## account_id: "123456789012" -## account_alias: "mycloud" -## aws_environment: "govcloud" -# -# provider_configs: -# - provider.ldap # gets link to ../provider_configs.d/provider.ldap.* ./ -# - provider.dns # gets link to ../provider_configs.d/provider.dns.* ./ -# -# parent_links: -# - (file) # makes link files in the parent directory. Expect this not to be needed with this new setup -# - random_parent_file.tf -# - random_parent_link.tf -# -# consider making TOP/remote_state.d and then link all to that, and link remote_state.d to each directory -# remote_state: -# - (component-directory) # finds and links remote state with the appropriate component. 
Examples: -# - infrastructure -# - infrastructure/west-1 -# - common/apps/myapp1 - - - -## cwd=Path.cwd() -## top=None -## for p in cwd.parents: -## if (p / "TOP").exists() or ( (p / "init").exists() and (p / "init").is_dir() ): -## top=p -## -## if top: -## rel=cwd.relative_to(top) -## rel_top=['..'] * len(rel.parts) -## rel_top_s=os.path.join(*rel_top) -## else: -## rel=None -## rel_top_s='' -## -## print('cwd={}\ntop={}\nrel={}\nrel_to_top={}'.format(cwd,top,rel,rel_top_s)) diff --git a/scripts/tf-run b/scripts/tf-run deleted file mode 100755 index 3c0bc01..0000000 --- a/scripts/tf-run +++ /dev/null @@ -1,921 +0,0 @@ -#!/bin/bash - -get_git_root() -{ - TOP=$(git rev-parse --show-toplevel 2>/dev/null) -} - -get_relative_to_git_root() -{ -# TOP=$(git rev-parse --show-toplevel 2>/dev/null) - get_git_root - TOPBASE=$(basename $TOP) - CWD=$(pwd) - RELATIVE_PATH=$(echo $CWD | sed -e "s/^.*$TOPBASE//" | tr -cd / | sed -e 's#/#../#g') -# return 0 -} - -get_relative_directory() -{ - local FILE=$1 - if [ -z "$FILE" ] - then - echo "* get_relative_up(): error, missing FILE" - return 1 - fi - CWD=$(pwd) - RELATIVE_PATH="" - local c=0 - while [[ ! -r "$CWD/$FILE" ]] || [[ -L "$CWD/$FILE" ]] - do -## echo "[debug] checking $CWD/$FILE" -# stop if at top of repo (.git directory) or at / - if [[ ! 
-r "$CWD/.git" ]] && [[ "$CWD" != "/" ]] - then - CWD=$(dirname $CWD) - RELATIVE_PATH+="../" - c=$(( c + 1 )) -## echo "[debug] going up cwd=$CWD c=$c path=$RELATIVE_PATH" - else -## echo "[debug] hit git root or / cwd=$CWD" - RELATIVE_PATH="" - c=-1 - break - fi - done - if [[ $c -ge 0 ]] && [[ -z "$RELATIVE_PATH" ]] - then -## echo "[debug] found in current directory" - RELATIVE_PATH="./" - fi -} - -get_profile() -{ - if [ -z $ERROR_GET_PROFILE ] - then - ERROR_GET_PROFILE=0 - fi - local FILES=$(ls *tfvars 2>/dev/null) - if [ -z "$AWS_PROFILE" ] - then - if [ -z "$FILES" ] - then - [ $ERROR_GET_PROFILE -gt 0 ] && echo "* [WARNING] cannot determine profile from *.tfvars" - ERROR_GET_PROFILE=$(( $ERROR_GET_PROFILE + 1 )) - return 1 - else - PROFILE=$(grep -E '^\bprofile\b *' $FILES | sed -e 's/^.*profile.* =//' -e 's/\"//g' -e 's/#.*$//' -e 's/^ *//' | head -n 1) - fi - else - PROFILE=$AWS_PROFILE - fi -# echo "* using profile=$PROFILE" - ERROR_GET_PROFILE=0 - return 0 -} - -get_region() -{ - if [ -z $ERROR_GET_REGION ] - then - ERROR_GET_REGION=0 - fi - local FILES=$(ls *tfvars 2>/dev/null) - if [ -z "$AWS_REGION" ] - then - if [ -z "$FILES" ] - then - [ $ERROR_GET_REGION -gt 0 ] && echo "* [WARNING] cannot determine region from *.tfvars" - ERROR_GET_REGION=$(( $ERROR_GET_REGION + 1 )) - return 1 - else - REGION=$(grep -E '^\bregion\b *' $FILES | sed -e 's/^.*region.* =//' -e 's/\"//g' -e 's/#.*$//' -e 's/^ *//' | head -n 1) - fi - else - REGION=$AWS_REGION - fi - if [ ! -z $REGION ] - then - if [[ $REGION =~ gov ]] - then - SHORT_REGION=$(echo $REGION | sed -e 's/^us-gov-//' -e 's/-[0-9]$//') - else - SHORT_REGION=$(echo $REGION | sed -e 's/^us-//') - fi - fi -# echo "* using region=$REGION short_region=$SHORT_REGION" - ERROR_GET_REGION=0 - return 0 -} - -# returns value in stdout -replace_placeholders() -{ - local item="$1" - - if [ ! -z $next ] - then - item=$(echo $item | sed -e "s/%%NEXT%%/$next/g") - fi - if [ ! 
-z $previous ] - then - item=$(echo $item | sed -e "s/%%PREVIOUS%%/$previous/g") - fi - if [ ! -z $PROFILE ] - then - item=$(echo $item | sed -e "s/%%PROFILE%%/$PROFILE/g") - fi - if [ ! -z $REGION ] - then - item=$(echo $item | sed -e "s/%%REGION%%/$REGION/g") - fi - if [ ! -z $SHORT_REGION ] - then - item=$(echo $item | sed -e "s/%%SHORT_REGION%%/${SHORT_REGION}/g") - fi - echo "$item" -} - -get_file_from_git() -{ - local FILE=$1 - local URL=$2 - local status=0 - if [[ -z "$FILE" ]] || [[ -z "$URL" ]] - then - echo "* missing FILE or URL argument" - status=1 - else - if [ -r "$FILE" ] - then - echo "* file $FILE exists, not overwriting" - else - echo "* getting init file $FILE" - curl -q -s -k -o "$FILE" "$URL" - status=$? - fi - fi - return $status -} - -do_clean() -{ - local WHAT=$1 - if [ -z "$WHAT" ] - then - WHAT="clean" - fi - - echo "* executing $WHAT, removing remote_state.*" - echo -n "> " - for f in $(ls remote_state.* -d) - do - rm $f && echo -n " $f" - done - echo "" - - echo "* executing $WHAT, removing links" - echo -n "> " - for f in $(find . 
-maxdepth 1 -type l -print) - do - rm $f && echo -n " $f" - done - echo "" - return 0 -} - -do_superclean() -{ - do_clean superclean - echo "* executing superclean, removing logs, .terraform files, terraform.tfstate files" - echo -n "> " - for f in $(ls -d logs .terraform .terraform*hcl terraform.tfstate* 2> /dev/null) - do - rm -rf $f && echo -n " $f" - done - echo "" - return 0 -} - -do_help() -{ - local ACTIONS=$@ - echo "* help: $THIS $VERSION" - echo " tf-run: ACTION [list | start_number | tag:start_tag] [end_number | tag:end_tag | only | +N]" - echo " ACTION = plan | apply | destroy | list | init | init-upgrade | clean | superclean | tags" - echo "" - echo " init: get a base tf-run.data if none exists, creates region.tf if not exist, creates locals.tf if not exists; also gets .tf-control* files" - echo " init-upgrade: get the .tf-control* files needed for using TF 1.x" - echo " check: looks over module calls for proper use, proper versions, other things" - echo " list: same as '$THIS plan list'" - echo " plan: run through the contents of tf-run.data and do a plan for each" - echo " apply: run through the contents of tf-run.data and do a apply for each" - echo " destroy: looks for a tf-run.destroy.data, and removes in that specific order. If missing, it attempts all at once." - echo " clean: removes remote_state.*, links." - echo " superclean: removes remote_state.*, links, logs/, .terraform files, terraform.tfstate files" - echo " tags: get a list of tags and its respective step number" - echo "" - echo " arguments:" - echo " * list: list out all of the steps, but don't execute anything" - echo " * start_number: start executing ACTION at step number start_number" - echo " * tag:start_tag: start executing ACTION at tag labelled start_tag. The 'tag:' must be present for this option" - echo " * end_number: stop executing ACTION after step number end_number" - echo " * tag:end_tag: stop executing ACTION on the step before the tag labelled end_tag. 
The 'tag:' must be present for this option" - echo " * only: execute ACTION for only the one step indicated by start" - echo " * +N: execute ACTION starting at start and ending at start + N" - echo "" - return 0 -} - -ask_continue() { - local continue_status=0 - local PREFIX=$1 - local DURATION=$2 - local DEFAULT=$3 - if [ -z $DURATON ] - then - DURATION=10 - fi - if [ -z $DEFAULT ] - then - DEFAULT="y" - fi -# echo "" -# read -n 1 -p "${PREFIX}continue [y|n: default=$DEFAULT]? " -t $DURATION CONTINUE < /dev/tty - read -n 1 -p "${PREFIX}continue [y|n: default=$DEFAULT]? " -t $DURATION CONTINUE - continue_status=$? - echo "" - - if [ -z $CONTINUE ] - then - CONTINUE=$DEFAULT - fi - if [[ $continue_status != 0 ]] - then - CONTINUE=$DEFAULT - else - if [[ $CONTINUE != "y" ]] && [[ $CONTINUE != "n" ]] - then - CONTINUE="n" - fi - fi -# echo "value=$CONTINUE status=$continue_status" -} - -umask 002 - -THIS=$(basename $0) -VERSION="1.13.13" -LOGDIR="logs" -if [ ! -d $LOGDIR ] -then - mkdir -p $LOGDIR -fi -#GITSYSTEM="gitlab" -GITSYSTEM="github" - -ACTION=$1 -if [ -z "$ACTION" ] -then - echo "* missing ACTION (plan | apply | destroy)" - exit 1 -fi - -if [[ ! 
-z "$ACTION" ]] && [[ "$ACTION" == "help" ]] -then - do_help - exit 0 -fi - -if [ $ACTION == "list" ] -then - ACTION="plan" - START="list" -fi - -# if [[ $ACTION == "plan" ]] || [[ $ACTION == "apply" ]] || [[ $ACTION == "init" ]] || [[ $ACTION == "destroy" ]] || [[ $ACTION == "clean" ]] || [[ $ACTION = "superclean" ]] -ALL_ACTIONS=(plan apply init init-upgrade check destroy clean superclean tags) -action_valid=0 -for a in "${ALL_ACTIONS[@]}" -do - if [ "$ACTION" == $a ] - then - action_valid=1 - break - fi -done - -if [ $action_valid == 1 ] -then - echo "* running action=$ACTION" -else - echo "* invalid action=$ACTION" - echo "" - do_help - exit 1 -fi - -YMDSTAMP=$(date +%Y%m%d) -stime=$(date +%s) -STAMP="$YMDSTAMP.$stime" -LOGFILE="$LOGDIR/run.$ACTION.$STAMP.log" - -# from: https://stackoverflow.com/questions/25833676/redirect-echo-output-in-shell-script-to-logfile -if [[ "$START" != "list" ]] && [[ "$ACTION" != "init" ]] -then - exec > >(tee -i $LOGFILE) - exec 2>&1 -else - LOGFILE="$LOGFILE (not-created)" -fi - -echo "* START: $THIS $VERSION start=$stime end=$etime logfile=$LOGFILE" -[[ ! -z $PROFILE ]] || get_profile -[[ ! 
-z $REGION ]] || get_region - -if [ $ACTION == "init" ] -then - get_git_root - - if [ $GITSYSTEM == "github" ] - then - get_file_from_git tf-run.data "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/tf-run/applications/base/tf-run.data" - get_file_from_git region.tf "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/tf-run/applications/base/region.tf" - get_file_from_git locals.tf "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/tf-run/applications/base/locals.tf" - get_file_from_git versions.tf "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/tf-run/applications/base/versions.tf" - elif [ $GITSYSTEM == "gitlab" ] - then - get_file_from_git tf-run.data "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/tf-run/applications/base/tf-run.data" - get_file_from_git region.tf "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/tf-run/applications/base/region.tf" - get_file_from_git locals.tf "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/tf-run/applications/base/locals.tf" - get_file_from_git versions.tf "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/tf-run/applications/base/versions.tf" - fi - - if [ ! -r "$TOP/.tf-control" ] - then - if [ $GITSYSTEM == "github" ] - then - get_file_from_git .tf-control "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control" - elif [ $GITSYSTEM == "gitlab" ] - then - get_file_from_git .tf-control "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control" - fi - fi - if [ ! 
-r "$TOP/.tf-control.tfrc" ] - then - if [ $GITSYSTEM == "github" ] - then - get_file_from_git .tf-control.tfrc "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control.tfrc" - elif [ $GITSYSTEM == "gitlab" ] - then - get_file_from_git .tf-control.tfrc "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control.tfrc" - fi - fi - exit 0 -fi - -if [ $ACTION == "init-upgrade" ] -then - get_git_root - if [ ! -r "$TOP/.tf-control" ] - then - if [ $GITSYSTEM == "github" ] - then - get_file_from_git .tf-control "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control" - elif [ $GITSYSTEM == "gitlab" ] - then - get_file_from_git .tf-control "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control" - fi - fi - if [ ! 
-r "$TOP/.tf-control.tfrc" ] - then - if [ $GITSYSTEM == "github" ] - then - get_file_from_git .tf-control.tfrc "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control.tfrc" - elif [ $GITSYSTEM == "gitlab" ] - then - get_file_from_git .tf-control.tfrc "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control.tfrc" - fi - fi - exit 0 -fi - -if [ $ACTION == "check" ] -then - TF_UPGRADE_MODULES="" - TF_UPGRADE_MODULES+=" aws-common-security-groups" - TF_UPGRADE_MODULES+=" aws-edl-launch-instance" - TF_UPGRADE_MODULES+=" aws-iam-role" - TF_UPGRADE_MODULES+=" aws-iam-user" - TF_UPGRADE_MODULES+=" aws-inf-setup" - TF_UPGRADE_MODULES+=" aws-s3" - TF_UPGRADE_MODULES+=" aws-setup-s3-object-logging" - TF_UPGRADE_MODULES+=" aws-tls-certificate" - TF_UPGRADE_MODULES+=" aws-vpc-setup" - TF_UPGRADE_MODULES+=" dns-lookup" - TF_UPGRADE_MODULES+=" aws-ecr-copy-images" - _TFSTRING="" - for f in $TF_UPGRADE_MODULES - do - if [ ! -z $_TFSTRING ] - then - _TFSTRING+="|" - fi - _TFSTRING+="$f" - done - - echo -n "* [check] outdated .tf with git::https format: " - c=$(cat *.tf 2>/dev/null| grep -cE '^[^#]*source.*git::https') - if [ $c == 0 ] - then - echo "OK" - else - echo "NOT-OK" - echo "* found $c source statements, please fix to git@ format." 
- grep -E '^[^#]*source.*git::https' *.tf - fi - echo "" - - echo -n "* [check] for .tf for eligible modules not using ?ref=tf-upgrade: " - c=$(cat *.tf | grep -E "^[^#]*source.*git.*($_TFSTRING)" | grep -c -v ref=tf-upgrade) - if [ $c == 0 ] - then - echo "OK" - else - echo "NOT-OK" - echo "* found $c source statements not referencing ref=tf-upgrade, please verify with the list at" - echo " https://${GITSYSTEM}.e.it.census.gov/terraform/support/blob/master/docs/how-to/terraform-upgrade/upgrade-code.md#modules" - echo " and change accordingly if the module here is on the list." - grep -E "^[^#]*source.*git.*($_TFSTRING)" *.tf | grep -v ref=tf-upgrade - fi - echo "" - exit 0 -fi - -if [[ $ACTION == "clean" ]] || [[ $ACTION == "superclean" ]] -then - echo ""; echo "About to execute $ACTION. This is destructive and will remove files." - ask_continue "Continue (y|n)? " "" "n" - if [ $CONTINUE == "y" ] - then - if [ $ACTION == "clean" ] - then - do_clean - else - do_superclean - fi - else - echo "* action $ACTION declined" - fi - exit 0 -fi - -TFRUNFILE_VERSION="" -if [ $ACTION == "destroy" ] -then - if [ -r "tf-run.destroy.data" ] - then - RUNFILE="tf-run.destroy.data" - else - TFRUNFILE_VERSION="generated.$STAMP" - RUNFILE="ALL" - fi -else - RUNFILE="tf-run.data" -fi - -# read file tf-run.data -declare -a targets=() -declare -a targets_status=() -declare -A tags -# TFRUNFILE_VERSION="" -# RUNFILE="tf-run.data" -if [ -r $RUNFILE ] -then - c=1 - pos=1 - echo "* reading from $RUNFILE" - while IFS="" read line - do - nline=$(echo $line | sed -e 's/^#.*$//') - if [ ! 
-z "$nline" ] - then - targets+=( "$line" ) - targets_status+=0 - words=( $line ) - pos=$(( pos + 1 )) - if [ ${words[0]} == "VERSION" ] - then - TFRUNFILE_VERSION=${words[1]} - pos=$(( pos - 1 )) - fi - if [ ${words[0]} == "TAG" ] - then - pos=$(( pos - 1 )) - tags[${words[1]}]=$pos - fi - fi - c=$(( $c + 1 )) - done < $RUNFILE - echo "* read ${#targets[@]} entries from $RUNFILE" -elif [ "$RUNFILE" == "ALL" ] - then - targets+=( "ALL" ) - targets_status+=0 - c=1 -else - echo "* unable to open tf-run.data, exiting" - exit 1 -fi -TOTAL_TARGETS=${#targets[@]} - -if [ $ACTION == "tags" ] -then - echo "* available TAGS and step numbers" - for t in "${!tags[@]}" - do - echo "TAG $t = ${tags[$t]}" - done - echo "" - exit 0 -fi - -if [ -z "$START" ] -then - START=$2 -fi -if [[ ! -z "$START" ]] && [[ "$START" == "list" ]] -then - LIST=1 - echo "> list" -elif [ ! -z "$START" ] -then - LIST=0 - - if [ $(echo "$START" | grep -c "^tag:" ) -gt 0 ] - then - START_TAG=$(echo "$START" | sed -e 's/^tag://') - if [ -z "$START_TAG" ] - then - echo "* start tag:NAME used but NAME is missing" - fi - START=${tags[$START_TAG]} - if [ -z "$START" ] - then - echo "* start tag:$START_TAG used but tag is not found" - fi - fi -else - LIST=0 -fi - -START=$(( START * 1 )) - -END=$3 -if [ ! 
-z "$END" ] -then - if [ "$END" == "only" ] - then - END=$START - elif [ $(echo "$END" | grep -c "^+") -gt 0 ] - then - NEND=$(echo "$END" | sed -e 's/^+//') - END=$(( START + NEND )) - elif [ $(echo "$END" | grep -c "^tag:" ) -gt 0 ] - then - END_TAG=$(echo "$END" | sed -e 's/^tag://') - if [ -z "$END_TAG" ] - then - echo "* end tag:NAME used but NAME is missing" - fi - END=$(( ${tags[$END_TAG]} - 1 )) - if [ -z "$END" ] - then - echo "* end tag:$END_TAG used but tag is not found" - fi - fi -fi -END=$(( END * 1 )) - -## c=1 -## for t in "${targets[@]}" -## do -## tfargs="" -## if [ "$t" != "ALL" ] -## then -## for tt in $t -## do -## tfargs+="-target=$tt " -## done -## fi -## echo "* $c tf-$ACTION $tfargs" -## c=$(( $c + 1 )) -## done -## exit 0 -## fi - -# add to history: .tf-run.history -TFR_HISTORY=".tf-run.history" - -#if [ ! -r $TFR_HISTORY ] -#then -# echo "timestamp,username,action,start,end,status" >> $TFR_HISTORY -#fi -#echo "$(date +%s),$USER,$ACTION,$START,$END,start" >> $TFR_HISTORY - -status=0 -echo ">> START: start_time=$stime version=$VERSION data.version=$TFRUNFILE_VERSION start=$START end=$END start_tag=$START_TAG" -echo "- profile=$PROFILE region=$REGION short_region=$SHORT_REGION" -c=0 -for t in "${targets[@]}" -do - c=$(( $c + 1 )) - next=$(( $c + 1 )) - - target=$t - words=( $t ) - w=${words[0]} - rest=$(echo "${words[@]:1}" | sed -e "s/%%NEXT%%/$next/g") - rest=$(replace_placeholders "$rest") - - if [[ "$w" == "VERSION" ]] || [[ "$w" == "TAG" ]] - then - TOTAL_TARGETS=$(( $TOTAL_TARGETS - 1 )) - fi - if [[ ! -z $START ]] && [[ $START -gt $c ]] - then - if [[ $w == "VERSION" ]] || [[ $w == "TAG" ]] - then - c=$(( $c - 1 )) - fi - continue - fi - if [[ ! 
-z $END ]] && [[ $END != 0 ]] && [[ $c -gt $END ]] - then - break -# echo "break c=$c end=$END" -# else -# echo "not break c=$c end=$END" - fi - - case $w in - REMOTE-STATE) - echo "> [$c] $w> generate-remote-state" - if [ $LIST == 0 ] - then - if [ -r ../remote_state.yml ] - then - cat ../remote_state.yml | sed -E s#\(^directory.*\)\"#\\1/$(basename $(pwd))\"# > remote_state.yml - status=$? - echo -n "* generated line: "; grep ^directory remote_state.yml - echo ""; echo "= Complete: $c $w> $rest | status=$status" - else - echo "* missing parent remote_state.yml, exiting" - status=1 - exit $status - fi - targets_status[$c]=$status - fi - continue - ;; - BACKUP-STATE) - echo "> [$c] $w> backup-state to $LOGDIR/backup.$STAMP.tfstate" - if [ $LIST == 0 ] - then - TFNOLOG=1 tf-state pull > $LOGDIR/backup.$STAMP.tftate - status=$? - echo ""; echo "= Complete: $c $w> backup-state | status=$status" - targets_status[$c]=$status - fi - c=$(( $c - 1 )) - continue - ;; - COMMAND) - echo "> [$c] $w> $rest" - if [ $LIST == 0 ] - then - $rest - status=$? - echo ""; echo "= Complete: $c $w> $rest | status=$status" - targets_status[$c]=$status - fi -# should check to see if we are running tf-directory-setup.py, so grab this stuff after each command if missing - [[ ! -z $PROFILE ]] || get_profile - [[ ! -z $REGION ]] || get_region - continue - ;; - LINKTOP) - LINKARG=$(replace_placeholders ${words[@]:1:1}) - get_relative_to_git_root - echo "> [$c] $w> ln -sf ${RELATIVE_PATH}${LINKARG} ./" - if [ $LIST == 0 ] - then - if [ -e "${RELATIVE_PATH}${LINKARG}" ] - then - ln -sf ${RELATIVE_PATH}${LINKARG} ./ - status=$? 
- else - echo "* linked-to file ${RELATIVE_PATH}${LINKARG} does not exist, skipping" - status=0 - fi - echo ""; echo "= Complete: $c $w> relative-link-to-top $LINKARG | status=$status" - targets_status[$c]=$status - fi - continue - ;; - LINK) - LINKARG=$(replace_placeholders ${words[@]:1:1}) - get_relative_directory $LINKARG - echo "> [$c] $w> ln -sf ${RELATIVE_PATH}${LINKARG} ./" - if [ $LIST == 0 ] - then - if [[ -e "${RELATIVE_PATH}${LINKARG}" ]] && [[ ! -z "${RELATIVE_PATH}" ]] - then -# don't make link if the file is found in the current directory as we'll make a looping link -# the get_relative_directory is looking for the real file in the hierarchy - if [ "${RELATIVE_PATH}" != "./" ] - then - ln -sf ${RELATIVE_PATH}${LINKARG} ./ - status=$? - fi - else - echo "* linked-to file ${RELATIVE_PATH}${LINKARG} does not exist in current or parent dirs, skipping" - status=0 - fi - echo ""; echo "= Complete: $c $w> relative-link $LINKARG | status=$status" - targets_status[$c]=$status - fi - continue - ;; - VERSION) - c=$(( $c - 1 )) - continue - ;; - TAG) -# this is a placeholder, almost like a go-to, but it does not increment the number - echo "" - echo "# [$w] $rest" - c=$(( $c - 1 )) - targets_status[$c]=$status - continue - ;; - COMMENT) - echo "> [$c] $w> $rest" - status=$? -## echo ""; echo "= Complete: $c $w> $rest | status=$status" - targets_status[$c]=$status - continue - ;; - CHECK) - echo "> [$c] $w> $rest" - status=$? - targets_status[$c]=$status - if [ $LIST == 0 ] - then - echo ""; echo "= Complete: $c $w> $rest | status=$status" - ask_continue "} Next: $next, " - if [ $CONTINUE == "n" ] - then - break - fi - fi - continue - ;; - PAUSE) - if [ -z $rest ] - then - SLEEP=15 - else - SLEEP=$rest - fi - if [ $LIST == 0 ] - then - echo "> [$c] $w> sleeping for $SLEEP" - sleep $SLEEP - else - echo "> [$c] $w> would sleep for $SLEEP (actual sleep 2)" - sleep 2 - fi - status=$? 
- targets_status[$c]=$status - if [ $LIST == 0 ] - then - echo ""; echo "= Complete: $c $w> $rest | status=$status" - ask_continue "} Next: $next, " - if [ $CONTINUE == "n" ] - then - break - fi - fi - continue - ;; - STOP) - echo "> [$c] $w> $rest" - status=$? - if [ $LIST == 0 ] - then - echo ""; echo "= Complete: $c $w> $rest | status=$status" - echo "- Continue $next: $THIS $ACTION $next" - break - else - continue - fi - ;; - CLEAN) - ;; - POLICY) - PFILES="${words[@]:1}" - if [ -z $PFILES ] - then - PFILES="*.tf" - fi - PTARGETS=( $(grep -iE "^resource\b.*aws_iam_policy\b" $PFILES | awk '{print $2 "." $3}' |sed -e 's/"//g') ) - status=$? - echo "> [$c] $w> ($PFILES) ${PTARGETS[@]}" - target="${PTARGETS[@]}" - targets_status[$c]=$status - if [ -z "$target" ] - then - if [ $LIST == 0 ] - then - echo "= No policy targets found, skipping this step | status=$status" - echo "- Continue $next: $THIS $ACTION $next" - fi - continue - fi - ;; - *) - ;; - esac - - if [ $status != 0 ] - then - echo "* error encountered, status=$status; exiting" - exit $status - fi - - tfargs="" - if [ "$t" != "ALL" ] - then - for tt in $target - do - tfargs+="-target=$tt " - done - fi - if [ $LIST == 1 ] - then - echo "> [$c] tf-$ACTION $TFOPTIONS $tfargs" - continue - fi - - echo "> [$c] tf-$ACTION $TFOPTIONS $tfargs" - if [ -z $DRY_RUN ] - then - tf-$ACTION $TFOPTIONS $tfargs - else - echo " (dry-run)" - fi - status=$? 
- targets_status[$c]=$status - - if [ $status != 0 ] - then - echo "> [$c] exiting status=$status" - break - fi - - if [ -z $DRY_RUN ] - then - echo ""; echo "= Complete: $c $w> $rest | status=$status" - ask_continue "} Next: $next, " -# ask_continue - fi - - if [ $CONTINUE == "n" ] - then - break - fi -done - -etime=$(date +%s) -xtime=$(( $etime - $stime )) -if [ $c -ge $TOTAL_TARGETS ] -then - if [ $LIST == 0 ] - then - echo "<< COMPLETE $c/$TOTAL_TARGETS targets" - fi -else - if [ $LIST == 0 ] - then - echo "<< INCOMPLETE $c/$TOTAL_TARGETS last_item=$c" - fi -fi -echo "<< END: start_time=$stime end_time=$etime elapsed=$xtime logfile=$LOGFILE status=$status" -exit $status - -## TO DO -# add LINK to make a link to a file in successive parent directories (variables.vpc.tf, for example) -# make regionshort -# use get_region and get_profile From 87c09261c1c6677a21b70fb87019bd276f283083 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Mon, 11 May 2026 17:43:57 -0400 Subject: [PATCH 06/27] feat: Enhance Terraform execution with proposer and executor templates - Updated `TfRunRequest` model to differentiate between propose and apply actions, adding relevant fields for each. - Refactored `start_codebuild_build` function to handle environment variable overrides based on the action type. - Implemented logic in `lambda_handler` to manage responses for both propose and apply actions. - Added new CloudFormation templates for the proposer and executor products, enabling structured Terraform change proposals and applications. - The proposer template handles rendering templates and opening pull requests, while the executor template applies changes after PR approval. 
--- buildspec-executor.yml | 129 ++++ buildspec-proposer.yml | 155 +++++ buildspec.yml | 225 ------- buildspec.yml.j2 | 93 --- deploy/codebuild.tf | 114 +++- deploy/iam.tf | 16 +- deploy/lambda.tf | 3 +- deploy/service_catalog.tf | 95 ++- docs/HOW-IT-WORKS.md | 841 +++++++------------------ lambda/app.py | 88 ++- service-catalog/executor-template.yaml | 127 ++++ service-catalog/proposer-template.yaml | 145 +++++ 12 files changed, 1027 insertions(+), 1004 deletions(-) create mode 100644 buildspec-executor.yml create mode 100644 buildspec-proposer.yml delete mode 100644 buildspec.yml delete mode 100644 buildspec.yml.j2 create mode 100644 service-catalog/executor-template.yaml create mode 100644 service-catalog/proposer-template.yaml diff --git a/buildspec-executor.yml b/buildspec-executor.yml new file mode 100644 index 0000000..8bf5620 --- /dev/null +++ b/buildspec-executor.yml @@ -0,0 +1,129 @@ +version: 0.2 + +# --------------------------------------------------------------------------- +# tf-run-executor buildspec +# +# Purpose: clone account repo main branch, optionally assume a cross-account +# IAM role, and run tf-run apply in the target layer/region directory. +# This is triggered AFTER a proposer PR has been reviewed and merged. +# It does not render templates or open a PR. +# +# Required env-var overrides per build (supplied by Lambda): +# ACCOUNT_REPO - account repo name, e.g. 
229685449397-csvd-dev-platform-dev-gov +# LAYER - terraform layer: common | infrastructure | vpc +# REGION_DIR - region directory: east | west | global +# GITHUB_TOKEN - GHE PAT (PLAINTEXT, value from Secrets Manager) +# +# Optional env-var overrides: +# TARGET_ACCOUNT_ID - AWS account ID to assume sc-automation-codebuild-role in +# (default: empty = run with CodeBuild role, csvd-dev only) +# TF_RUN_START_TAG - tf-run.data TAG label to start from (default: empty = from top) +# DRY_RUN - "true" = tf-run plan only, no apply (default: "false") +# --------------------------------------------------------------------------- + +env: + variables: + GITHUB_ORG: "SCT-Engineering" + TF_BINARY_S3_PREFIX: "s3://csvd-packer-pipeline-assets/terraform" + GH_CLI_S3_PREFIX: "s3://csvd-packer-pipeline-assets/tools" + CENSUS_CA_S3: "s3://csvd-packer-pipeline-assets/certs/census-ca.pem" + TERRAFORM_SUPPORT_REPO: "terraform/support" + HTTPS_PROXY: "http://proxy.tco.census.gov:3128" + NO_PROXY: "github.e.it.census.gov,169.254.169.254,169.254.170.2" + # Per-build defaults (overridden via environmentVariablesOverride in Lambda) + TARGET_ACCOUNT_ID: "" + TF_RUN_START_TAG: "" + DRY_RUN: "false" + +phases: + install: + commands: + # --- Version governance: clone terraform/support to read org-canonical versions --- + - git clone --depth 1 "https://${GITHUB_TOKEN}@github.e.it.census.gov/${TERRAFORM_SUPPORT_REPO}.git" /tmp/tf-support + - export TF_VERSION=$(cat /tmp/tf-support/terraform/VERSION) + - export GH_VERSION=$(cat /tmp/tf-support/github-cli-releases/VERSION) + - echo "Using Terraform ${TF_VERSION}, gh CLI ${GH_VERSION}" + + # --- Terraform binary (registry.terraform.io is blocked on Census network; use S3) --- + - aws s3 cp "${TF_BINARY_S3_PREFIX}/terraform_${TF_VERSION}_linux_amd64.zip" /tmp/terraform.zip + - unzip -o /tmp/terraform.zip -d /usr/local/bin/ && chmod +x /usr/local/bin/terraform + - ln -sf /usr/local/bin/terraform /usr/local/bin/tf + + # --- Census CA certificate (required 
for TLS to github.e.it.census.gov) --- + - aws s3 cp "$CENSUS_CA_S3" /etc/pki/ca-trust/source/anchors/census-ca.pem + - update-ca-trust extract + + # --- tf-run toolchain (sourced from terraform/support, already cloned above) --- + # Canonical versions live in terraform/support local-app/ — no copies kept in this repo. + - cp /tmp/tf-support/local-app/tf-run/tf-run.sh /usr/local/bin/tf-run + - cp /tmp/tf-support/local-app/tf-control/tf-control.sh /usr/local/bin/tf-control.sh + - cp /tmp/tf-support/local-app/tf-directory-setup/tf-directory-setup.py /usr/local/bin/tf-directory-setup.py + - chmod +x /usr/local/bin/tf-run /usr/local/bin/tf-control.sh /usr/local/bin/tf-directory-setup.py + # Create tf-{action} symlinks expected by tf-run and account repo steps + - > + for action in init plan apply destroy refresh output validate import state fmt taint console; do + ln -sf /usr/local/bin/tf-control.sh /usr/local/bin/tf-${action}; + done + + # --- Python deps for tf-directory-setup.py --- + - pip3 install --quiet python-dateutil pyyaml + + # --- gh CLI (from S3, for any post-apply verification steps) --- + - aws s3 cp "${GH_CLI_S3_PREFIX}/gh_${GH_VERSION}_linux_amd64.tar.gz" /tmp/gh.tar.gz + - mkdir -p /tmp/gh-cli && tar -xzf /tmp/gh.tar.gz -C /tmp/gh-cli --strip-components=1 + - cp /tmp/gh-cli/bin/gh /usr/local/bin/gh && chmod +x /usr/local/bin/gh + + build: + commands: + # --- Configure git to rewrite SSH URLs to HTTPS --- + # Module sources in account repos use ssh://git@github.e.it.census.gov/... or git@... + # This rewrite transparently redirects them to HTTPS + PAT at the git layer. 
+ - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "ssh://git@github.e.it.census.gov/" + - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "git@github.e.it.census.gov:" + + # --- Clone account repo from main (the reviewed + merged state) --- + - git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" repo + - cd repo + # Verify we are on main (not a work branch) + - git checkout main + - echo "Applying from $(git rev-parse --short HEAD) on main" + + # --- Assume cross-account role (if TARGET_ACCOUNT_ID is set) --- + # The role sc-automation-codebuild-role must exist in the target account and + # trust the CodeBuild IAM role from the central account (csvd-dev). + - | + if [ -n "${TARGET_ACCOUNT_ID}" ]; then + PARTITION=$(aws sts get-caller-identity --query Arn --output text | cut -d: -f2) + ROLE_ARN="arn:${PARTITION}:iam::${TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role" + echo "Assuming cross-account role: ${ROLE_ARN}" + CREDS=$(aws sts assume-role \ + --role-arn "${ROLE_ARN}" \ + --role-session-name "sc-automation-${ACCOUNT_REPO}" \ + --query Credentials \ + --output json) + export AWS_ACCESS_KEY_ID=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['AccessKeyId'])") + export AWS_SECRET_ACCESS_KEY=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['SecretAccessKey'])") + export AWS_SESSION_TOKEN=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['SessionToken'])") + echo "Assumed role in account ${TARGET_ACCOUNT_ID}" + else + echo "No TARGET_ACCOUNT_ID set — running with CodeBuild role (csvd-dev only)" + fi + + # --- Run Terraform in target layer/region directory --- + # tf-run auto-proceeds on non-TTY stdin (read -t timeout defaults to "y") + - cd "${LAYER}/${REGION_DIR}" + - | + if [ "${DRY_RUN}" = "true" ]; then + echo "DRY_RUN=true — running tf-run plan only" + 
TFARGS="-no-color" tf-run plan + elif [ -n "${TF_RUN_START_TAG}" ]; then + TFARGS="-auto-approve" tf-run apply "tag:${TF_RUN_START_TAG}" + else + TFARGS="-auto-approve" tf-run apply + fi + + post_build: + commands: + - echo "BUILD_RESULT=${CODEBUILD_BUILD_SUCCEEDING}" + - echo "ACCOUNT_REPO=${ACCOUNT_REPO}" + - echo "LAYER=${LAYER} REGION_DIR=${REGION_DIR}" diff --git a/buildspec-proposer.yml b/buildspec-proposer.yml new file mode 100644 index 0000000..ddccbaa --- /dev/null +++ b/buildspec-proposer.yml @@ -0,0 +1,155 @@ +version: 0.2 + +# --------------------------------------------------------------------------- +# tf-run-proposer buildspec +# +# Purpose: clone account repo, render template files, write extra files, +# commit + push to a work branch, open a PR for human review. +# Does NOT run Terraform — that is the executor's job after merge. +# +# Required env-var overrides per build (supplied by Lambda): +# ACCOUNT_REPO - account repo name, e.g. 229685449397-csvd-dev-platform-dev-gov +# LAYER - terraform layer: common | infrastructure | vpc +# REGION_DIR - region directory: east | west | global +# GITHUB_TOKEN - GHE PAT (PLAINTEXT, value from Secrets Manager) +# +# Optional env-var overrides: +# GIT_BRANCH - branch to commit/PR from (default: propose/sc-automation) +# TEMPLATE_REPO - GHE repo containing Jinja2/.tf template files +# TEMPLATE_VARS - JSON map of Jinja2 variables for template rendering +# EXTRA_FILES - JSON map {"relative/path": "content"} written after template rendering +# --------------------------------------------------------------------------- + +env: + variables: + GITHUB_ORG: "SCT-Engineering" + CENSUS_CA_S3: "s3://csvd-packer-pipeline-assets/certs/census-ca.pem" + # Org-canonical version governance repo (needed for gh CLI version) + TERRAFORM_SUPPORT_REPO: "terraform/support" + GH_CLI_S3_PREFIX: "s3://csvd-packer-pipeline-assets/tools" + HTTPS_PROXY: "http://proxy.tco.census.gov:3128" + NO_PROXY: 
"github.e.it.census.gov,169.254.169.254,169.254.170.2" + # Per-build defaults (overridden via environmentVariablesOverride in Lambda) + GIT_BRANCH: "propose/sc-automation" + TEMPLATE_REPO: "" + TEMPLATE_VARS: "{}" + EXTRA_FILES: "{}" + +phases: + install: + commands: + # --- Version governance: clone terraform/support to read org-canonical versions --- + - git clone --depth 1 "https://${GITHUB_TOKEN}@github.e.it.census.gov/${TERRAFORM_SUPPORT_REPO}.git" /tmp/tf-support + - export GH_VERSION=$(cat /tmp/tf-support/github-cli-releases/VERSION) + - echo "Using gh CLI ${GH_VERSION}" + + # --- Census CA certificate (required for TLS to github.e.it.census.gov) --- + - aws s3 cp "$CENSUS_CA_S3" /etc/pki/ca-trust/source/anchors/census-ca.pem + - update-ca-trust extract + + # --- Python deps for template rendering --- + - pip3 install --quiet jinja2 + + # --- gh CLI (from S3, version pinned in terraform/support) --- + - aws s3 cp "${GH_CLI_S3_PREFIX}/gh_${GH_VERSION}_linux_amd64.tar.gz" /tmp/gh.tar.gz + - mkdir -p /tmp/gh-cli && tar -xzf /tmp/gh.tar.gz -C /tmp/gh-cli --strip-components=1 + - cp /tmp/gh-cli/bin/gh /usr/local/bin/gh && chmod +x /usr/local/bin/gh + + build: + commands: + # --- Configure git to rewrite SSH URLs to HTTPS --- + # Account repos reference Terraform modules via ssh://git@github.e.it.census.gov/... + # This rewrite makes those paths visible to git without requiring an SSH key. 
+ - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "ssh://git@github.e.it.census.gov/" + - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "git@github.e.it.census.gov:" + + # --- Clone account repo and check out (or create) the work branch --- + - git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" repo + - cd repo + - git checkout -B "${GIT_BRANCH}" + + # --- Render template repo (if specified) --- + # Clone TEMPLATE_REPO, render .j2 files with TEMPLATE_VARS via Jinja2 StrictUndefined, + # copy non-template files as-is. Results land at the same relative paths in the account repo. + - | + if [ -n "${TEMPLATE_REPO}" ]; then + git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${TEMPLATE_REPO}.git" /tmp/template-repo + python3 - <<'PYEOF' + import json, os, pathlib, shutil + from jinja2 import Environment, FileSystemLoader, StrictUndefined + + template_vars = json.loads(os.environ.get('TEMPLATE_VARS', '{}')) + src_root = pathlib.Path('/tmp/template-repo') + dst_root = pathlib.Path('.') # already inside cloned account repo + + rendered = 0 + copied = 0 + for src in src_root.rglob('*'): + if src.is_dir() or any(part.startswith('.git') for part in src.parts): + continue + rel = src.relative_to(src_root) + if src.suffix == '.j2': + dst = dst_root / rel.with_suffix('') + dst.parent.mkdir(parents=True, exist_ok=True) + env = Environment( + loader=FileSystemLoader(str(src.parent)), + undefined=StrictUndefined, + keep_trailing_newline=True, + ) + content = env.get_template(src.name).render(**template_vars) + dst.write_text(content) + rendered += 1 + else: + dst = dst_root / rel + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + copied += 1 + print(f'Template repo: rendered {rendered} .j2 file(s), copied {copied} file(s)') + PYEOF + else + echo 'No TEMPLATE_REPO specified — skipping template rendering' + fi + + # --- 
Write extra config files (JSON map path -> content); override template output --- + - | + python3 -c " + import json, os, pathlib + files = json.loads(os.environ.get('EXTRA_FILES', '{}')) + for path, content in files.items(): + p = pathlib.Path(path) + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + print(f'Wrote {len(files)} extra file(s)') + " + + # --- Commit and push --- + - git add -A + - | + git -c user.email="sc-automation@census.gov" \ + -c user.name="SC Automation" \ + commit -m "SC propose: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ + --allow-empty + - git push origin "${GIT_BRANCH}" + + # --- Open PR (idempotent: skip if PR already exists for this branch) --- + - | + GH_HOST=github.e.it.census.gov \ + GH_TOKEN="${GITHUB_TOKEN}" \ + gh pr create \ + --title "SC propose: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ + --body "Automated proposal from Service Catalog. Review and merge, then launch the **Apply** product to run \`tf-run apply\`." \ + --base main \ + --head "${GIT_BRANCH}" \ + || echo "PR already exists or create failed — continuing" + + post_build: + commands: + - echo "BUILD_RESULT=${CODEBUILD_BUILD_SUCCEEDING}" + - | + PR_URL=$(GH_HOST=github.e.it.census.gov \ + GH_TOKEN="${GITHUB_TOKEN}" \ + gh pr view \ + --repo "${GITHUB_ORG}/${ACCOUNT_REPO}" \ + "${GIT_BRANCH}" \ + --json url -q .url 2>/dev/null || echo "") + echo "PR_URL=${PR_URL}" diff --git a/buildspec.yml b/buildspec.yml deleted file mode 100644 index f3029ba..0000000 --- a/buildspec.yml +++ /dev/null @@ -1,225 +0,0 @@ -version: 0.2 - -# --------------------------------------------------------------------------- -# tf-run-executor buildspec -# -# Required env-var overrides per build (supplied by Lambda or manual CLI): -# ACCOUNT_REPO - account repo name, e.g. 
229685449397-csvd-dev-platform-dev-gov -# LAYER - terraform layer: common | infrastructure | vpc -# REGION_DIR - region directory: east | west -# GITHUB_TOKEN - GHE PAT (type PLAINTEXT, value from Secrets Manager) -# -# Optional env-var overrides: -# GIT_BRANCH - branch to commit/PR from (default: repo-init) -# TF_RUN_START_TAG - tf-run.data TAG label to start from (default: empty = from top) -# TEMPLATE_REPO - GHE repo containing Jinja2/.tf template files (default: empty) -# TEMPLATE_VARS - JSON map of Jinja2 variables for template rendering (default: {}) -# EXTRA_FILES - JSON map {"relative/path": "content"} written after template rendering -# DRY_RUN - "true" = tf plan only, no apply (default: "false") -# TARGET_ACCOUNT_ID - AWS account ID to assume role in before running tf-run -# (default: empty = run with CodeBuild's own credentials, -# i.e. csvd-dev. Set this when targeting a different account.) -# --------------------------------------------------------------------------- - -env: - variables: - GITHUB_ORG: "SCT-Engineering" - # S3 prefixes — filenames are resolved at build time from terraform/support VERSION files. - # The S3 bucket must contain the version pinned in terraform/support (keep in sync). 
- TF_BINARY_S3_PREFIX: "s3://csvd-packer-pipeline-assets/terraform" - GH_CLI_S3_PREFIX: "s3://csvd-packer-pipeline-assets/tools" - CENSUS_CA_S3: "s3://csvd-packer-pipeline-assets/certs/census-ca.pem" - # Org-canonical version governance: clone this repo to read VERSION files - TERRAFORM_SUPPORT_REPO: "terraform/support" - HTTPS_PROXY: "http://proxy.tco.census.gov:3128" - NO_PROXY: "github.e.it.census.gov,169.254.169.254,169.254.170.2" - # Per-build defaults (overridden via environmentVariablesOverride in Lambda) - GIT_BRANCH: "repo-init" - DRY_RUN: "false" - TF_RUN_START_TAG: "" - TEMPLATE_REPO: "" - TEMPLATE_VARS: "{}" - EXTRA_FILES: "{}" - TARGET_ACCOUNT_ID: "" - -phases: - install: - commands: - # --- Version governance: clone terraform/support to read org-canonical versions --- - # This repo (github.e.it.census.gov/terraform/support) is the single source of truth - # for which Terraform and gh CLI versions the org has blessed. We read VERSION files - # from it rather than hardcoding versions here. - - git clone --depth 1 "https://${GITHUB_TOKEN}@github.e.it.census.gov/${TERRAFORM_SUPPORT_REPO}.git" /tmp/tf-support - - export TF_VERSION=$(cat /tmp/tf-support/terraform/VERSION) - - export GH_VERSION=$(cat /tmp/tf-support/github-cli-releases/VERSION) - - echo "Using Terraform ${TF_VERSION}, gh CLI ${GH_VERSION}" - - # --- Terraform binary (registry.terraform.io is blocked on Census network; use S3) --- - # S3 bucket must contain the version pinned in terraform/support/terraform/VERSION. 
- - aws s3 cp "${TF_BINARY_S3_PREFIX}/terraform_${TF_VERSION}_linux_amd64.zip" /tmp/terraform.zip - - unzip -o /tmp/terraform.zip -d /usr/local/bin/ && chmod +x /usr/local/bin/terraform - - ln -sf /usr/local/bin/terraform /usr/local/bin/tf - - # --- Census CA certificate (required for TLS to github.e.it.census.gov) --- - - aws s3 cp "$CENSUS_CA_S3" /etc/pki/ca-trust/source/anchors/census-ca.pem - - update-ca-trust extract - - # --- tf-run toolchain (sourced from terraform/support, already cloned above) --- - # Canonical versions live in terraform/support local-app/ — no copies kept in this repo. - - cp /tmp/tf-support/local-app/tf-run/tf-run.sh /usr/local/bin/tf-run - - cp /tmp/tf-support/local-app/tf-control/tf-control.sh /usr/local/bin/tf-control.sh - - cp /tmp/tf-support/local-app/tf-directory-setup/tf-directory-setup.py /usr/local/bin/tf-directory-setup.py - - chmod +x /usr/local/bin/tf-run /usr/local/bin/tf-control.sh /usr/local/bin/tf-directory-setup.py - # Create tf-{action} symlinks expected by tf-run and account repo steps - - > - for action in init plan apply destroy refresh output validate import state fmt taint console; do - ln -sf /usr/local/bin/tf-control.sh /usr/local/bin/tf-${action}; - done - - # --- Python deps for tf-directory-setup.py and template rendering --- - - pip3 install --quiet jinja2 python-dateutil pyyaml - - # --- gh CLI (S3 bucket must contain the version pinned in terraform/support) --- - - aws s3 cp "${GH_CLI_S3_PREFIX}/gh_${GH_VERSION}_linux_amd64.tar.gz" /tmp/gh.tar.gz - - mkdir -p /tmp/gh-cli - - tar -xzf /tmp/gh.tar.gz -C /tmp/gh-cli --strip-components=1 - - cp /tmp/gh-cli/bin/gh /usr/local/bin/gh && chmod +x /usr/local/bin/gh - - build: - commands: - # --- Configure git to rewrite SSH URLs to HTTPS --- - # Account repos reference Terraform modules via ssh://git@github.e.it.census.gov/... - # This rewrite makes those module fetches work transparently via HTTPS + PAT, - # avoiding the need for a per-repo deploy key. 
- - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "ssh://git@github.e.it.census.gov/" - - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "git@github.e.it.census.gov:" - - # --- Clone account repo --- - - git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" repo - - cd repo - - git checkout -B "${GIT_BRANCH}" - - # --- Render template repo (if specified) into account repo --- - # Clone TEMPLATE_REPO, render .j2 files with TEMPLATE_VARS via Jinja2, - # copy non-template files as-is. Results land in the account repo tree - # at the same relative paths. EXTRA_FILES applied afterwards can override. - - | - if [ -n "${TEMPLATE_REPO}" ]; then - git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${TEMPLATE_REPO}.git" /tmp/template-repo - python3 - <<'PYEOF' - import json, os, pathlib, shutil - from jinja2 import Environment, FileSystemLoader, StrictUndefined - - template_vars = json.loads(os.environ.get('TEMPLATE_VARS', '{}')) - src_root = pathlib.Path('/tmp/template-repo') - dst_root = pathlib.Path('.') # already inside cloned account repo - - rendered = 0 - copied = 0 - for src in src_root.rglob('*'): - if src.is_dir() or any(part.startswith('.git') for part in src.parts): - continue - rel = src.relative_to(src_root) - if src.suffix == '.j2': - # Render Jinja2 template; strip .j2 extension in destination - dst = dst_root / rel.with_suffix('') - dst.parent.mkdir(parents=True, exist_ok=True) - env = Environment( - loader=FileSystemLoader(str(src.parent)), - undefined=StrictUndefined, - keep_trailing_newline=True, - ) - content = env.get_template(src.name).render(**template_vars) - dst.write_text(content) - rendered += 1 - else: - dst = dst_root / rel - dst.parent.mkdir(parents=True, exist_ok=True) - shutil.copy2(src, dst) - copied += 1 - print(f'Template repo: rendered {rendered} .j2 file(s), copied {copied} file(s)') - PYEOF - else - echo 
'No TEMPLATE_REPO specified — skipping template rendering' - fi - - # --- Write extra config files passed in from Lambda (JSON map path -> content) --- - # Applied after template rendering; keys here override template output. - - | - python3 -c " - import json, os, pathlib - files = json.loads(os.environ.get('EXTRA_FILES', '{}')) - for path, content in files.items(): - p = pathlib.Path(path) - p.parent.mkdir(parents=True, exist_ok=True) - p.write_text(content) - print(f'Wrote {len(files)} extra file(s)') - " - - # --- Commit and push (--allow-empty handles no-change case) --- - - git add -A - - | - git -c user.email="sc-automation@census.gov" \ - -c user.name="SC Automation" \ - commit -m "SC automation: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ - --allow-empty - - git push origin "${GIT_BRANCH}" - - # --- Assume cross-account role (if TARGET_ACCOUNT_ID is set) --- - # CodeBuild runs in csvd-dev by default. To run tf-run apply against resources - # in a different AWS account, set TARGET_ACCOUNT_ID. The role - # sc-automation-codebuild-role must exist in that account and trust the - # CodeBuild IAM role from csvd-dev. 
- - | - if [ -n "${TARGET_ACCOUNT_ID}" ]; then - PARTITION=$(aws sts get-caller-identity --query Arn --output text | cut -d: -f2) - ROLE_ARN="arn:${PARTITION}:iam::${TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role" - echo "Assuming cross-account role: ${ROLE_ARN}" - CREDS=$(aws sts assume-role \ - --role-arn "${ROLE_ARN}" \ - --role-session-name "sc-automation-${ACCOUNT_REPO}" \ - --query Credentials \ - --output json) - export AWS_ACCESS_KEY_ID=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['AccessKeyId'])") - export AWS_SECRET_ACCESS_KEY=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['SecretAccessKey'])") - export AWS_SESSION_TOKEN=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['SessionToken'])") - echo "Successfully assumed role in account ${TARGET_ACCOUNT_ID}" - else - echo "No TARGET_ACCOUNT_ID set — running with CodeBuild role (csvd-dev)" - fi - - # --- Run Terraform in target layer/region directory --- - # tf-run auto-proceeds on non-TTY stdin (read -t timeout defaults to "y") - - cd "${LAYER}/${REGION_DIR}" - - | - if [ "${DRY_RUN}" = "true" ]; then - tf-plan -no-color - elif [ -n "${TF_RUN_START_TAG}" ]; then - TFARGS="-auto-approve" tf-run apply "tag:${TF_RUN_START_TAG}" - else - TFARGS="-auto-approve" tf-run apply - fi - - # --- Open PR (idempotent: skip if PR already exists) --- - - | - GH_HOST=github.e.it.census.gov \ - GH_TOKEN="${GITHUB_TOKEN}" \ - gh pr create \ - --title "SC automation: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ - --body "Triggered by Service Catalog provisioning of **${ACCOUNT_REPO}**." 
\ - --base main \ - --head "${GIT_BRANCH}" \ - || echo "PR already exists or create failed, continuing" - - post_build: - commands: - - echo "BUILD_RESULT=${CODEBUILD_BUILD_SUCCEEDING}" - # Emit PR_URL so Lambda can parse it from the build output - - | - PR_URL=$(GH_HOST=github.e.it.census.gov \ - GH_TOKEN="${GITHUB_TOKEN}" \ - gh pr view \ - --repo "${GITHUB_ORG}/${ACCOUNT_REPO}" \ - "${GIT_BRANCH}" \ - --json url -q .url 2>/dev/null || echo "") - echo "PR_URL=${PR_URL}" diff --git a/buildspec.yml.j2 b/buildspec.yml.j2 deleted file mode 100644 index fd4ea2c..0000000 --- a/buildspec.yml.j2 +++ /dev/null @@ -1,93 +0,0 @@ -version: 0.2 - -env: - variables: - PACKER_TEMPLATE_FILE: "{{ packer_template_file }}" - AWS_REGION: "{{ aws_region }}" - ECR_REPOSITORY: "{{ ecr_repository }}" - AWS_ACCOUNT_ID: "{{ aws_account_id }}" - {% if environment_variables %} - {% for key, value in environment_variables.items() %} - {{ key }}: "{{ value }}" - {% endfor %} - {% endif %} - -phases: - install: - commands: - - echo "Installing Packer and dependencies for Service Catalog Lambda build..." - {% if tools %} - {% for tool in tools %} - - echo "Installing {{ tool.name }} version {{ tool.version }}..." - - aws s3 cp s3://{{ assets_bucket }}/{{ tool.zip_path }} /tmp/{{ tool.zip_path }} - - unzip -o /tmp/{{ tool.zip_path }} -d /tmp/{{ tool.name }} - - chmod +x /tmp/{{ tool.name }}/{{ tool.binary_name }} - - mv /tmp/{{ tool.name }}/{{ tool.binary_name }} {{ tool.install_path }}/ - - {{ tool.binary_name }} version - {% endfor %} - {% endif %} - - echo "Packer installation complete" - - pre_build: - commands: - - echo "Initializing Packer plugins for Lambda container build..." - - packer init ${PACKER_TEMPLATE_FILE} - - echo "Packer plugins initialized successfully" - - build: - commands: - - echo "Building Service Catalog Lambda container image..." - - # Get ECR login credentials - - echo "Logging into ECR..." 
- - aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com - - # Build repository URI for ECR - - | - if [ -n "$ECR_REPOSITORY" ]; then - REPOSITORY_URI="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY}" - else - echo "ERROR: ECR_REPOSITORY is required for container builds" - exit 1 - fi - - # Set image tag - - | - if [ -n "$IMAGE_VERSION_TAG" ]; then - TAG="$IMAGE_VERSION_TAG" - elif [ -n "$IMAGE_TAG" ]; then - TAG="$IMAGE_TAG" - else - TAG="latest" - fi - - # Get ECR credentials for Packer - - ECR_USERNAME="AWS" - - ECR_PASSWORD=$(aws ecr get-login-password --region ${AWS_REGION}) - - ECR_LOGIN_SERVER="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com" - - # Use the cloned base image from ECR instead of public ECR - - BASE_IMAGE="{{ base_image }}" - - - echo "Building with repository_uri=$REPOSITORY_URI tag=$TAG base_image=$BASE_IMAGE" - - # Run Packer build with required variables for Lambda container - - | - packer build \ - -var "repository_uri=$REPOSITORY_URI" \ - -var "tag=$TAG" \ - -var "base_image=$BASE_IMAGE" \ - -var "ecr_login_username=$ECR_USERNAME" \ - -var "ecr_login_password=$ECR_PASSWORD" \ - -var "ecr_login_server=$ECR_LOGIN_SERVER" \ - ${PACKER_TEMPLATE_FILE} - - post_build: - commands: - - echo "Service Catalog Lambda container image build completed successfully" - - echo "Image pushed to $REPOSITORY_URI:$TAG" - - echo "Lambda function is ready for deployment with EventBridge and Service Catalog" - -artifacts: - files: - - '**/*' diff --git a/deploy/codebuild.tf b/deploy/codebuild.tf index 1c198e7..74d18d5 100644 --- a/deploy/codebuild.tf +++ b/deploy/codebuild.tf @@ -15,10 +15,105 @@ data "aws_secretsmanager_secret_version" "ghe_token" { secret_id = "ghe-runner/github-token" } +resource "aws_codebuild_project" "tf_run_proposer" { + name = "tf-run-proposer" + description = "Clone account repo, render templates, commit, push, 
open PR — no Terraform execution" + build_timeout = 15 # minutes — fast, no tf-run + service_role = aws_iam_role.codebuild_exec.arn + + artifacts { + type = "NO_ARTIFACTS" + } + + environment { + compute_type = "BUILD_GENERAL1_SMALL" + image = "aws/codebuild/amazonlinux2023-x86_64-standard:4.0" + type = "LINUX_CONTAINER" + privileged_mode = false + + environment_variable { + name = "GITHUB_ORG" + value = var.github_org + } + environment_variable { + name = "CENSUS_CA_S3" + value = var.census_ca_s3 + } + environment_variable { + name = "GH_CLI_S3_PREFIX" + value = var.gh_cli_s3_prefix + } + environment_variable { + name = "HTTPS_PROXY" + value = var.https_proxy + } + environment_variable { + name = "NO_PROXY" + value = "github.e.it.census.gov,169.254.169.254,169.254.170.2" + } + # Placeholder values — always overridden by Lambda per-build + environment_variable { + name = "ACCOUNT_REPO" + value = "OVERRIDE_PER_BUILD" + } + environment_variable { + name = "LAYER" + value = "OVERRIDE_PER_BUILD" + } + environment_variable { + name = "REGION_DIR" + value = "OVERRIDE_PER_BUILD" + } + environment_variable { + name = "GITHUB_TOKEN" + type = "SECRETS_MANAGER" + value = var.github_token_secret_name + } + environment_variable { + name = "GIT_BRANCH" + value = "propose/sc-automation" + } + environment_variable { + name = "TEMPLATE_REPO" + value = "" + } + environment_variable { + name = "TEMPLATE_VARS" + value = "{}" + } + environment_variable { + name = "EXTRA_FILES" + value = "{}" + } + } + + source { + type = "GITHUB_ENTERPRISE" + location = var.source_repo_url + buildspec = "buildspec-proposer.yml" + git_clone_depth = 1 + } + + logs_config { + cloudwatch_logs { + group_name = "/aws/codebuild/tf-run-proposer" + stream_name = "" + status = "ENABLED" + } + } + + tags = { + Project = "sc-automation" + ManagedBy = "terraform" + } + + depends_on = [aws_codebuild_source_credential.ghe] +} + resource "aws_codebuild_project" "tf_run_executor" { name = "tf-run-executor" - 
description = "Clones account repo, writes config files, runs tf-run, opens PR" - build_timeout = 60 # minutes + description = "Clone account repo main branch, assume cross-account role, run tf-run apply" + build_timeout = 60 # minutes — tf-run apply can be slow service_role = aws_iam_role.codebuild_exec.arn artifacts { @@ -31,7 +126,6 @@ resource "aws_codebuild_project" "tf_run_executor" { type = "LINUX_CONTAINER" privileged_mode = false - # --- Static defaults (overridden per-build via environmentVariablesOverride) --- environment_variable { name = "GITHUB_ORG" value = var.github_org @@ -75,27 +169,23 @@ resource "aws_codebuild_project" "tf_run_executor" { value = var.github_token_secret_name } environment_variable { - name = "GIT_BRANCH" - value = "repo-init" - } - environment_variable { - name = "DRY_RUN" - value = "false" + name = "TARGET_ACCOUNT_ID" + value = "" } environment_variable { name = "TF_RUN_START_TAG" value = "" } environment_variable { - name = "EXTRA_FILES" - value = "{}" + name = "DRY_RUN" + value = "false" } } source { type = "GITHUB_ENTERPRISE" location = var.source_repo_url - buildspec = "buildspec.yml" + buildspec = "buildspec-executor.yml" git_clone_depth = 1 } diff --git a/deploy/iam.tf b/deploy/iam.tf index 0398048..7b0df34 100644 --- a/deploy/iam.tf +++ b/deploy/iam.tf @@ -127,7 +127,7 @@ data "aws_iam_policy_document" "codebuild_exec" { ] } - # CloudWatch Logs: write build output + # CloudWatch Logs: write build output for both proposer and executor projects statement { sid = "CloudWatchLogsWrite" effect = "Allow" @@ -137,6 +137,8 @@ data "aws_iam_policy_document" "codebuild_exec" { "logs:PutLogEvents", ] resources = [ + "arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:log-group:/aws/codebuild/tf-run-proposer", + 
"arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:log-group:/aws/codebuild/tf-run-proposer:*", "arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:log-group:/aws/codebuild/tf-run-executor", "arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:log-group:/aws/codebuild/tf-run-executor:*", ] @@ -154,9 +156,21 @@ data "aws_iam_policy_document" "codebuild_exec" { "codebuild:BatchPutCodeCoverages", ] resources = [ + "arn:${data.aws_partition.current.partition}:codebuild:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:report-group/tf-run-proposer-*", "arn:${data.aws_partition.current.partition}:codebuild:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:report-group/tf-run-executor-*", ] } + + # STS: allow executor to assume a cross-account role in target accounts + # Only the executor needs this; proposer only needs GHE access. 
+ statement { + sid = "StsAssumeRoleCrossAccount" + effect = "Allow" + actions = ["sts:AssumeRole"] + resources = [ + "arn:${data.aws_partition.current.partition}:iam::*:role/sc-automation-codebuild-role", + ] + } } resource "aws_iam_role_policy" "codebuild_exec" { diff --git a/deploy/lambda.tf b/deploy/lambda.tf index 602a46d..3747520 100644 --- a/deploy/lambda.tf +++ b/deploy/lambda.tf @@ -43,7 +43,8 @@ resource "aws_lambda_function" "tf_run_trigger" { environment { variables = { - CODEBUILD_PROJECT_NAME = aws_codebuild_project.tf_run_executor.name + PROPOSER_PROJECT_NAME = aws_codebuild_project.tf_run_proposer.name + EXECUTOR_PROJECT_NAME = aws_codebuild_project.tf_run_executor.name GITHUB_TOKEN_SECRET_NAME = var.github_token_secret_name GITHUB_API = var.github_api GITHUB_ORG_NAME = var.github_org diff --git a/deploy/service_catalog.tf b/deploy/service_catalog.tf index 0f55721..ba746fd 100644 --- a/deploy/service_catalog.tf +++ b/deploy/service_catalog.tf @@ -1,16 +1,31 @@ locals { - product_s3_key = "tf-run-executor/v${var.product_version}/product-template.yaml" - template_url = "https://${var.artifacts_bucket_name}.s3.${data.aws_region.current.name}.amazonaws.com/${local.product_s3_key}" + proposer_s3_key = "tf-run-proposer/v${var.product_version}/proposer-template.yaml" + executor_s3_key = "tf-run-executor/v${var.product_version}/executor-template.yaml" + proposer_url = "https://${var.artifacts_bucket_name}.s3.${data.aws_region.current.name}.amazonaws.com/${local.proposer_s3_key}" + executor_url = "https://${var.artifacts_bucket_name}.s3.${data.aws_region.current.name}.amazonaws.com/${local.executor_s3_key}" } # --------------------------------------------------------------------------- -# Upload product template to the centrally-managed SC artifacts bucket +# Upload product templates to the centrally-managed SC artifacts bucket # --------------------------------------------------------------------------- -resource "aws_s3_object" "product_template" { 
+resource "aws_s3_object" "proposer_template" { bucket = var.artifacts_bucket_name - key = local.product_s3_key - source = "${path.module}/../service-catalog/product-template.yaml" - etag = filemd5("${path.module}/../service-catalog/product-template.yaml") + key = local.proposer_s3_key + source = "${path.module}/../service-catalog/proposer-template.yaml" + etag = filemd5("${path.module}/../service-catalog/proposer-template.yaml") + + tags = { + "servicecatalog:provisioning" = "true" + Project = "sc-automation" + ManagedBy = "terraform" + } +} + +resource "aws_s3_object" "executor_template" { + bucket = var.artifacts_bucket_name + key = local.executor_s3_key + source = "${path.module}/../service-catalog/executor-template.yaml" + etag = filemd5("${path.module}/../service-catalog/executor-template.yaml") tags = { "servicecatalog:provisioning" = "true" @@ -34,18 +49,40 @@ resource "aws_servicecatalog_portfolio" "this" { } # --------------------------------------------------------------------------- -# Product +# Products # --------------------------------------------------------------------------- -resource "aws_servicecatalog_product" "tf_run" { +resource "aws_servicecatalog_product" "tf_run_proposer" { + name = "${var.portfolio_name_prefix}-tf-run-proposer" + owner = "CSVD Platform Engineering" + description = "Render templates, write config files, and open a PR for human review. Run this before the Apply product." 
+ type = "CLOUD_FORMATION_TEMPLATE" + + provisioning_artifact_parameters { + name = "v${var.product_version}" + description = "Version ${var.product_version}" + template_url = local.proposer_url + type = "CLOUD_FORMATION_TEMPLATE" + disable_template_validation = false + } + + tags = { + Project = "sc-automation" + ManagedBy = "terraform" + } + + depends_on = [aws_s3_object.proposer_template] +} + +resource "aws_servicecatalog_product" "tf_run_executor" { name = "${var.portfolio_name_prefix}-tf-run-executor" owner = "CSVD Platform Engineering" - description = "Trigger tf-run in an account repo layer via CodeBuild. Writes extra config files, applies Terraform, and opens a PR." + description = "Run tf-run apply in an account repo layer. Use after the Proposer PR has been reviewed and merged." type = "CLOUD_FORMATION_TEMPLATE" provisioning_artifact_parameters { name = "v${var.product_version}" description = "Version ${var.product_version}" - template_url = local.template_url + template_url = local.executor_url type = "CLOUD_FORMATION_TEMPLATE" disable_template_validation = false } @@ -55,15 +92,20 @@ resource "aws_servicecatalog_product" "tf_run" { ManagedBy = "terraform" } - depends_on = [aws_s3_object.product_template] + depends_on = [aws_s3_object.executor_template] } # --------------------------------------------------------------------------- -# Associate product with portfolio +# Associate both products with the portfolio # --------------------------------------------------------------------------- -resource "aws_servicecatalog_product_portfolio_association" "this" { +resource "aws_servicecatalog_product_portfolio_association" "proposer" { portfolio_id = aws_servicecatalog_portfolio.this.id - product_id = aws_servicecatalog_product.tf_run.id + product_id = aws_servicecatalog_product.tf_run_proposer.id +} + +resource "aws_servicecatalog_product_portfolio_association" "executor" { + portfolio_id = aws_servicecatalog_portfolio.this.id + product_id = 
aws_servicecatalog_product.tf_run_executor.id } # --------------------------------------------------------------------------- @@ -78,11 +120,11 @@ resource "aws_servicecatalog_principal_portfolio_association" "this" { } # --------------------------------------------------------------------------- -# Launch constraint role — assumed by CFN when launching the product +# Launch constraint role — shared by both products (same Lambda target) # --------------------------------------------------------------------------- resource "aws_iam_role" "sc_launch" { name = "${var.portfolio_name_prefix}-sc-launch-role" - description = "Role assumed by Service Catalog when launching tf-run-executor product" + description = "Role assumed by Service Catalog when launching proposer or executor product" assume_role_policy = jsonencode({ Version = "2012-10-17" @@ -149,14 +191,27 @@ resource "aws_iam_role_policy" "sc_launch" { }) } -resource "aws_servicecatalog_constraint" "launch" { +resource "aws_servicecatalog_constraint" "proposer_launch" { portfolio_id = aws_servicecatalog_portfolio.this.id - product_id = aws_servicecatalog_product.tf_run.id + product_id = aws_servicecatalog_product.tf_run_proposer.id type = "LAUNCH" parameters = jsonencode({ RoleArn = aws_iam_role.sc_launch.arn }) - description = "Launch constraint — uses a dedicated role to invoke the Lambda" + description = "Launch constraint — uses a dedicated role to invoke the Lambda (proposer)" } + +resource "aws_servicecatalog_constraint" "executor_launch" { + portfolio_id = aws_servicecatalog_portfolio.this.id + product_id = aws_servicecatalog_product.tf_run_executor.id + type = "LAUNCH" + + parameters = jsonencode({ + RoleArn = aws_iam_role.sc_launch.arn + }) + + description = "Launch constraint — uses a dedicated role to invoke the Lambda (executor)" +} + diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md index 284148e..fe6d117 100644 --- a/docs/HOW-IT-WORKS.md +++ b/docs/HOW-IT-WORKS.md @@ -1,30 +1,34 @@ # How 
sc-lambda-ghactions Works -This document explains the complete end-to-end flow of the SC Lambda + CodeBuild -automation system — from a user filling out a Service Catalog form to Terraform -running inside an AWS account repository. +This document explains the end-to-end flow of the SC Lambda + CodeBuild +automation system — from a user filling out a Service Catalog form through +to a Terraform plan or apply running inside an AWS account repository. --- -## What This System Does +## Design Overview: Two-Product Model -This system provides a **managed execution environment for arbitrary Terraform -configurations**. Teams maintain reusable template repositories on GitHub -Enterprise (GHE). When a user provisions a product through AWS Service Catalog, -this system: +The system is split into **two distinct Service Catalog products** with a human +review gate between them: -1. Accepts configuration data from the user via a product form -2. Clones a team-owned template repository -3. Renders the templates using the user's configuration data -4. Injects the rendered files into the correct location in the target AWS account - repository -5. Runs `tf-run` (the Census Terraform toolchain) to apply the changes -6. Opens a pull request for review -7. Reports success or failure back to CloudFormation +| Product | CodeBuild Project | What It Does | +|---------|------------------|--------------| +| `tf-run-proposer` | `tf-run-proposer` | Clone repo → render templates → commit → open PR | +| `tf-run-executor` | `tf-run-executor` | Clone `main` → assume role → run `tf-run apply` | -The goal is a single, centrally-operated execution platform that any team can -drive by writing a template repo and defining a Service Catalog product form — -without needing to operate their own build infrastructure. +**Why two products?** + +An earlier single-product design ran `tf-run apply` first and then opened a PR +as a trailing artifact. 
This made the PR meaningless as a review gate — Terraform +had already changed real infrastructure before anyone saw the diff. + +The two-product model restores the PR as a genuine gate: + +1. A team provisions the **Proposer** → changes are committed to a branch and a PR + is opened. No infrastructure is touched. CFN stack completes quickly (< 60s). +2. A human reviews the diff, approves, and merges the PR. +3. The team provisions the **Executor** → CodeBuild checks out `main` (post-merge), + assumes the target account role, and runs `tf-run apply`. --- @@ -32,683 +36,280 @@ without needing to operate their own build infrastructure. ``` ┌─────────────────────────────────────────────────────────────────────┐ -│ User fills AWS Service Catalog form (any AWS account in the org) │ -└─────────────────────────────┬───────────────────────────────────────┘ - │ CloudFormation Custom Resource event - ▼ -┌─────────────────────────────────────────────────────────────────────┐ -│ Lambda: tf-run-executor-trigger │ -│ (centralized in csvd-dev, account 229685449397, us-gov-west-1) │ +│ PROPOSE FLOW │ │ │ -│ • Validates all inputs (Pydantic v2) │ -│ • Fetches the GHE PAT from Secrets Manager │ -│ • Starts a CodeBuild build with per-run environment variables │ -│ • Polls CodeBuild every 20 seconds │ -│ • Returns PR URL + repo URL to CloudFormation on completion │ -└─────────────────────────────┬───────────────────────────────────────┘ - │ AWS CodeBuild StartBuild API - ▼ +│ User fills SC form → CFN Custom Resource │ +│ └─> Lambda (tf-run-executor-trigger) │ +│ • Validates inputs (action=propose) │ +│ • Starts tf-run-proposer CodeBuild build │ +│ • Polls CodeBuild, captures PR URL │ +│ • Returns PR URL + repo URL to CFN │ +│ └─> CodeBuild: tf-run-proposer │ +│ • Installs: Census CA cert, gh CLI, Jinja2 │ +│ • Clones account repo │ +│ • Checks out / creates branch propose/sc-automation │ +│ • Renders .j2 templates from TEMPLATE_REPO │ +│ • Writes EXTRA_FILES │ +│ • git commit + git 
push │ +│ • gh pr create (idempotent) │ +│ • POST_BUILD emits PR_URL= │ +└─────────────────────────────────────────────────────────────────────┘ + + ↕ Human reviews PR, approves, merges ↕ + ┌─────────────────────────────────────────────────────────────────────┐ -│ CodeBuild project: tf-run-executor │ -│ (Amazon Linux 2023, 60-minute timeout) │ -│ │ -│ INSTALL phase │ -│ • Downloads Terraform binary from S3 (registry.terraform.io is │ -│ blocked on the Census network) │ -│ • Installs the Census CA cert so GHE TLS works │ -│ • Installs tf-run, tf-control.sh, tf-directory-setup.py from │ -│ terraform/support (already cloned for version governance) │ -│ • Installs the gh CLI from S3 │ -│ │ -│ BUILD phase │ -│ 1. Clone the target account repo over HTTPS │ -│ 2. Check out (or create) the work branch │ -│ 3. Clone the template repo and render Jinja2 templates │ -│ 4. Write any explicit extra files (override layer) │ -│ 5. git commit + push │ -│ 6. Assume cross-account role (if TARGET_ACCOUNT_ID is set) │ -│ 7. cd into // and run tf-run or tf-plan │ -│ 8. 
Open or update a pull request via the gh CLI │ +│ APPLY FLOW │ │ │ -│ POST_BUILD phase │ -│ • Fetch PR URL via gh CLI and emit PR_URL= to build logs │ -│ (Lambda independently calls the GHE API to retrieve the PR URL) │ +│ User fills SC form → CFN Custom Resource │ +│ └─> Lambda (tf-run-executor-trigger) │ +│ • Validates inputs (action=apply) │ +│ • Starts tf-run-executor CodeBuild build │ +│ • Polls CodeBuild until completion │ +│ • Returns apply status + repo URL to CFN │ +│ └─> CodeBuild: tf-run-executor │ +│ • Installs: Terraform binary (from S3), tf-run │ +│ toolchain, Census CA cert, gh CLI, Python deps │ +│ • Clones account repo at main (post-merge) │ +│ • Optionally assumes cross-account IAM role │ +│ • cd {LAYER}/{REGION_DIR} │ +│ • tf-run apply (respects TF_RUN_START_TAG) │ +│ • POST_BUILD emits BUILD_RESULT= │ └─────────────────────────────────────────────────────────────────────┘ ``` --- -## Step-by-Step Walkthrough - -### Step 1 — User Fills the Service Catalog Form - -A user opens the AWS Service Catalog console in any account that belongs to the -Census AWS Organizations org. They find the **tf-run automation** product and fill -in the form: - -| Field | What to put here | -|---|---| -| **Account Repo Name** | The GHE repo that contains the target account's Terraform. Example: `229685449397-csvd-dev-platform-dev-gov` | -| **Terraform Layer** | Which layer of the account repo to operate on: `common`, `infrastructure`, or `vpc` | -| **Region Directory** | `east`, `west`, or `global` — the subdirectory inside the layer. Use `global` for non-regional resources (SSO, IAM, org-level). | -| **Target AWS Account ID** | Optional. The AWS account ID where `tf-run apply` should make changes. When set, CodeBuild assumes `sc-automation-codebuild-role` in that account. Leave blank to run with the CodeBuild role's credentials (only works for resources in csvd-dev). 
| -| **Template Repository** | Name of a GHE repo containing Jinja2 or raw file templates (optional) | -| **Template Variables** | A JSON object of key/value pairs passed to Jinja2 when rendering templates (optional) | -| **Extra Config Files** | A JSON object of `{"relative/path": "file content"}` written directly into the repo, bypassing templates (optional) | -| **Git Branch** | The branch to commit to and open the PR from (default: `repo-init`) | -| **tf-run Start Tag** | A `TAG` label from `tf-run.data` to start execution from; leave blank to run all steps | -| **Dry Run** | Set `true` to run `tf plan` only — no apply, no PR | - -The user clicks **Launch Product**. CloudFormation creates a stack with a -`Custom::TerraformRun` resource. This triggers the Lambda. - -> **Note**: `aws_account_id` and `aws_region` are **not** on the form. CloudFormation -> resolves them automatically with `!Sub "${AWS::AccountId}"` and `!Sub "${AWS::Region}"`. -> Do not add them as user-facing parameters. - ---- - -### Step 2 — Lambda Validates and Dispatches - -The Lambda (`tf-run-executor-trigger`) receives a CloudFormation Custom Resource -event of type `Create` (first provision) or `Update` (stack update). - -**What the Lambda does:** - -1. **Normalizes property names** — CloudFormation sends PascalCase parameter names - (`AccountRepo`). The Lambda converts them to `snake_case` (`account_repo`) before - validation. Properties already in `snake_case` are left as-is. - -2. 
**Validates all inputs** via a Pydantic v2 model (`TfRunRequest`): - - `account_repo` — required, must be a non-empty string - - `layer` — must be one of `common`, `infrastructure`, `vpc` - - `region_dir` — must be one of `east`, `west` - - `target_account_id` — optional; 12-digit AWS account ID or empty string - - `template_repo` — optional string; empty string means no template rendering - - `template_vars` — optional; accepts a JSON string (as CFN sends it) or a dict - - `extra_files` — optional; accepts a JSON string or a dict - - `git_branch` — optional, defaults to `repo-init` - - `tf_run_start_tag` — optional, defaults to empty (run all steps) - - `dry_run` — optional, defaults to `false` - - If validation fails, the Lambda immediately signals CloudFormation `FAILED` - with the validation error as the reason — the build is never started. - -3. **Fetches the GHE PAT** from AWS Secrets Manager (`ghe-runner/github-token`). - -4. **Starts a CodeBuild build** on the `tf-run-executor` project with all the - parameters injected as per-build environment variable overrides: - - | Env var | Source | - |---|---| - | `ACCOUNT_REPO` | `account_repo` field | - | `LAYER` | `layer` field | - | `REGION_DIR` | `region_dir` field | - | `GIT_BRANCH` | `git_branch` field | - | `TF_RUN_START_TAG` | `tf_run_start_tag` field | - | `TARGET_ACCOUNT_ID` | `target_account_id` field | - | `TEMPLATE_REPO` | `template_repo` field | - | `TEMPLATE_VARS` | `template_vars` serialized to JSON string | - | `EXTRA_FILES` | `extra_files` serialized to JSON string | - | `DRY_RUN` | `dry_run` as lowercase string (`"true"` / `"false"`) | - | `GITHUB_TOKEN` | PAT from Secrets Manager (plaintext, not a reference) | - -5. **Polls CodeBuild** by calling `BatchGetBuilds` every 20 seconds until the - build status is no longer `IN_PROGRESS`, or until 60 seconds before the - Lambda timeout (900 seconds total), whichever comes first. - -6. 
On **success**: Calls the GHE API to fetch the open PR URL for the branch and - signals CloudFormation `SUCCESS` with the PR URL, repo URL, and branch name - as outputs. - - On **failure**: Signals CloudFormation `FAILED` with the CodeBuild build ID - and logs URL so the user knows where to look. - -**Delete events** are no-ops — the Lambda signals `SUCCESS` immediately and takes -no action. Terraform changes are not automatically reversed. - ---- - -### Step 3 — CodeBuild: Install Phase - -The `tf-run-executor` CodeBuild project runs on **Amazon Linux 2023**. The install -phase sets up every tool the build needs: - -- **Version governance** — the build first clones - `github.e.it.census.gov/terraform/support` (the org-canonical version registry) - and reads two `VERSION` files from it: - - `terraform/VERSION` → the Terraform version to install - - `github-cli-releases/VERSION` → the gh CLI version to install - - This means Terraform and gh CLI versions are governed centrally in that repo — - updating a VERSION file there automatically affects all future builds here - without any change to this repo. - -- **Terraform binary** — downloaded from S3 at - `s3://csvd-packer-pipeline-assets/terraform/terraform_{VERSION}_linux_amd64.zip`. - `registry.terraform.io` is blocked on the Census network. The S3 bucket must - contain the version pinned in `terraform/support` (kept in sync as an ops task). - After extraction, `tf` is symlinked to the binary. -- **tf-{action} symlinks** — `tf-plan`, `tf-apply`, `tf-init`, etc. are all - symlinks to `tf-control.sh`, which wraps each Terraform operation with logging, - proxy settings, and version pinning from `.tf-control`. -- **Census CA cert** — installed from S3 into the system trust store via - `update-ca-trust`. Required for TLS connections to `github.e.it.census.gov`. 
-- **tf-run toolchain** — `tf-run`, `tf-control.sh`, and `tf-directory-setup.py` - are copied from `terraform/support` (`local-app/` subtree), which is already - cloned during the version governance step above. There are no script copies - bundled in this repo — `terraform/support` is the single source of truth. -- **Python packages** — `jinja2`, `python-dateutil`, `pyyaml` installed for - template rendering and `tf-directory-setup.py`. -- **gh CLI** — downloaded from S3 at - `s3://csvd-packer-pipeline-assets/tools/gh_{VERSION}_linux_amd64.tar.gz`, - using the version read from `terraform/support`. - ---- - -### Step 4 — CodeBuild: Clone Account Repo - -```bash -# Rewrite SSH → HTTPS so Terraform module sources work without deploy keys -git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "ssh://git@github.e.it.census.gov/" -git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "git@github.e.it.census.gov:" - -git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" repo -cd repo -git checkout -B "${GIT_BRANCH}" -``` - -The account repo is cloned over HTTPS with the GHE PAT embedded in the URL. -GHE is on the Census internal network; no proxy is required (it is in `NO_PROXY`). - -The **SSH→HTTPS git URL rewrite** handles a key problem: account repos reference -Terraform modules via SSH (`git::ssh://github.e.it.census.gov/...` or -`git@github.e.it.census.gov:...`). When `tf-run apply` executes, Terraform fetches -those modules — and without an SSH key in CodeBuild, those fetches would fail. -The global git rewrite transparently redirects all SSH-form GHE URLs to HTTPS + -PAT, so all module sources resolve without needing a per-repo deploy key. +## Infrastructure Overview -`git checkout -B` creates the branch if it does not exist, or resets it to the -current `HEAD` if it does. Subsequent pushes will force-update this branch. 
+| Resource | Name | Account / Location | +|---|---|---| +| Lambda | `tf-run-executor-trigger` | csvd-dev (`229685449397`), `us-gov-west-1` | +| CodeBuild (proposer) | `tf-run-proposer` | csvd-dev | +| CodeBuild (executor) | `tf-run-executor` | csvd-dev | +| SC Portfolio | `{prefix}-tf-run` | csvd-dev | +| SC Product (propose) | `{prefix}-tf-run-proposer` | csvd-dev | +| SC Product (apply) | `{prefix}-tf-run-executor` | csvd-dev | +| CFN Template (propose) | `service-catalog/proposer-template.yaml` | S3 artifacts bucket | +| CFN Template (apply) | `service-catalog/executor-template.yaml` | S3 artifacts bucket | +| Launch Role | `{prefix}-sc-launch-role` | csvd-dev | +| GHE PAT | `ghe-runner/github-token` in Secrets Manager | csvd-dev | +| Cross-account role | `sc-automation-codebuild-role` | **Target** account | --- -### Step 5 — Template Rendering (if a template repo is specified) +## Step-by-Step: Propose Flow -If `TEMPLATE_REPO` is set, the build clones that repo and renders its contents -into the account repo: +### 1. User fills the SC form -```bash -git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${TEMPLATE_REPO}.git" /tmp/template-repo -``` +The user opens the **tf-run-proposer** product in the Service Catalog console and +provides: -The Python rendering script then walks every file in the template repo: +- **AccountRepo** — the account repo name (e.g. `229685449397-csvd-dev-platform-dev-gov`) +- **Layer** — `common`, `infrastructure`, or `vpc` +- **RegionDir** — `east`, `west`, or `global` +- **GitBranch** — branch to commit to (default: `propose/sc-automation`) +- **TemplateRepo** _(optional)_ — GHE repo containing `.j2` Jinja2 template files +- **TemplateVars** _(optional)_ — JSON dict of values passed to Jinja2 +- **ExtraFiles** _(optional)_ — JSON dict of `{ "path": "content" }` written directly -- **`.j2` files** are rendered as Jinja2 templates with `TEMPLATE_VARS` as the - variable context. 
`StrictUndefined` is used, meaning the build **fails** if - a template references a variable that was not provided. The `.j2` extension is - stripped from the output filename. -- **All other files** are copied as-is, preserving directory structure. +### 2. CloudFormation invokes the Lambda -All output files land in the account repo at the same relative paths they had -in the template repo. +CFN creates a `Custom::TerraformPropose` resource with `action: propose`. -**Example**: A template repo with this layout: - -``` -vpc/west/vpc.tf.j2 -vpc/west/README.md -``` +### 3. Lambda validates and starts CodeBuild -...and `TEMPLATE_VARS = {"vpc_cidr": "10.0.0.0/16"}` would write: +`TfRunRequest` is validated by Pydantic. Lambda starts `tf-run-proposer` with these +per-build environment variable overrides: ``` -repo/vpc/west/vpc.tf ← rendered from vpc.tf.j2 -repo/vpc/west/README.md ← copied verbatim +ACCOUNT_REPO, LAYER, REGION_DIR, GIT_BRANCH, +TEMPLATE_REPO, TEMPLATE_VARS, EXTRA_FILES, GITHUB_TOKEN ``` -If `TEMPLATE_REPO` is empty or not set, this step is skipped entirely. +### 4. CodeBuild - INSTALL phase ---- +- Clones `github.e.it.census.gov/terraform/support` for version governance +- Downloads and installs `gh` CLI from S3 (version governed by `VERSION_GH`) +- Downloads and installs Census CA cert from S3 → `update-ca-trust` +- `pip3 install jinja2` +- **Does NOT install Terraform** — no infrastructure changes happen in this build -### Step 6 — Extra Files (direct file injection) +### 5. CodeBuild - BUILD phase -After template rendering, any key/value pairs in `EXTRA_FILES` are written -directly to the account repo: +1. Rewrite git remote URLs (`ssh://` → `https://`) using the GHE PAT +2. `git clone` the account repo; `git checkout -B ${GIT_BRANCH}` +3. If `TEMPLATE_REPO` is set: + - Clone the template repo + - Walk all files; render `.j2` files with Jinja2 (`StrictUndefined`) + - Copy rendered + non-template files into account repo at same relative paths +4. 
If `EXTRA_FILES` is non-empty: + - Parse the JSON dict; write each `path → content` entry directly (overrides templates) +5. `git add -A && git commit -m "feat: sc-automation propose" --allow-empty` +6. `git push origin ${GIT_BRANCH} --force-with-lease` +7. `gh pr create --base main --head ${GIT_BRANCH} --title "..." --body "..."` (idempotent — skips if PR already exists) -```python -files = json.loads(os.environ.get('EXTRA_FILES', '{}')) -for path, content in files.items(): - pathlib.Path(path).parent.mkdir(parents=True, exist_ok=True) - pathlib.Path(path).write_text(content) -``` +### 6. CodeBuild - POST_BUILD phase -Because this runs **after** template rendering, `EXTRA_FILES` can override -any file that was produced by the template repo. Use this for one-off -customizations that do not belong in the reusable template. - ---- +Emits `PR_URL=` to stdout for Lambda to capture. -### Step 7 — Commit and Push +### 7. Lambda polls and returns -```bash -git add -A -git -c user.email="sc-automation@census.gov" \ - -c user.name="SC Automation" \ - commit -m "SC automation: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ - --allow-empty -git push origin "${GIT_BRANCH}" -``` +Lambda polls CodeBuild every 20 s. On `SUCCEEDED`: +- Captures the PR URL from the `PR_URL=` line emitted in POST_BUILD (the `gh pr view` output) +- Sends CFN `SUCCESS` with: + - `PullRequestUrl` / `pull_request_url` + - `RepositoryUrl` / `repository_url` + - `BranchName` / `branch_name` + - `CodeBuildBuildId` -`--allow-empty` handles the case where the rendered files are identical to what -was already on the branch, preventing the build from failing on a no-change run. +The CFN stack completes and the output panel shows the PR URL. --- -### Step 8 — Run Terraform +## Step-by-Step: Apply Flow -The build changes into the target layer/region directory and runs the Census -`tf-run` toolchain: +### 1.
Prerequisites -```bash -cd "${LAYER}/${REGION_DIR}" +- The Proposer has run and its PR has been **reviewed and merged** to `main` +- The target account has the `sc-automation-codebuild-role` IAM role with a trust + policy allowing assume-role from the CodeBuild execution role in csvd-dev -# Dry run: plan only, no apply, no PR -if [ "${DRY_RUN}" = "true" ]; then - tf-plan -no-color +### 2. User fills the SC form -# Start from a specific tf-run.data step -elif [ -n "${TF_RUN_START_TAG}" ]; then - TFARGS="-auto-approve" tf-run apply "tag:${TF_RUN_START_TAG}" +The user opens the **tf-run-executor** product and provides: -# Full run from the beginning -else - TFARGS="-auto-approve" tf-run apply -fi -``` +- **AccountRepo** — same repo name as the Proposer +- **Layer** and **RegionDir** — same as the Proposer +- **TargetAccountId** _(optional)_ — if set, CodeBuild assumes the cross-account role +- **TfRunStartTag** _(optional)_ — start tf-run from a specific `TAG` step +- **DryRun** — `true` for plan-only, `false` to apply -**`tf-run` in non-interactive mode**: `tf-run` normally prompts `continue [y/n]` -between steps. In CodeBuild there is no TTY, so `read -t $TIMEOUT` returns -immediately with a non-zero exit code and the default `y` is used — `tf-run` -auto-proceeds through all steps without any manual intervention needed. - -**`tf-run plan` vs `tf-plan`**: Note that the dry-run path invokes `tf-plan` -directly (the bare `tf-control.sh` wrapper around `terraform plan`). This skips -the `tf-run` orchestration layer, which means it will **not** create the initial -symlinks that a first-time `tf-run` pass would normally set up. If the layer has -never been run before, prefer `tf-run plan` (via `TF_RUN_START_TAG` pointing to -an early step, or by running `tf-run init` manually first) rather than dry-run -mode. 
- -**Cross-account credentials**: If `TARGET_ACCOUNT_ID` is set, the build assumes -`arn:{partition}:iam::{TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role` via -`aws sts assume-role` immediately before `cd`-ing into the layer directory and -running `tf-run`. The assumed credentials are exported as `AWS_ACCESS_KEY_ID`, -`AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN`, which Terraform and the AWS -provider pick up automatically. If `TARGET_ACCOUNT_ID` is empty, CodeBuild runs -with its own IAM role (scoped to csvd-dev). - -**For the target directory to work**, the account repo must already have been -initialised for that layer. That means `//` must contain: - -- `remote_state.yml` — metadata used by `tf-directory-setup.py` to generate backend config -- `tf-run.data` — step definitions for this layer/region -- `.tf-control` at the repo root — Terraform version pin - -`remote_state.backend.tf` is **not** required to pre-exist — the `REMOTE_STATE` -directive in `tf-run.data` generates it on first run by reading `remote_state.yml` -in the parent directories. What does need to exist is `remote_state.yml` and the -containing directory structure. - -**git-secret**: Account repos use `git-secret` for GPG-encrypted secrets. The -CodeBuild environment does not have the team's GPG key and cannot decrypt those -files. Builds that require decrypted secrets must ensure those values are supplied -via another mechanism (e.g. Secrets Manager) rather than relying on `git secret reveal`. +### 3. CloudFormation invokes the Lambda ---- +CFN creates a `Custom::TerraformApply` resource with `action: apply`. -### Step 9 — Open Pull Request - -```bash -GH_HOST=github.e.it.census.gov \ -GH_TOKEN="${GITHUB_TOKEN}" \ -gh pr create \ - --title "SC automation: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ - --body "Triggered by Service Catalog provisioning of ${ACCOUNT_REPO}." \ - --base main \ - --head "${GIT_BRANCH}" \ -|| echo "PR already exists or create failed, continuing" -``` +### 4. 
Lambda validates and starts CodeBuild -If the PR already exists for this branch (e.g., from a previous provision of the -same stack), the `gh pr create` command exits non-zero but the `|| echo` prevents -the build from failing. The `post_build` phase then fetches the existing PR URL -with `gh pr view`. +Lambda starts `tf-run-executor` with: ---- - -### Step 10 — Lambda Returns Results to CloudFormation - -Once CodeBuild reports `SUCCEEDED`, the Lambda calls the GHE REST API -(`/repos/{org}/{repo}/pulls?state=open&head={org}:{branch}`) to fetch the open PR -URL for the branch. The `PR_URL=` line emitted in `post_build` is informational -only (visible in CodeBuild logs as a convenience) — the Lambda does not parse -build output; it calls the GHE API independently. - -The Lambda then signals CloudFormation with: - -| CloudFormation Output | Value | -|---|---| -| `PullRequestUrl` / `pull_request_url` | URL of the open PR | -| `RepositoryUrl` / `repository_url` | URL of the account repo | -| `BranchName` / `branch_name` | Branch that was committed to | -| `CodeBuildBuildId` | Full CodeBuild build ID (useful for looking up logs) | +``` +ACCOUNT_REPO, LAYER, REGION_DIR, +TARGET_ACCOUNT_ID, TF_RUN_START_TAG, DRY_RUN, GITHUB_TOKEN +``` -Both PascalCase and snake_case variants are returned for the first three outputs so -that `!GetAtt` works regardless of which form the consuming CFN template uses. +### 5. CodeBuild - INSTALL phase -The CloudFormation stack transitions to `CREATE_COMPLETE` and the Service Catalog -provisioned product shows as **AVAILABLE**. +- Clones `github.e.it.census.gov/terraform/support` for version governance +- Downloads Terraform binary from S3 (version governed by `VERSION_TF`) +- Installs tf-run toolchain scripts from the support repo +- Downloads and installs Census CA cert +- Downloads and installs `gh` CLI +- `pip3 install python-dateutil pyyaml` ---- +### 6. CodeBuild - BUILD phase -## Infrastructure Overview +1. 
Rewrite git remotes; `git clone` account repo; `git checkout main` +2. If `TARGET_ACCOUNT_ID` is set: `aws sts assume-role` → + `arn:aws-us-gov:iam::{TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role` + and export the temporary credentials +3. `cd ${LAYER}/${REGION_DIR}` +4. If `DRY_RUN=true`: `tf-run plan`; else: `tf-run apply` (with optional `--start-tag ${TF_RUN_START_TAG}`) -All infrastructure lives in `deploy/` and is managed by Terraform. It is deployed -once to `csvd-dev` (`229685449397`, `us-gov-west-1`) and shared to all other -accounts in the org via Service Catalog portfolio sharing. +### 7. Lambda polls and returns -| Resource | Purpose | -|---|---| -| `aws_ecr_repository.lambda` | Container image registry for the Lambda | -| `aws_lambda_function.tf_run_trigger` | The Lambda function (`tf-run-executor-trigger`) | -| `aws_lambda_permission.cfn_invoke` | Allows CloudFormation in any org account to invoke the Lambda cross-account | -| `aws_codebuild_project.tf_run_executor` | The CodeBuild project that runs builds | -| `aws_codebuild_source_credential.ghe` | GHE PAT credential for CodeBuild to clone from GHE | -| `aws_iam_role.lambda_exec` | Lambda execution role: Secrets Manager read, CodeBuild start/poll, CloudWatch Logs write | -| `aws_iam_role.codebuild_exec` | CodeBuild service role: S3 read, Secrets Manager read, CloudWatch Logs write | -| `aws_s3_object.product_template` | The CFN product template uploaded to S3 for Service Catalog | -| `aws_servicecatalog_portfolio.this` | The SC portfolio | -| `aws_servicecatalog_product.tf_run` | The SC product (CloudFormation template type) | -| `aws_servicecatalog_constraint.launch` | Launch constraint tying the product to the SC launch IAM role | -| `aws_iam_role.sc_launch` | SC launch role: invoke Lambda + CloudFormation operations | - -### Why Everything Is Centralized in csvd-dev - -The Lambda only calls GHE and CodeBuild — it makes zero AWS API calls in the -account where the SC product is provisioned.
There is no reason to deploy a copy -of the Lambda, CodeBuild project, ECR repo, or Secrets Manager secret to every -account. Centralizing in `csvd-dev` means there is one place to update and one -set of credentials to rotate. - -> **Future**: when the org has designated **operations accounts** (standard -> Census infrastructure tier), this system should move there rather than living -> in a team-owned account like `csvd-dev`. The move is straightforward — -> see *Moving This System to a Different AWS Account* below. +On `SUCCEEDED`: +- Sends CFN `SUCCESS` with: + - `ApplyStatus: SUCCEEDED` + - `RepositoryUrl` / `repository_url` + - `CodeBuildBuildId` --- -## Template Repository Design Guide +## Key Constraints -A template repo is any GHE repository (in the same org) that contains the files -you want written into the account repo. There are no required conventions — the -directory structure you use in the template repo becomes the directory structure -in the account repo. +### Census Network Proxy -### Jinja2 Templates (`.j2` files) +CodeBuild runs on standard Amazon Linux 2023 inside the Census VPC. Registry +traffic (Terraform providers, Python packages) must go through the Census HTTP +proxy: -Files ending in `.j2` are rendered with Jinja2 before being written. The -`TemplateVars` JSON object passed through the SC form provides the variable -context. 
- -**Example template** (`vpc/west/main.tf.j2`): -```hcl -# Generated by SC automation -locals { - vpc_name = "{{ vpc_name }}" - cidr_block = "{{ cidr_block }}" - env = "{{ environment }}" -} -``` - -**Example TemplateVars JSON**: -```json -{ - "vpc_name": "my-vpc", - "cidr_block": "10.20.0.0/16", - "environment": "dev" -} ``` - -**Output** written to `vpc/west/main.tf` in the account repo: -```hcl -# Generated by SC automation -locals { - vpc_name = "my-vpc" - cidr_block = "10.20.0.0/16" - env = "dev" -} +HTTPS_PROXY=http://proxy.tco.census.gov:3128 +NO_PROXY=github.e.it.census.gov,169.254.169.254,169.254.170.2 ``` -`StrictUndefined` is enabled — if a template references `{{ some_variable }}` and -`some_variable` is not in `TemplateVars`, the build fails immediately with a clear -error message. - -### Non-template Files +`github.e.it.census.gov` is in `NO_PROXY` because it is accessed directly +(the proxy does not handle GHE traffic). -Files without the `.j2` extension are copied verbatim. Use these for static -Terraform files, `tf-run.data` step definitions, `.tf-control` version pins, and -anything else that does not vary per deployment. +### TLS — Census CA Certificate -### Extra Files Override +The internal GHE host uses a Census-signed TLS certificate. The Census CA cert +must be installed into the OS trust store (`update-ca-trust`) before any `git`, +`gh`, or `pip` commands that touch GHE or Census-mirrored registries. -If you need to inject a file that is not in the template repo — or override a -specific file from the template repo for a particular deployment — use -`ExtraFiles`. It runs after template rendering and wins on any path conflict. +### Terraform Binary from S3 ---- - -## Key Constraints and Limitations - -### Moving This System to a Different AWS Account - -The infrastructure is straightforward to relocate. All account-specific values -are Terraform variables — there are no hardcoded account IDs in the code. 
To -move the Lambda and CodeBuild project to a different account: - -1. Update `deploy/terraform.tfvars` with the new account's values -2. Run `tf init && tf apply` in the new account -3. Rebuild the Lambda image with `packer-pipeline` in the new account context -4. Update the Service Catalog `ServiceToken` ARN in each SC product template to - point to the new account's Lambda ARN (`deploy/service_catalog.tf` handles - this automatically via `!Sub` with the new account ID) -5. Update portfolio sharing in `deploy/service_catalog.tf` to share to the org - -No code changes are required — only variable and deployment target changes. - -### How CodeBuild Runs Terraform in a Different Account - -CodeBuild itself always runs in the account where it is deployed (initially -csvd-dev). When `TARGET_ACCOUNT_ID` is set on a product launch, the buildspec -assumes a cross-account IAM role in that account before running `tf-run apply`. - -**For this to work, the target account must have a role named -`sc-automation-codebuild-role`** with: -- A trust policy allowing the CodeBuild IAM role from csvd-dev to assume it: - ```json - { - "Principal": { - "AWS": "arn:aws-us-gov:iam::229685449397:role/tf-run-executor-codebuild" - }, - "Action": "sts:AssumeRole" - } - ``` -- Permissions to read/write the S3 Terraform state bucket, create/modify the - resources the Terraform config manages, and write CloudWatch Logs - -This role is **not** created by this repo — it must be provisioned separately in -each target account before the SC product can successfully apply there. -Creating that role is a future work item (potentially via another SC product). - -If `TARGET_ACCOUNT_ID` is left blank, CodeBuild uses its own IAM role and can -only apply Terraform that targets resources within csvd-dev. - -The Lambda uses `{account_repo}-{layer}-{region_dir}` as the CloudFormation -`PhysicalResourceId`. 
This means that if you update a stack but those three -fields stay the same, CloudFormation treats it as an in-place `Update` of the -existing resource (and triggers another build). If any of those three fields -change, CloudFormation treats it as a replacement — it will call `Delete` on the -old resource (a no-op) and `Create` on the new one (triggering a fresh build in -the new target location). This is intentional behavior. - -### Inputs the Lambda Enforces - -- `layer` must be exactly `common`, `infrastructure`, or `vpc` -- `region_dir` must be exactly `east`, `west`, or `global` -- `template_vars` and `extra_files` must be valid JSON objects (or empty strings) - -These constraints exist because `tf-run` is directory-structured and expects to -`cd` into a path like `infrastructure/west/` or `common/global/`. - -### Account Repo Must Have `remote_state.yml` - -The `REMOTE_STATE` directive in `tf-run.data` generates `remote_state.backend.tf` -automatically on first run — you do **not** need to manually create it beforehand. -However, it does require a `remote_state.yml` to already exist in the -`//` directory (and its parents) with the correct account ID, -bucket name, and region metadata. This file is part of the standard account repo -structure and must be committed to the repo before this automation can run there. - -Running `tf-run init` against the layer is the correct way to set up a new -layer — it creates `remote_state.yml`, generates the backend file, and creates -the initial symlinks. This automation assumes `tf-run init` has already been run -at least once on the target layer/region. - -### Build Timeout - -The Lambda timeout is 900 seconds (15 minutes). The CodeBuild project timeout is -60 minutes. The Lambda polls until 60 seconds before its own deadline and then -gives up, returning `LAMBDA_TIMEOUT`. The CodeBuild build continues running after -a Lambda timeout — check the CodeBuild console for full logs. 
- -> **Future improvement**: polling synchronously inside the Lambda for up to 15 -> minutes is wasteful from a cost and concurrency standpoint. A better pattern -> is to have CodeBuild emit an EventBridge event on build completion, and have -> a separate Lambda (or Step Functions state machine) listen for it and signal -> CloudFormation. This would let the trigger Lambda exit immediately after -> starting the build. This is a known design limitation, tracked as future work. - -### SSH Module Sources Are Supported (via URL Rewrite) - -Account repos reference Terraform modules using SSH-style URLs -(`git::ssh://github.e.it.census.gov/...` or `git@github.e.it.census.gov:...`). -These work transparently in CodeBuild because the build configures a global git -URL rewrite at the start of the BUILD phase that redirects all SSH-form GHE URLs -to HTTPS + PAT. No deploy keys or SSH configuration are required. - -Direct `git clone` of the account repo and template repo also uses HTTPS + PAT. -GHE is on the Census internal network and is in `NO_PROXY`, so no proxy is -involved in any GHE git operation. - -> **Alternative**: A service-account SSH key (rather than a PAT URL rewrite) -> could also support SSH module sources. The URL rewrite approach was chosen -> because it requires no key management and works with the existing GHE PAT -> already used for repository access. A service-user SSH key is a viable -> alternative if the URL rewrite causes issues. - -### Delete Is a No-op - -CloudFormation `Delete` events (stack deletion or product termination) are -acknowledged with `SUCCESS` immediately. No Terraform destroy is run. This is -intentional — automated infra teardown is too risky without explicit human review. - -The recommended decommission path is to use this same system to open a PR that -removes the resource definitions from the account repo, review and merge the PR -manually, then run `tf apply` outside of automation to destroy the resources. 
- ---- - -## Adding Support for a New Terraform Configuration - -To onboard a new type of Terraform work to this platform: - -1. **Create a template repo** in the `SCT-Engineering` GHE org. Add your `.tf` - files, using `.j2` extension for any file that needs variable substitution. -2. **Document the required `TemplateVars`** so SC product users know what JSON - to supply. -3. **Create a new SC product** (a new CloudFormation template in - `service-catalog/`) that pre-fills `TemplateRepo` with your repo name and - guides users toward filling in the right `TemplateVars`. -4. **Add the product to the portfolio** by referencing it from `deploy/service_catalog.tf`. - -The Lambda and CodeBuild infrastructure itself does not need to change. - ---- +`registry.terraform.io/hashicorp` is blocked on the Census network. The Terraform +binary is pre-staged in S3 (`csvd-packer-pipeline-assets`) and downloaded during +the INSTALL phase. The version is governed by `VERSION_TF` in the +`github.e.it.census.gov/terraform/support` repo. -## Rebuilding the Lambda Image +### GitHub Provider: CSVD Module Only -When `lambda/app.py`, `lambda/Dockerfile`, or `lambda/requirements.txt` change, -rebuild and push the container image. From the repo root (after sourcing your AWS -credentials): +The executor uses the `CSVD/terraform-github-repo` internal Terraform module. +The public `HappyPathway/terraform-github-repo` is pinned to `github ~> 6.0` +which conflicts with the `>= 6.11.0` constraint used here. Do not switch modules. 
-```bash -cd sc-lambda-ghactions -packer-pipeline --config csvd_config_packer.hcl -``` +### Cross-Account Role -After the `tf-run-executor-builder` CodeBuild build succeeds, run `tf apply` in -`deploy/` to update the Lambda to the new image digest — the Terraform resource -for the Lambda already references `latest` and a `tf apply` will detect and -push the update: +For the executor to apply Terraform in an account other than csvd-dev, the target +account must have: -```bash -cd deploy/ -tf plan # confirm only the Lambda image digest changes -tf apply +```hcl +resource "aws_iam_role" "sc_automation_codebuild" { + name = "sc-automation-codebuild-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { AWS = "arn:aws-us-gov:iam::229685449397:role/tf-run-executor-codebuild" } + Action = "sts:AssumeRole" + }] + }) +} ``` -## Deploying Infrastructure Changes +The executor role in csvd-dev has `sts:AssumeRole` on +`arn:*:iam::*:role/sc-automation-codebuild-role`. -From the `deploy/` directory (after sourcing your AWS credentials): +### Proposer is Idempotent -```bash -export AWS_DEFAULT_REGION=us-gov-west-1 -cd sc-lambda-ghactions/deploy -tf init # only needed after provider or backend changes -tf plan -tf apply -``` +The proposer uses `git push --force-with-lease` and `gh pr create` with a check +for an existing open PR. Re-provisioning the Proposer product will update the +branch and PR rather than creating a duplicate. --- -## Troubleshooting - -### CloudFormation stack stuck in CREATE_IN_PROGRESS +## Parameter Naming Convention -The Lambda is polling CodeBuild. Check the Lambda logs in CloudWatch -(`/aws/lambda/tf-run-executor-trigger`) and find the CodeBuild build ID in the -log output, then open that build in the CodeBuild console. +The CFN product templates pass all parameters in `snake_case` directly to the +Lambda. The Lambda Pydantic model uses `snake_case` field names. 
Passing +`snake_case` from CFN avoids the PascalCase→snake_case normalizer, which +mishandles acronyms (`AWSAccountId` → `a_w_s_account_id`). -### FAILED: validation error - -The Lambda rejected the inputs before the build started. The CFN stack event -reason field will contain the Pydantic validation error. Fix the parameter and -update the stack. - -### FAILED: CodeBuild build FAILED - -Open the CodeBuild build using the build ID in the failure reason. The most -common causes are: -- Template rendered with missing variables (`StrictUndefined` error) — add the - missing key to `TemplateVars` -- `tf-run` failed on a Terraform plan/apply error — check the full build log for - the Terraform error output -- Account repo not bootstrapped — the target `//` directory - is missing `remote_state.backend.tf` +--- -### LAMBDA_TIMEOUT +## Files Reference -The build took longer than the Lambda poll window (~14 minutes). The CodeBuild -build is still running. Wait for it to complete, check the PR on GHE, and if -needed do a stack Update (which will start a new build) to re-sync CFN with the -result. 
+| File | Purpose | +|---|---| +| `buildspec-proposer.yml` | CodeBuild build definition for the proposer project | +| `buildspec-executor.yml` | CodeBuild build definition for the executor project | +| `lambda/app.py` | Lambda entry point: validates inputs, routes to proposer or executor | +| `deploy/codebuild.tf` | Terraform: `aws_codebuild_project.tf_run_proposer` + `tf_run_executor` | +| `deploy/lambda.tf` | Terraform: Lambda function with `PROPOSER_PROJECT_NAME` + `EXECUTOR_PROJECT_NAME` | +| `deploy/iam.tf` | Terraform: IAM roles for Lambda, CodeBuild (with `sts:AssumeRole`), SC launch | +| `deploy/service_catalog.tf` | Terraform: Portfolio, two products, two launch constraints | +| `service-catalog/proposer-template.yaml` | CFN template for the Propose product | +| `service-catalog/executor-template.yaml` | CFN template for the Apply product | diff --git a/lambda/app.py b/lambda/app.py index eeb20dd..decab1d 100644 --- a/lambda/app.py +++ b/lambda/app.py @@ -43,16 +43,21 @@ class TfRunRequest(BaseModel): """Validated input for a tf-run-executor CodeBuild invocation.""" + action: Literal["propose", "apply"] = Field(..., description="propose = render templates + open PR; apply = tf-run apply on main after PR is merged") account_repo: str = Field(..., description="Account repo name, e.g. 
229685449397-csvd-dev-platform-dev-gov") layer: Literal["common", "infrastructure", "vpc"] = Field(..., description="Terraform layer") region_dir: Literal["east", "west", "global"] = Field(..., description="Region directory (east, west, or global for non-regional resources like SSO/IAM)") - tf_run_start_tag: str = Field(default="", description="tf-run.data TAG label to start from; empty = from beginning") + + # --- Proposer fields (action=propose only) --- template_repo: str = Field(default="", description="GHE repo name containing Jinja2/raw template files to render into the account repo") template_vars: dict = Field(default_factory=dict, description='JSON map of variables passed to Jinja2 when rendering template_repo files') - extra_files: dict = Field(default_factory=dict, description='JSON map {"relative/path": "content"} written into account repo (after template rendering; overrides template output)') - git_branch: str = Field(default="repo-init", description="Branch to commit and open PR from") - dry_run: bool = Field(default=False, description="true = tf plan only, no apply") + extra_files: dict = Field(default_factory=dict, description='JSON map {"relative/path": "content"} written into account repo (after template rendering)') + git_branch: str = Field(default="propose/sc-automation", description="Branch to commit and open PR from (propose only)") + + # --- Executor fields (action=apply only) --- target_account_id: str = Field(default="", description="AWS account ID to assume sc-automation-codebuild-role in before running tf-run; empty = run with CodeBuild role (csvd-dev)") + tf_run_start_tag: str = Field(default="", description="tf-run.data TAG label to start from; empty = from beginning (apply only)") + dry_run: bool = Field(default=False, description="true = tf-run plan only, no apply (apply action only)") @field_validator("extra_files", "template_vars", mode="before") @classmethod @@ -137,34 +142,45 @@ def start_codebuild_build( github_token: str, 
request_id: str, ) -> str: - """Start the tf-run-executor CodeBuild project with per-build env-var overrides. + """Start the proposer or executor CodeBuild project with per-build env-var overrides. Returns the CodeBuild build ID. """ - project_name = os.environ.get("CODEBUILD_PROJECT_NAME", "tf-run-executor") + if tf_req.action == "propose": + project_name = os.environ.get("PROPOSER_PROJECT_NAME", "tf-run-proposer") + env_overrides = [ + {"name": "ACCOUNT_REPO", "value": tf_req.account_repo, "type": "PLAINTEXT"}, + {"name": "LAYER", "value": tf_req.layer, "type": "PLAINTEXT"}, + {"name": "REGION_DIR", "value": tf_req.region_dir, "type": "PLAINTEXT"}, + {"name": "GIT_BRANCH", "value": tf_req.git_branch, "type": "PLAINTEXT"}, + {"name": "TEMPLATE_REPO", "value": tf_req.template_repo, "type": "PLAINTEXT"}, + {"name": "TEMPLATE_VARS", "value": json.dumps(tf_req.template_vars), "type": "PLAINTEXT"}, + {"name": "EXTRA_FILES", "value": json.dumps(tf_req.extra_files), "type": "PLAINTEXT"}, + {"name": "GITHUB_TOKEN", "value": github_token, "type": "PLAINTEXT"}, + ] + else: # apply + project_name = os.environ.get("EXECUTOR_PROJECT_NAME", "tf-run-executor") + env_overrides = [ + {"name": "ACCOUNT_REPO", "value": tf_req.account_repo, "type": "PLAINTEXT"}, + {"name": "LAYER", "value": tf_req.layer, "type": "PLAINTEXT"}, + {"name": "REGION_DIR", "value": tf_req.region_dir, "type": "PLAINTEXT"}, + {"name": "TARGET_ACCOUNT_ID", "value": tf_req.target_account_id, "type": "PLAINTEXT"}, + {"name": "TF_RUN_START_TAG", "value": tf_req.tf_run_start_tag, "type": "PLAINTEXT"}, + {"name": "DRY_RUN", "value": str(tf_req.dry_run).lower(), "type": "PLAINTEXT"}, + {"name": "GITHUB_TOKEN", "value": github_token, "type": "PLAINTEXT"}, + ] + region = os.environ.get("AWS_REGION", os.environ.get("AWS_DEFAULT_REGION", "us-gov-west-1")) cb = boto3.client("codebuild", region_name=region) logger.info( - f"[{request_id}] Starting CodeBuild '{project_name}' for " + f"[{request_id}] Starting CodeBuild 
'{project_name}' (action={tf_req.action}) for " f"repo={tf_req.account_repo} layer={tf_req.layer}/{tf_req.region_dir}" ) response = cb.start_build( projectName=project_name, - environmentVariablesOverride=[ - {"name": "ACCOUNT_REPO", "value": tf_req.account_repo, "type": "PLAINTEXT"}, - {"name": "LAYER", "value": tf_req.layer, "type": "PLAINTEXT"}, - {"name": "REGION_DIR", "value": tf_req.region_dir, "type": "PLAINTEXT"}, - {"name": "GIT_BRANCH", "value": tf_req.git_branch, "type": "PLAINTEXT"}, - {"name": "TF_RUN_START_TAG", "value": tf_req.tf_run_start_tag, "type": "PLAINTEXT"}, - {"name": "TEMPLATE_REPO", "value": tf_req.template_repo, "type": "PLAINTEXT"}, - {"name": "TEMPLATE_VARS", "value": json.dumps(tf_req.template_vars), "type": "PLAINTEXT"}, - {"name": "EXTRA_FILES", "value": json.dumps(tf_req.extra_files), "type": "PLAINTEXT"}, - {"name": "DRY_RUN", "value": str(tf_req.dry_run).lower(), "type": "PLAINTEXT"}, - {"name": "TARGET_ACCOUNT_ID", "value": tf_req.target_account_id, "type": "PLAINTEXT"}, - {"name": "GITHUB_TOKEN", "value": github_token, "type": "PLAINTEXT"}, - ], + environmentVariablesOverride=env_overrides, ) build_id = response["build"]["id"] logger.info(f"[{request_id}] CodeBuild build started: {build_id}") @@ -295,7 +311,7 @@ def lambda_handler(event: dict, context) -> dict: tf_req = TfRunRequest(**normalized) logger.info( - f"[{request_id}] repo={tf_req.account_repo} " + f"[{request_id}] action={tf_req.action} repo={tf_req.account_repo} " f"layer={tf_req.layer}/{tf_req.region_dir} " f"branch={tf_req.git_branch} dry_run={tf_req.dry_run}" ) @@ -312,23 +328,31 @@ def lambda_handler(event: dict, context) -> dict: build_status, logs_url = poll_codebuild_build(build_id, request_id, poll_budget_min) if build_status == "SUCCEEDED": - pr_url = fetch_pr_url(github_token, tf_req.account_repo, tf_req.git_branch, request_id) github_base = os.environ.get("GITHUB_API", "https://github.e.it.census.gov/api/v3").rstrip("/").removesuffix("/api/v3") github_org 
= os.environ.get("GITHUB_ORG_NAME", "SCT-Engineering") repo_url = f"{github_base}/{github_org}/{tf_req.account_repo}" - response_data = { - "PullRequestUrl": pr_url, - "pull_request_url": pr_url, - "RepositoryUrl": repo_url, - "repository_url": repo_url, - "BranchName": tf_req.git_branch, - "branch_name": tf_req.git_branch, - "CodeBuildBuildId": build_id, - } + if tf_req.action == "propose": + pr_url = fetch_pr_url(github_token, tf_req.account_repo, tf_req.git_branch, request_id) + response_data = { + "PullRequestUrl": pr_url, + "pull_request_url": pr_url, + "RepositoryUrl": repo_url, + "repository_url": repo_url, + "BranchName": tf_req.git_branch, + "branch_name": tf_req.git_branch, + "CodeBuildBuildId": build_id, + } + else: # apply + response_data = { + "ApplyStatus": "SUCCEEDED", + "RepositoryUrl": repo_url, + "repository_url": repo_url, + "CodeBuildBuildId": build_id, + } send_cfn_response( event, context, "SUCCESS", response_data, - physical_resource_id=f"{tf_req.account_repo}-{tf_req.layer}-{tf_req.region_dir}", + physical_resource_id=f"{tf_req.action}-{tf_req.account_repo}-{tf_req.layer}-{tf_req.region_dir}", ) return {"statusCode": 200, "body": json.dumps(response_data)} diff --git a/service-catalog/executor-template.yaml b/service-catalog/executor-template.yaml new file mode 100644 index 0000000..06d55c2 --- /dev/null +++ b/service-catalog/executor-template.yaml @@ -0,0 +1,127 @@ +AWSTemplateFormatVersion: '2010-09-09' +Description: >- + Service Catalog Product: Apply Terraform changes for an account repo layer. + Clones the account repo at main, optionally assumes a cross-account role, + and runs tf-run apply. Intended for use AFTER the corresponding Propose + product's pull request has been reviewed and merged. 
+ +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: "Target Repository" + Parameters: + - AccountRepo + - Layer + - RegionDir + - Label: + default: "Execution Options" + Parameters: + - TargetAccountId + - TfRunStartTag + - DryRun + + ParameterLabels: + AccountRepo: + default: "Account Repo Name" + Layer: + default: "Terraform Layer" + RegionDir: + default: "Region Directory" + TargetAccountId: + default: "Target AWS Account ID (optional)" + TfRunStartTag: + default: "tf-run Start Tag (optional)" + DryRun: + default: "Dry Run (plan only)" + +Parameters: + AccountRepo: + Type: String + Description: >- + Account repo name, e.g. 229685449397-csvd-dev-platform-dev-gov. + Must already exist in the SCT-Engineering GitHub org. + AllowedPattern: '^[a-z0-9][a-z0-9-]*[a-z0-9]$' + ConstraintDescription: Lowercase letters, numbers, and hyphens only + MinLength: 3 + MaxLength: 100 + + Layer: + Type: String + Description: Terraform layer to run tf-run apply in + AllowedValues: + - common + - infrastructure + - vpc + + RegionDir: + Type: String + Description: Region directory within the layer + AllowedValues: + - east + - west + - global + + TargetAccountId: + Type: String + Description: >- + AWS account ID that CodeBuild should run terraform apply against. + When set, CodeBuild assumes arn:{partition}:iam::{TargetAccountId}:role/sc-automation-codebuild-role + before executing tf-run. That role must exist in the target account with a + trust policy allowing the CodeBuild IAM role from csvd-dev (229685449397). + Leave blank to run with the CodeBuild role's own credentials (csvd-dev only). + Default: "" + MaxLength: 12 + AllowedPattern: '^[0-9]{12}$|^$' + ConstraintDescription: Must be a 12-digit AWS account ID or empty + + TfRunStartTag: + Type: String + Description: >- + tf-run.data TAG label to start execution from. + Leave blank to run all steps from the beginning. 
+ Default: "" + MaxLength: 100 + + DryRun: + Type: String + Description: >- + Set to 'true' to run tf plan only (no apply). + Useful for validating before committing to an apply. + AllowedValues: + - "true" + - "false" + Default: "false" + +Resources: + ApplyResource: + Type: Custom::TerraformApply + Properties: + ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:tf-run-executor-trigger" + # action is hardcoded — this product always applies + action: apply + account_repo: !Ref AccountRepo + layer: !Ref Layer + region_dir: !Ref RegionDir + target_account_id: !Ref TargetAccountId + tf_run_start_tag: !Ref TfRunStartTag + dry_run: !Ref DryRun + +Outputs: + ApplyStatus: + Description: Result of the tf-run apply ("SUCCEEDED" or "FAILED") + Value: !GetAtt ApplyResource.ApplyStatus + Export: + Name: !Sub '${AWS::StackName}-ApplyStatus' + + RepositoryUrl: + Description: URL of the account repository + Value: !GetAtt ApplyResource.repository_url + Export: + Name: !Sub '${AWS::StackName}-RepositoryUrl' + + CodeBuildBuildId: + Description: ID of the CodeBuild executor build + Value: !GetAtt ApplyResource.CodeBuildBuildId + Export: + Name: !Sub '${AWS::StackName}-CodeBuildBuildId' diff --git a/service-catalog/proposer-template.yaml b/service-catalog/proposer-template.yaml new file mode 100644 index 0000000..bc3dd50 --- /dev/null +++ b/service-catalog/proposer-template.yaml @@ -0,0 +1,145 @@ +AWSTemplateFormatVersion: '2010-09-09' +Description: >- + Service Catalog Product: Propose Terraform changes for an account repo layer. + Clones the account repo, renders template files, writes extra config files, + commits to a branch, and opens a pull request for human review. + A separate "Apply" product executes the changes after the PR is merged. 
+ +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: "Target Repository" + Parameters: + - AccountRepo + - Layer + - RegionDir + - Label: + default: "Proposal Branch" + Parameters: + - GitBranch + - Label: + default: "Template Repository (optional)" + Parameters: + - TemplateRepo + - TemplateVars + - Label: + default: "Extra Files (optional)" + Parameters: + - ExtraFiles + + ParameterLabels: + AccountRepo: + default: "Account Repo Name" + Layer: + default: "Terraform Layer" + RegionDir: + default: "Region Directory" + GitBranch: + default: "Proposal Branch Name" + TemplateRepo: + default: "Template Repository Name (optional)" + TemplateVars: + default: "Template Variables (JSON)" + ExtraFiles: + default: "Extra Config Files (JSON)" + +Parameters: + AccountRepo: + Type: String + Description: >- + Account repo name, e.g. 229685449397-csvd-dev-platform-dev-gov. + Must already exist in the SCT-Engineering GitHub org. + AllowedPattern: '^[a-z0-9][a-z0-9-]*[a-z0-9]$' + ConstraintDescription: Lowercase letters, numbers, and hyphens only + MinLength: 3 + MaxLength: 100 + + Layer: + Type: String + Description: Terraform layer to write files into + AllowedValues: + - common + - infrastructure + - vpc + + RegionDir: + Type: String + Description: Region directory within the layer + AllowedValues: + - east + - west + - global + + GitBranch: + Type: String + Description: Branch to commit proposed changes to and open the PR from + Default: propose/sc-automation + MinLength: 1 + MaxLength: 100 + + TemplateRepo: + Type: String + Description: >- + Name of a GHE repo (in the same org) containing template files. + Files ending in .j2 are rendered as Jinja2 templates using TemplateVars. + All other files are copied as-is. Results land in the account repo at + the same relative paths. + Leave blank to skip template rendering. 
+ Default: "" + MaxLength: 100 + + TemplateVars: + Type: String + Description: >- + JSON object of variables passed to Jinja2 when rendering .j2 files + from the TemplateRepo. Example: {"cluster_name": "my-eks", "env": "dev"} + Default: "{}" + + ExtraFiles: + Type: String + Description: >- + JSON object mapping relative repo paths to raw file contents. + Applied after TemplateRepo rendering; keys here override template output. + Example: {"vpc/west/my-config.tf": "# placeholder"} + Default: "{}" + +Resources: + ProposeResource: + Type: Custom::TerraformPropose + Properties: + ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:tf-run-executor-trigger" + # action is hardcoded — this product always proposes (open PR only) + action: propose + account_repo: !Ref AccountRepo + layer: !Ref Layer + region_dir: !Ref RegionDir + git_branch: !Ref GitBranch + template_repo: !Ref TemplateRepo + template_vars: !Ref TemplateVars + extra_files: !Ref ExtraFiles + +Outputs: + PullRequestUrl: + Description: URL of the pull request opened by the proposer + Value: !GetAtt ProposeResource.pull_request_url + Export: + Name: !Sub '${AWS::StackName}-PullRequestUrl' + + RepositoryUrl: + Description: URL of the account repository + Value: !GetAtt ProposeResource.repository_url + Export: + Name: !Sub '${AWS::StackName}-RepositoryUrl' + + BranchName: + Description: Branch that was committed to + Value: !GetAtt ProposeResource.branch_name + Export: + Name: !Sub '${AWS::StackName}-BranchName' + + CodeBuildBuildId: + Description: ID of the CodeBuild proposer build + Value: !GetAtt ProposeResource.CodeBuildBuildId + Export: + Name: !Sub '${AWS::StackName}-CodeBuildBuildId' From 97921ff50cfe3a9570aafa0443728f5b15765bcc Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Mon, 11 May 2026 17:44:05 -0400 Subject: [PATCH 07/27] feat: Add initial buildspec.yml for tf-run-executor configuration --- buildspec.yml | 225 ++++++++++++++++++++++++++++++++++++++++++++++++++ 
1 file changed, 225 insertions(+) create mode 100644 buildspec.yml diff --git a/buildspec.yml b/buildspec.yml new file mode 100644 index 0000000..f3029ba --- /dev/null +++ b/buildspec.yml @@ -0,0 +1,225 @@ +version: 0.2 + +# --------------------------------------------------------------------------- +# tf-run-executor buildspec +# +# Required env-var overrides per build (supplied by Lambda or manual CLI): +# ACCOUNT_REPO - account repo name, e.g. 229685449397-csvd-dev-platform-dev-gov +# LAYER - terraform layer: common | infrastructure | vpc +# REGION_DIR - region directory: east | west | global +# GITHUB_TOKEN - GHE PAT (type PLAINTEXT, value from Secrets Manager) +# +# Optional env-var overrides: +# GIT_BRANCH - branch to commit/PR from (default: repo-init) +# TF_RUN_START_TAG - tf-run.data TAG label to start from (default: empty = from top) +# TEMPLATE_REPO - GHE repo containing Jinja2/.tf template files (default: empty) +# TEMPLATE_VARS - JSON map of Jinja2 variables for template rendering (default: {}) +# EXTRA_FILES - JSON map {"relative/path": "content"} written after template rendering +# DRY_RUN - "true" = tf plan only, no apply (default: "false") +# TARGET_ACCOUNT_ID - AWS account ID to assume role in before running tf-run +# (default: empty = run with CodeBuild's own credentials, +# i.e. csvd-dev. Set this when targeting a different account.) +# --------------------------------------------------------------------------- + +env: + variables: + GITHUB_ORG: "SCT-Engineering" + # S3 prefixes — filenames are resolved at build time from terraform/support VERSION files. + # The S3 bucket must contain the version pinned in terraform/support (keep in sync). 
+ TF_BINARY_S3_PREFIX: "s3://csvd-packer-pipeline-assets/terraform" + GH_CLI_S3_PREFIX: "s3://csvd-packer-pipeline-assets/tools" + CENSUS_CA_S3: "s3://csvd-packer-pipeline-assets/certs/census-ca.pem" + # Org-canonical version governance: clone this repo to read VERSION files + TERRAFORM_SUPPORT_REPO: "terraform/support" + HTTPS_PROXY: "http://proxy.tco.census.gov:3128" + NO_PROXY: "github.e.it.census.gov,169.254.169.254,169.254.170.2" + # Per-build defaults (overridden via environmentVariablesOverride in Lambda) + GIT_BRANCH: "repo-init" + DRY_RUN: "false" + TF_RUN_START_TAG: "" + TEMPLATE_REPO: "" + TEMPLATE_VARS: "{}" + EXTRA_FILES: "{}" + TARGET_ACCOUNT_ID: "" + +phases: + install: + commands: + # --- Version governance: clone terraform/support to read org-canonical versions --- + # This repo (github.e.it.census.gov/terraform/support) is the single source of truth + # for which Terraform and gh CLI versions the org has blessed. We read VERSION files + # from it rather than hardcoding versions here. + - git clone --depth 1 "https://${GITHUB_TOKEN}@github.e.it.census.gov/${TERRAFORM_SUPPORT_REPO}.git" /tmp/tf-support + - export TF_VERSION=$(cat /tmp/tf-support/terraform/VERSION) + - export GH_VERSION=$(cat /tmp/tf-support/github-cli-releases/VERSION) + - echo "Using Terraform ${TF_VERSION}, gh CLI ${GH_VERSION}" + + # --- Terraform binary (registry.terraform.io is blocked on Census network; use S3) --- + # S3 bucket must contain the version pinned in terraform/support/terraform/VERSION. 
+ - aws s3 cp "${TF_BINARY_S3_PREFIX}/terraform_${TF_VERSION}_linux_amd64.zip" /tmp/terraform.zip + - unzip -o /tmp/terraform.zip -d /usr/local/bin/ && chmod +x /usr/local/bin/terraform + - ln -sf /usr/local/bin/terraform /usr/local/bin/tf + + # --- Census CA certificate (required for TLS to github.e.it.census.gov) --- + - aws s3 cp "$CENSUS_CA_S3" /etc/pki/ca-trust/source/anchors/census-ca.pem + - update-ca-trust extract + + # --- tf-run toolchain (sourced from terraform/support, already cloned above) --- + # Canonical versions live in terraform/support local-app/ — no copies kept in this repo. + - cp /tmp/tf-support/local-app/tf-run/tf-run.sh /usr/local/bin/tf-run + - cp /tmp/tf-support/local-app/tf-control/tf-control.sh /usr/local/bin/tf-control.sh + - cp /tmp/tf-support/local-app/tf-directory-setup/tf-directory-setup.py /usr/local/bin/tf-directory-setup.py + - chmod +x /usr/local/bin/tf-run /usr/local/bin/tf-control.sh /usr/local/bin/tf-directory-setup.py + # Create tf-{action} symlinks expected by tf-run and account repo steps + - > + for action in init plan apply destroy refresh output validate import state fmt taint console; do + ln -sf /usr/local/bin/tf-control.sh /usr/local/bin/tf-${action}; + done + + # --- Python deps for tf-directory-setup.py and template rendering --- + - pip3 install --quiet jinja2 python-dateutil pyyaml + + # --- gh CLI (S3 bucket must contain the version pinned in terraform/support) --- + - aws s3 cp "${GH_CLI_S3_PREFIX}/gh_${GH_VERSION}_linux_amd64.tar.gz" /tmp/gh.tar.gz + - mkdir -p /tmp/gh-cli + - tar -xzf /tmp/gh.tar.gz -C /tmp/gh-cli --strip-components=1 + - cp /tmp/gh-cli/bin/gh /usr/local/bin/gh && chmod +x /usr/local/bin/gh + + build: + commands: + # --- Configure git to rewrite SSH URLs to HTTPS --- + # Account repos reference Terraform modules via ssh://git@github.e.it.census.gov/... + # This rewrite makes those module fetches work transparently via HTTPS + PAT, + # avoiding the need for a per-repo deploy key. 
+ - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "ssh://git@github.e.it.census.gov/" + - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "git@github.e.it.census.gov:" + + # --- Clone account repo --- + - git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" repo + - cd repo + - git checkout -B "${GIT_BRANCH}" + + # --- Render template repo (if specified) into account repo --- + # Clone TEMPLATE_REPO, render .j2 files with TEMPLATE_VARS via Jinja2, + # copy non-template files as-is. Results land in the account repo tree + # at the same relative paths. EXTRA_FILES applied afterwards can override. + - | + if [ -n "${TEMPLATE_REPO}" ]; then + git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${TEMPLATE_REPO}.git" /tmp/template-repo + python3 - <<'PYEOF' + import json, os, pathlib, shutil + from jinja2 import Environment, FileSystemLoader, StrictUndefined + + template_vars = json.loads(os.environ.get('TEMPLATE_VARS', '{}')) + src_root = pathlib.Path('/tmp/template-repo') + dst_root = pathlib.Path('.') # already inside cloned account repo + + rendered = 0 + copied = 0 + for src in src_root.rglob('*'): + if src.is_dir() or any(part.startswith('.git') for part in src.parts): + continue + rel = src.relative_to(src_root) + if src.suffix == '.j2': + # Render Jinja2 template; strip .j2 extension in destination + dst = dst_root / rel.with_suffix('') + dst.parent.mkdir(parents=True, exist_ok=True) + env = Environment( + loader=FileSystemLoader(str(src.parent)), + undefined=StrictUndefined, + keep_trailing_newline=True, + ) + content = env.get_template(src.name).render(**template_vars) + dst.write_text(content) + rendered += 1 + else: + dst = dst_root / rel + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + copied += 1 + print(f'Template repo: rendered {rendered} .j2 file(s), copied {copied} file(s)') + PYEOF + else + echo 
'No TEMPLATE_REPO specified — skipping template rendering' + fi + + # --- Write extra config files passed in from Lambda (JSON map path -> content) --- + # Applied after template rendering; keys here override template output. + - | + python3 -c " + import json, os, pathlib + files = json.loads(os.environ.get('EXTRA_FILES', '{}')) + for path, content in files.items(): + p = pathlib.Path(path) + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + print(f'Wrote {len(files)} extra file(s)') + " + + # --- Commit and push (--allow-empty handles no-change case) --- + - git add -A + - | + git -c user.email="sc-automation@census.gov" \ + -c user.name="SC Automation" \ + commit -m "SC automation: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ + --allow-empty + - git push origin "${GIT_BRANCH}" + + # --- Assume cross-account role (if TARGET_ACCOUNT_ID is set) --- + # CodeBuild runs in csvd-dev by default. To run tf-run apply against resources + # in a different AWS account, set TARGET_ACCOUNT_ID. The role + # sc-automation-codebuild-role must exist in that account and trust the + # CodeBuild IAM role from csvd-dev. 
+ - | + if [ -n "${TARGET_ACCOUNT_ID}" ]; then + PARTITION=$(aws sts get-caller-identity --query Arn --output text | cut -d: -f2) + ROLE_ARN="arn:${PARTITION}:iam::${TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role" + echo "Assuming cross-account role: ${ROLE_ARN}" + CREDS=$(aws sts assume-role \ + --role-arn "${ROLE_ARN}" \ + --role-session-name "sc-automation-${ACCOUNT_REPO}" \ + --query Credentials \ + --output json) + export AWS_ACCESS_KEY_ID=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['AccessKeyId'])") + export AWS_SECRET_ACCESS_KEY=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['SecretAccessKey'])") + export AWS_SESSION_TOKEN=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['SessionToken'])") + echo "Successfully assumed role in account ${TARGET_ACCOUNT_ID}" + else + echo "No TARGET_ACCOUNT_ID set — running with CodeBuild role (csvd-dev)" + fi + + # --- Run Terraform in target layer/region directory --- + # tf-run auto-proceeds on non-TTY stdin (read -t timeout defaults to "y") + - cd "${LAYER}/${REGION_DIR}" + - | + if [ "${DRY_RUN}" = "true" ]; then + tf-plan -no-color + elif [ -n "${TF_RUN_START_TAG}" ]; then + TFARGS="-auto-approve" tf-run apply "tag:${TF_RUN_START_TAG}" + else + TFARGS="-auto-approve" tf-run apply + fi + + # --- Open PR (idempotent: skip if PR already exists) --- + - | + GH_HOST=github.e.it.census.gov \ + GH_TOKEN="${GITHUB_TOKEN}" \ + gh pr create \ + --title "SC automation: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ + --body "Triggered by Service Catalog provisioning of **${ACCOUNT_REPO}**." 
\ + --base main \ + --head "${GIT_BRANCH}" \ + || echo "PR already exists or create failed, continuing" + + post_build: + commands: + - echo "BUILD_RESULT=${CODEBUILD_BUILD_SUCCEEDING}" + # Emit PR_URL so Lambda can parse it from the build output + - | + PR_URL=$(GH_HOST=github.e.it.census.gov \ + GH_TOKEN="${GITHUB_TOKEN}" \ + gh pr view \ + --repo "${GITHUB_ORG}/${ACCOUNT_REPO}" \ + "${GIT_BRANCH}" \ + --json url -q .url 2>/dev/null || echo "") + echo "PR_URL=${PR_URL}" From c6ac44784cc929629342d184a9f1833f1a538d02 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Mon, 11 May 2026 17:52:47 -0400 Subject: [PATCH 08/27] docs: ADR-001 webhook auto-apply on merge to main (proposed) --- docs/decisions/001-webhook-auto-apply.md | 167 +++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 docs/decisions/001-webhook-auto-apply.md diff --git a/docs/decisions/001-webhook-auto-apply.md b/docs/decisions/001-webhook-auto-apply.md new file mode 100644 index 0000000..795833f --- /dev/null +++ b/docs/decisions/001-webhook-auto-apply.md @@ -0,0 +1,167 @@ +# ADR-001: Webhook-Triggered Auto-Apply on Merge to Main + +**Status:** Proposed +**Date:** 2026-05-11 +**Branch:** feature/template-repo-rendering + +--- + +## Context + +The current two-product model requires a human to manually provision the +`tf-run-executor` Service Catalog product after a Proposer PR is reviewed and +merged. This adds unnecessary friction to the apply step: + +1. Platform engineer reviews and merges the PR opened by the Proposer +2. Platform engineer opens Service Catalog, finds the executor product, fills in + the same parameters they already specified during the Propose step, and + clicks Launch + +Step 2 is pure operational overhead. The information needed to start the executor +build (account repo, layer, region dir, target account) is already known at merge +time and could be stored in the repo itself. 
+ +--- + +## Decision + +Add a **GitHub Enterprise webhook handler** to the Lambda that automatically +starts an executor CodeBuild build whenever a push event lands on `main` in a +watched account repo. + +Target apply configuration is stored in a `.sc-automation.yml` file committed to +the root of each account repo by the Proposer (or manually by a platform engineer). + +--- + +## Proposed Design + +### `.sc-automation.yml` — committed to the account repo root + +```yaml +# Written by the Proposer CodeBuild build or manually by a platform engineer. +# Each entry triggers one executor CodeBuild build when changes land on main. +apply_on_merge: + - layer: infrastructure + region_dir: west + target_account_id: "229685449397" + - layer: infrastructure + region_dir: east + target_account_id: "229685449397" + - layer: vpc + region_dir: west + target_account_id: "229685449397" +``` + +Fields per entry: + +| Field | Required | Description | +|---|---|---| +| `layer` | yes | `common`, `infrastructure`, or `vpc` | +| `region_dir` | yes | `east`, `west`, or `global` | +| `target_account_id` | no | 12-digit AWS account ID; omit to run in csvd-dev | +| `tf_run_start_tag` | no | tf-run TAG label to start from | +| `dry_run` | no | `true` to plan only (default: `false`) | + +### Lambda changes + +Add a `/webhook` path handler alongside the existing CFN handler in +`lambda/app.py`. + +**Invocation:** Lambda Function URL (no API Gateway needed — GHE can POST to +a Function URL directly). The URL is added to the GHE org or repo webhook +settings. 
+ +**Request flow:** + +``` +GHE push event (main branch, account repo) + → Lambda Function URL POST / + → verify HMAC-SHA256 signature (secret in SM: ghe-runner/webhook-secret) + → parse X-GitHub-Event: push + → filter: ref == refs/heads/main + → filter: repo name matches account repo pattern + → fetch .sc-automation.yml via GHE API (no clone needed) + → for each entry in apply_on_merge: + start_codebuild_build(action="apply", account_repo=..., layer=..., ...) + (fire-and-forget — do NOT block for CodeBuild completion) + → return 200 OK immediately +``` + +**Key differences from the CFN handler:** + +- **No polling.** The webhook handler starts builds and returns immediately. + Build results are visible in CodeBuild logs and CloudWatch. There is no CFN + stack to signal. +- **No CFN resource.** The executor product is still available for manual use, + but webhook-triggered runs bypass Service Catalog entirely. +- **Retry-tolerant, not deduplicated.** If GHE retries the webhook (network blip), + a duplicate build is started. This is acceptable — `tf-run apply` on an + already-applied state is a no-op. + +### Infrastructure changes + +| Resource | Change | +|---|---| +| Lambda Function URL | Add `aws_lambda_function_url` resource in `deploy/lambda.tf` | +| Lambda invoke permission | Add `aws_lambda_permission` allowing `lambda:InvokeFunctionUrl` from `*` (HMAC signature is the auth mechanism) | +| Secrets Manager | Add a `ghe-runner/webhook-secret` secret for HMAC verification | +| Lambda IAM | No change — existing `codebuild:StartBuild` permission covers webhook-triggered builds | +| GHE Webhook | Manual one-time setup: org or per-repo webhook → Function URL, content-type `application/json`, events: `push` | + +### `.sc-automation.yml` lifecycle + +- **Proposer writes it** when it first creates the branch (if the file doesn't + exist yet). The Proposer knows `layer`, `region_dir`, and `target_account_id` + from its build environment variables.
It commits `.sc-automation.yml` alongside + the rendered template files. +- **Platform engineers edit it** directly via PR if they need to add or remove + apply targets. +- **The file is idempotent** — subsequent Proposer runs `--force-with-lease` push + won't break it because the Proposer will only write the file if it doesn't + already exist (avoiding clobbering manual edits). + +--- + +## Consequences + +### Benefits + +- Eliminates the manual "provision executor product" step after PR merge +- Apply is fully traceable: GHE push event → CloudWatch Logs → CodeBuild build ID +- No new infrastructure services (no EventBridge, no SQS, no API Gateway) +- The executor SC product remains available for manual one-off runs and + day-2 operations (re-run from a specific tag, dry-run, etc.) + +### Trade-offs + +- Build results are no longer surfaced in a CloudFormation stack output — users + must check CodeBuild or CloudWatch Logs directly +- GHE webhook requires a one-time manual setup per org (or per repo for + fine-grained control) +- A merge to `main` that does not involve Terraform changes (e.g. README edit) + will still trigger executor builds. Mitigation: add a `paths` filter in + `.sc-automation.yml` (future enhancement) or rely on `tf-run apply` being a + safe no-op + +### Out of scope for this ADR + +- Result notification (Slack, email) after a webhook-triggered apply — tracked + separately +- Path filtering (only trigger on changes under `{layer}/{region_dir}/`) — + tracked separately + +--- + +## Alternatives Considered + +**CodeStar connection + CodePipeline watch:** Requires CodePipeline infrastructure +per repo, CodeStar connector host setup for GHE on-prem, and loses the per-run +environment variable flexibility that the Lambda `StartBuild` override model +provides. Rejected. + +**EventBridge + S3 source:** Would require mirroring the GHE repo to CodeCommit +or S3 to get an EventBridge trigger. Adds a sync layer with no benefit. Rejected. 
+ +**Poll-based apply (Lambda on schedule):** Adds latency and unnecessary API calls. +Rejected. From 8ca7f3047c670ab1e23084204607885686713116 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Mon, 11 May 2026 18:02:57 -0400 Subject: [PATCH 09/27] =?UTF-8?q?docs:=20ADR-001=20update=20=E2=80=94=20we?= =?UTF-8?q?bhook=20payload=20details,=20commit=20status=20writeback,=20.sc?= =?UTF-8?q?-automation.yml=20lifecycle?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/decisions/001-webhook-auto-apply.md | 77 ++++++++++++++++++------ 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/docs/decisions/001-webhook-auto-apply.md b/docs/decisions/001-webhook-auto-apply.md index 795833f..95e5ceb 100644 --- a/docs/decisions/001-webhook-auto-apply.md +++ b/docs/decisions/001-webhook-auto-apply.md @@ -72,6 +72,27 @@ Add a `/webhook` path handler alongside the existing CFN handler in a Function URL directly). The URL is added to the GHE org or repo webhook settings. 
+### Webhook payload — what GHE sends + +The GHE `push` event payload contains everything the Lambda needs to identify +the repo without any out-of-band mapping: + +```json +{ + "ref": "refs/heads/main", + "after": "abc123def456...", + "repository": { + "name": "229685449397-csvd-dev-platform-dev-gov", + "full_name": "SCT-Engineering/229685449397-csvd-dev-platform-dev-gov", + "clone_url": "https://github.e.it.census.gov/SCT-Engineering/229685449397-csvd-dev-platform-dev-gov.git" + } +} +``` + +- `repository.name` → `ACCOUNT_REPO` passed to CodeBuild +- `after` → merge commit SHA used for GHE commit status writeback +- No repo→callback URL map is needed or maintained + **Request flow:** ``` @@ -79,15 +100,29 @@ GHE push event (main branch, account repo) → Lambda Function URL POST / → verify HMAC-SHA256 signature (secret in SM: ghe-runner/webhook-secret) → parse X-GitHub-Event: push - → filter: ref == refs/heads/main - → filter: repo name matches account repo pattern - → fetch .sc-automation.yml via GHE API (no clone needed) + → filter: ref == refs/heads/main AND repository.name matches account repo pattern + → fetch .sc-automation.yml from main via GHE API (no clone — single API call) + → if .sc-automation.yml missing: post ❌ commit status "no .sc-automation.yml on main" and exit + → post ⏳ "pending" commit status on merge SHA → for each entry in apply_on_merge: - start_codebuild_build(action="apply", account_repo=..., layer=..., ...) 
- (fire-and-forget — do NOT block for CodeBuild completion) - → return 200 OK immediately + start_codebuild_build( + action="apply", + account_repo=payload["repository"]["name"], # from webhook + layer=entry["layer"], # from .sc-automation.yml + region_dir=entry["region_dir"], # from .sc-automation.yml + target_account_id=entry.get("target_account_id", ""), + commit_sha=payload["after"] # for status writeback + ) + (fire-and-forget — do NOT poll CodeBuild) + → return HTTP 200 immediately ``` +**Executor buildspec writeback:** +The executor CodeBuild build receives `COMMIT_SHA` as an env var. In its +POST_BUILD phase it calls `gh api` to post a GHE commit status (`success` or +`failure`) back to the merge commit. Teams see ✅ or ❌ directly on the commit +in the PR history — no CloudWatch required. + **Key differences from the CFN handler:** - **No polling.** The webhook handler starts builds and returns immediately. @@ -111,15 +146,19 @@ GHE push event (main branch, account repo) ### `.sc-automation.yml` lifecycle -- **Proposer writes it** when it first creates the branch (if the file doesn't - exist yet). The Proposer knows `layer`, `region_dir`, and `target_account_id` - from its build environment variables. It commits `.sc-automation.yml` alongside - the rendered template files. -- **Platform engineers edit it** directly via PR if they need to add or remove +- **Proposer writes it** on the first run for a branch, if the file doesn't + already exist on `main`. The Proposer knows `layer`, `region_dir`, and + `target_account_id` from its CodeBuild env vars. It commits `.sc-automation.yml` + alongside the rendered template files so the file is reviewed in the same PR. +- **Proposer does NOT overwrite it** on subsequent runs — it checks whether the + file already exists on `main` and skips writing if so, preserving any manual + edits made by platform engineers. +- **Platform engineers edit it** directly via PR to add, remove, or reorder apply targets. 
-- **The file is idempotent** — subsequent Proposer runs `--force-with-lease` push - won't break it because the Proposer will only write the file if it doesn't - already exist (avoiding clobbering manual edits). +- **GHE commit status missing → blocked** — if `.sc-automation.yml` is not + present on `main` when a push webhook fires, the Lambda posts a `failure` + commit status and does not start any builds. This surfaces the problem + immediately without a silent no-op. --- @@ -129,7 +168,9 @@ GHE push event (main branch, account repo) - Eliminates the manual "provision executor product" step after PR merge - Apply is fully traceable: GHE push event → CloudWatch Logs → CodeBuild build ID +- GHE commit status writeback gives teams ✅/❌ feedback directly on the merge commit - No new infrastructure services (no EventBridge, no SQS, no API Gateway) +- No repo→callback URL map to maintain — repo identity comes from the webhook payload - The executor SC product remains available for manual one-off runs and day-2 operations (re-run from a specific tag, dry-run, etc.) 
@@ -146,10 +187,10 @@ GHE push event (main branch, account repo) ### Out of scope for this ADR -- Result notification (Slack, email) after a webhook-triggered apply — tracked - separately -- Path filtering (only trigger on changes under `{layer}/{region_dir}/`) — - tracked separately +- SNS / Slack / email notification after a webhook-triggered apply — tracked separately +- Path filtering (only trigger on changes under `{layer}/{region_dir}/`) — tracked separately +- Idempotency guard against GHE webhook retries firing duplicate builds — `tf-run apply` + on an already-converged state is a safe no-op, so this is deferred --- From b67df30b148ddcbe8bd7e0c60250959e131dcb2b Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Mon, 11 May 2026 18:35:02 -0400 Subject: [PATCH 10/27] docs: Update ADR-001 to clarify webhook-triggered auto-apply process --- docs/decisions/001-webhook-auto-apply.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/decisions/001-webhook-auto-apply.md b/docs/decisions/001-webhook-auto-apply.md index 95e5ceb..369a01e 100644 --- a/docs/decisions/001-webhook-auto-apply.md +++ b/docs/decisions/001-webhook-auto-apply.md @@ -1,5 +1,14 @@ + # ADR-001: Webhook-Triggered Auto-Apply on Merge to Main +## In Plain Language + +This document explains a new way to make our automation system easier and faster. Right now, after someone reviews and merges a pull request (PR) in GitHub, a person has to go into AWS Service Catalog and start the next step by hand. This is slow and can lead to mistakes or delays. + +We want to change this so that when a PR is merged to the main branch, our system will automatically start the next step without anyone having to do it manually. We will do this by using a GitHub webhook, which is a tool that tells our system when something important happens in the repo. When the webhook sees a new change on main, it will trigger our automation to run the apply step right away. 
+ +This paper describes how this automatic process will work, what files and settings are needed, and what changes we have to make to our system. The goal is to make things smoother, faster, and less error-prone for everyone who uses our platform. + **Status:** Proposed **Date:** 2026-05-11 **Branch:** feature/template-repo-rendering From 7a537eb74a37dc62fc53b6e28c3b8bab56e2bba0 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Tue, 19 May 2026 16:38:08 -0400 Subject: [PATCH 11/27] docs: generalized architecture, webhook auto-apply ADR, Vault ADR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - docs/README.md: high-level index with reading paths by use case - docs/HOW-IT-WORKS.md: reframe from two-product to single Proposer + webhook auto-apply; remove executor SC product framing - docs/decisions/001-webhook-auto-apply.md: status Proposed → Accepted; update context and consequences to reflect removal of executor SC product - docs/decisions/002-vault-aws-secrets-engine.md: new ADR for Vault AWS Secrets Engine; dynamic cross-account credentials; per-product IAM scope via Proposer terraform apply; account baseline prerequisite pattern - docs/generalized-terraform-product-architecture.md: new - docs/template-management.md: Executor flow, .sc-automation.yml schema - docs/repo-vars-and-secrets.md: CodeBuild environmentVariablesOverride pattern - docs/workflow-flowcharts.md: Mermaid diagrams for propose/apply flows - docs/fleet-governance-at-scale.md: new - docs/service-catalog-census-integration.md: new - docs/cross-account-visibility.md: new --- docs/HOW-IT-WORKS.md | 136 +++--- docs/README.md | 145 +++++++ docs/cross-account-visibility.md | 353 +++++++++++++++ docs/decisions/001-webhook-auto-apply.md | 38 +- .../decisions/002-vault-aws-secrets-engine.md | 345 +++++++++++++++ docs/fleet-governance-at-scale.md | 403 ++++++++++++++++++ ...eralized-terraform-product-architecture.md | 246 +++++++++++ docs/repo-vars-and-secrets.md | 
250 +++++++++++ docs/service-catalog-census-integration.md | 312 ++++++++++++++ docs/template-management.md | 271 ++++++++++++ docs/workflow-flowcharts.md | 135 ++++++ 11 files changed, 2560 insertions(+), 74 deletions(-) create mode 100644 docs/README.md create mode 100644 docs/cross-account-visibility.md create mode 100644 docs/decisions/002-vault-aws-secrets-engine.md create mode 100644 docs/fleet-governance-at-scale.md create mode 100644 docs/generalized-terraform-product-architecture.md create mode 100644 docs/repo-vars-and-secrets.md create mode 100644 docs/service-catalog-census-integration.md create mode 100644 docs/template-management.md create mode 100644 docs/workflow-flowcharts.md diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md index fe6d117..4be7d99 100644 --- a/docs/HOW-IT-WORKS.md +++ b/docs/HOW-IT-WORKS.md @@ -6,29 +6,34 @@ to a Terraform plan or apply running inside an AWS account repository. --- -## Design Overview: Two-Product Model +## Design Overview: Proposer Product + Webhook Auto-Apply -The system is split into **two distinct Service Catalog products** with a human -review gate between them: +The system uses a **single user-facing Service Catalog product** with a human +review gate before Terraform runs any infrastructure changes: -| Product | CodeBuild Project | What It Does | -|---------|------------------|--------------| -| `tf-run-proposer` | `tf-run-proposer` | Clone repo → render templates → commit → open PR | -| `tf-run-executor` | `tf-run-executor` | Clone `main` → assume role → run `tf-run apply` | +| Component | CodeBuild Project | What It Does | +|-----------|------------------|--------------| +| SC Product: `tf-run-proposer` | `tf-run-proposer` | Clone repo → render templates → commit → open PR | +| Webhook (automatic) | `tf-run-executor` | Clone `main` → assume role → run `tf-run apply` | -**Why two products?** +**Why not two SC products?** -An earlier single-product design ran `tf-run apply` first and then opened a PR 
-as a trailing artifact. This made the PR meaningless as a review gate — Terraform -had already changed real infrastructure before anyone saw the diff. +An earlier design exposed the executor as a second Service Catalog product, +requiring a human to return to the SC console after merging the PR, re-enter the +same parameters, and click Launch. This is pure operational overhead — the review +already happened at PR merge time, and the parameters needed to run the apply are +already recorded in `.sc-automation.yml` in the repo. -The two-product model restores the PR as a genuine gate: +The current design keeps the PR as a genuine gate with no extra manual steps: -1. A team provisions the **Proposer** → changes are committed to a branch and a PR - is opened. No infrastructure is touched. CFN stack completes quickly (< 60s). +1. A team provisions the **Proposer** product → changes are committed to a branch + and a PR is opened. No infrastructure is touched. 2. A human reviews the diff, approves, and merges the PR. -3. The team provisions the **Executor** → CodeBuild checks out `main` (post-merge), - assumes the target account role, and runs `tf-run apply`. +3. The GHE push-to-main webhook fires automatically → Lambda reads + `.sc-automation.yml` → starts `tf-run-executor` CodeBuild. No SC product, + no CFN stack, no user action required. + +See [ADR-001](decisions/001-webhook-auto-apply.md) for the full decision record.
--- @@ -58,14 +63,13 @@ The two-product model restores the PR as a genuine gate: ↕ Human reviews PR, approves, merges ↕ ┌─────────────────────────────────────────────────────────────────────┐ -│ APPLY FLOW │ +│ AUTO-APPLY (webhook — no user action required) │ │ │ -│ User fills SC form → CFN Custom Resource │ -│ └─> Lambda (tf-run-executor-trigger) │ -│ • Validates inputs (action=apply) │ -│ • Starts tf-run-executor CodeBuild build │ -│ • Polls CodeBuild until completion │ -│ • Returns apply status + repo URL to CFN │ +│ GHE push to main → Lambda Function URL (HMAC verified) │ +│ └─> Lambda (tf-run-webhook-handler) │ +│ • Reads .sc-automation.yml from default branch │ +│ • Starts tf-run-executor CodeBuild (fire-and-forget) │ +│ • Posts pending commit status to GHE │ │ └─> CodeBuild: tf-run-executor │ │ • Installs: Terraform binary (from S3), tf-run │ │ toolchain, Census CA cert, gh CLI, Python deps │ @@ -73,7 +77,7 @@ The two-product model restores the PR as a genuine gate: │ • Optionally assumes cross-account IAM role │ │ • cd {LAYER}/{REGION_DIR} │ │ • tf-run apply (respects TF_RUN_START_TAG) │ -│ • POST_BUILD emits BUILD_RESULT= │ +│ • POST_BUILD writes commit status ✅/❌ to GHE │ └─────────────────────────────────────────────────────────────────────┘ ``` @@ -88,9 +92,9 @@ The two-product model restores the PR as a genuine gate: | CodeBuild (executor) | `tf-run-executor` | csvd-dev | | SC Portfolio | `{prefix}-tf-run` | csvd-dev | | SC Product (propose) | `{prefix}-tf-run-proposer` | csvd-dev | -| SC Product (apply) | `{prefix}-tf-run-executor` | csvd-dev | | CFN Template (propose) | `service-catalog/proposer-template.yaml` | S3 artifacts bucket | -| CFN Template (apply) | `service-catalog/executor-template.yaml` | S3 artifacts bucket | +| Lambda Function URL | `tf-run-webhook-handler` HTTPS endpoint | csvd-dev | +| GHE Webhook | Org-level push webhook → Lambda Function URL | GHE (manual one-time setup) | | Launch Role | `{prefix}-sc-launch-role` | csvd-dev | | 
GHE PAT | `ghe-runner/github-token` in Secrets Manager | csvd-dev | | Cross-account role | `sc-automation-codebuild-role` | **Target** account | @@ -166,38 +170,37 @@ The CFN stack completes and the output panel shows the PR URL. --- -## Step-by-Step: Apply Flow +## Auto-Apply on Merge (Webhook) ### 1. Prerequisites - The Proposer has run and its PR has been **reviewed and merged** to `main` +- `.sc-automation.yml` was committed by the Proposer alongside the rendered files - The target account has the `sc-automation-codebuild-role` IAM role with a trust policy allowing assume-role from the CodeBuild execution role in csvd-dev +- The GHE org webhook is configured once: push events → Lambda Function URL -### 2. User fills the SC form - -The user opens the **tf-run-executor** product and provides: - -- **AccountRepo** — same repo name as the Proposer -- **Layer** and **RegionDir** — same as the Proposer -- **TargetAccountId** _(optional)_ — if set, CodeBuild assumes the cross-account role -- **TfRunStartTag** _(optional)_ — start tf-run from a specific `TAG` step -- **DryRun** — `true` for plan-only, `false` to apply - -### 3. CloudFormation invokes the Lambda +### 2. GHE fires the push webhook -CFN creates a `Custom::TerraformApply` resource with `action: apply`. +On merge to `main`, GHE sends a `push` event to the Lambda Function URL with +an HMAC-SHA256 signature (`X-Hub-Signature-256` header). The Lambda verifies +the signature against the `ghe-runner/webhook-secret` Secrets Manager secret. -### 4. Lambda validates and starts CodeBuild +### 3. Lambda reads `.sc-automation.yml` and starts CodeBuild -Lambda starts `tf-run-executor` with: +The Lambda (webhook handler mode): +1. Fetches `.sc-automation.yml` from the default branch of the pushed repo +2. Extracts `account_repo`, `layer`, `region_dir`, `target_account_id`, + `dry_run`, and optional `tf_run_start_tag` +3. 
Calls `codebuild:StartBuild` on `tf-run-executor` with override env vars: + ``` + ACCOUNT_REPO, LAYER, REGION_DIR, + TARGET_ACCOUNT_ID, TF_RUN_START_TAG, DRY_RUN, GITHUB_TOKEN + ``` +4. Posts a `pending` commit status to the merge commit on GHE +5. Returns HTTP 200 immediately — the webhook call is fire-and-forget -``` -ACCOUNT_REPO, LAYER, REGION_DIR, -TARGET_ACCOUNT_ID, TF_RUN_START_TAG, DRY_RUN, GITHUB_TOKEN -``` - -### 5. CodeBuild - INSTALL phase +### 4. CodeBuild - INSTALL phase - Clones `github.e.it.census.gov/terraform/support` for version governance - Downloads Terraform binary from S3 (version governed by `VERSION_TF`) @@ -206,22 +209,39 @@ TARGET_ACCOUNT_ID, TF_RUN_START_TAG, DRY_RUN, GITHUB_TOKEN - Downloads and installs `gh` CLI - `pip3 install python-dateutil pyyaml` -### 6. CodeBuild - BUILD phase +### 5. CodeBuild - BUILD phase 1. Rewrite git remotes; `git clone` account repo; `git checkout main` 2. If `TARGET_ACCOUNT_ID` is set: `aws sts assume-role` → - `arn:aws:iam::{TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role` + `arn:${AWS::Partition}:iam::{TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role` and export the temporary credentials 3. `cd ${LAYER}/${REGION_DIR}` -4. If `DRY_RUN=true`: `tf-run plan`; else: `tf-run apply` (with optional `--start-tag ${TF_RUN_START_TAG}`) +4. If `DRY_RUN=true`: `tf-run plan`; else: `tf-run apply` + (with optional `--start-tag ${TF_RUN_START_TAG}`) -### 7. Lambda polls and returns +### 6. CodeBuild - POST_BUILD phase -On `SUCCEEDED`: -- Sends CFN `SUCCESS` with: - - `ApplyStatus: SUCCEEDED` - - `RepositoryUrl` / `repository_url` - - `CodeBuildBuildId` +Writes a `success` or `failure` commit status to GHE on the merge commit, +linking to the CodeBuild log. Platform engineers see ✅/❌ on the commit +without checking CloudWatch directly. 
+ +### Manual One-Off Runs + +For re-apply, dry-run, or partial runs (start from a TAG), trigger the executor +build directly: + +```bash +export AWS_DEFAULT_REGION=us-gov-west-1 +aws codebuild start-build \ + --project-name tf-run-executor \ + --environment-variables-override \ + name=ACCOUNT_REPO,value=229685449397-csvd-dev-platform-dev-gov,type=PLAINTEXT \ + name=LAYER,value=infrastructure,type=PLAINTEXT \ + name=REGION_DIR,value=west,type=PLAINTEXT \ + name=DRY_RUN,value=true,type=PLAINTEXT +``` + +No Service Catalog product is needed. --- @@ -310,6 +330,6 @@ mishandles acronyms (`AWSAccountId` → `a_w_s_account_id`). | `deploy/codebuild.tf` | Terraform: `aws_codebuild_project.tf_run_proposer` + `tf_run_executor` | | `deploy/lambda.tf` | Terraform: Lambda function with `PROPOSER_PROJECT_NAME` + `EXECUTOR_PROJECT_NAME` | | `deploy/iam.tf` | Terraform: IAM roles for Lambda, CodeBuild (with `sts:AssumeRole`), SC launch | -| `deploy/service_catalog.tf` | Terraform: Portfolio, two products, two launch constraints | +| `deploy/service_catalog.tf` | Terraform: Portfolio, single Proposer product, launch constraint | +| `deploy/webhook.tf` | Terraform: Lambda Function URL, HMAC secret, GHE webhook IAM | | `service-catalog/proposer-template.yaml` | CFN template for the Propose product | -| `service-catalog/executor-template.yaml` | CFN template for the Apply product | diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..9052243 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,145 @@ +# sc-lambda-ghactions Documentation + +This directory contains the design, operating model, and rollout guidance for +`sc-lambda-ghactions` — the centralized Lambda + CodeBuild system that provisions +and manages Terraform-backed account repo changes through AWS Service Catalog. + +## What This System Does + +At a high level, the platform supports this workflow: + +1. A user launches a Service Catalog product +2. 
CloudFormation invokes a centralized Lambda in `csvd-dev` +3. The Lambda validates inputs and starts a CodeBuild build +4. CodeBuild clones a template repo, renders Terraform/HCL/YAML content, and opens a PR +5. After merge, the executor path can run Terraform against the target workload +6. CSVD can also operate the full managed fleet centrally + +## How to Read This Documentation + +This doc set currently contains both: + +- **Current or near-term implementation guidance** for the CodeBuild-based rollout +- **Proposed design evolution** for auto-apply, generalized product types, and fleet-scale operations + +Because of that, the best entry point depends on what you need. + +## Recommended Reading Paths + +### 1. "I need the quickest overview" + +Start with: + +- [HOW-IT-WORKS.md](HOW-IT-WORKS.md) — end-to-end explanation of the proposer/executor model, the main infrastructure components, and the current CodeBuild execution flow +- [workflow-flowcharts.md](workflow-flowcharts.md) — visual walkthrough of provisioning, apply-on-merge, and fleet update flows + +### 2. "I need to understand the target generalized architecture" + +Start with: + +- [generalized-terraform-product-architecture.md](generalized-terraform-product-architecture.md) — explains how the system expands from EKS-only into a reusable pattern for any Terraform workload +- [template-management.md](template-management.md) — explains how template repos, Jinja2 rendering, `.sc-automation.yml`, and repo injection work +- [repo-vars-and-secrets.md](repo-vars-and-secrets.md) — explains how SSM and Secrets Manager values are injected into CodeBuild builds + +### 3. 
"I need to onboard a new Service Catalog product" + +Read in this order: + +- [generalized-terraform-product-architecture.md](generalized-terraform-product-architecture.md) — required moving parts for a new `product_type` +- [template-management.md](template-management.md) — template repo structure and rendering expectations +- [service-catalog-census-integration.md](service-catalog-census-integration.md) — how to register the product in `terraform-service-catalog-census` +- [repo-vars-and-secrets.md](repo-vars-and-secrets.md) — how product-scoped configuration and secrets reach the build + +### 4. "I need to understand operations and governance at scale" + +Start with: + +- [fleet-governance-at-scale.md](fleet-governance-at-scale.md) — the `terraform-sc-fleet` operating model, workload inventory structure, maintenance windows, and governance controls +- [cross-account-visibility.md](cross-account-visibility.md) — hub-and-spoke IAM model and options for centralized visibility across accounts +- [workflow-flowcharts.md](workflow-flowcharts.md) — visual summary of fleet-wide operations + +### 5. 
"I need to understand the webhook auto-apply proposal" + +Read: + +- [decisions/001-webhook-auto-apply.md](decisions/001-webhook-auto-apply.md) — ADR for triggering executor builds automatically from GitHub Enterprise webhook events +- [workflow-flowcharts.md](workflow-flowcharts.md) — flow-level view of the apply-on-merge path +- [template-management.md](template-management.md) — `.sc-automation.yml` schema and executor behavior + +## Document Guide + +### Core system overview + +- [HOW-IT-WORKS.md](HOW-IT-WORKS.md) + - Best for understanding the end-to-end proposer/executor model + - Covers the centralized Lambda, CodeBuild projects, SC products, and step-by-step runtime behavior + - Use this as the main operational baseline + +- [workflow-flowcharts.md](workflow-flowcharts.md) + - Best for stakeholder demos and quick architectural orientation + - Includes flows for provisioning, apply-on-merge, and fleet-wide updates + +### Generalization and product onboarding + +- [generalized-terraform-product-architecture.md](generalized-terraform-product-architecture.md) + - Explains how the platform generalizes to any Terraform workload + - Defines the core onboarding units: template repo, Jinja2 templates, Pydantic model, CFN product template, census registration + +- [template-management.md](template-management.md) + - Canonical guide for template repo usage + - Covers full-repo vs subdirectory templates, Jinja2 rendering, `.sc-automation.yml`, proposer behavior, and executor re-rendering into existing account repos + +- [repo-vars-and-secrets.md](repo-vars-and-secrets.md) + - Canonical guide for runtime config injection + - Covers AWS Parameter Store layout, Secrets Manager layout, Lambda IAM, and CodeBuild `environmentVariablesOverride` + +- [service-catalog-census-integration.md](service-catalog-census-integration.md) + - Canonical guide for enterprise product registration + - Covers central vs StackSet vs census-managed resources, launch roles, portfolio/product 
YAML, and rollout into `terraform-service-catalog-census` + +### Operations, governance, and visibility + +- [fleet-governance-at-scale.md](fleet-governance-at-scale.md) + - Defines the `terraform-sc-fleet` model for operating many workloads across many repos + - Covers workload entry files, account repo layout, update scripts, maintenance windows, CODEOWNERS, and branch protection + +- [cross-account-visibility.md](cross-account-visibility.md) + - Covers read-only access patterns for viewing managed resources across accounts + - Describes the hub-and-spoke IAM role chain and Resource Explorer-first UI approach + +### Architecture decisions + +- [decisions/001-webhook-auto-apply.md](decisions/001-webhook-auto-apply.md) + - ADR for the proposed webhook-triggered executor path + - Useful for understanding why the manual post-merge step should disappear and how `.sc-automation.yml` participates in the design + +## Suggested Canonical Interpretation + +Where multiple docs overlap, use this interpretation: + +- [HOW-IT-WORKS.md](HOW-IT-WORKS.md) is the best **runtime/system overview** +- [template-management.md](template-management.md) is the best **template repo and account repo injection** reference +- [repo-vars-and-secrets.md](repo-vars-and-secrets.md) is the best **config/secrets injection** reference +- [service-catalog-census-integration.md](service-catalog-census-integration.md) is the best **enterprise rollout** reference +- [fleet-governance-at-scale.md](fleet-governance-at-scale.md) is the best **day-2 fleet operations** reference +- [decisions/001-webhook-auto-apply.md](decisions/001-webhook-auto-apply.md) is the best **design rationale** for auto-apply on merge + +## Current Gaps and Notes + +This doc set is now broad enough to explain: + +- how template repos are leveraged +- how rendered content is injected into new and existing account repos +- how CodeBuild receives configuration and secrets +- how new products are registered in Census +- how CSVD 
governs and operates the resulting fleet + +A few documents are still explicitly marked **Proposed** or **Draft**, so treat them as design intent unless and until the code and deployment match them. + +## If You Only Read Three Docs + +Read these first: + +1. [HOW-IT-WORKS.md](HOW-IT-WORKS.md) +2. [template-management.md](template-management.md) +3. [service-catalog-census-integration.md](service-catalog-census-integration.md) diff --git a/docs/cross-account-visibility.md b/docs/cross-account-visibility.md new file mode 100644 index 0000000..91cf421 --- /dev/null +++ b/docs/cross-account-visibility.md @@ -0,0 +1,353 @@ +# Cross-Account Fleet Visibility — Credentials and Console UI + +**Date:** 2026-05-19 +**Status:** Proposed +**Scope:** Read-only visibility across all accounts managed by sc-lambda-ghactions + +--- + +## Problem + +The `terraform-sc-fleet` manifest and `update_fleet.py` give CSVD a single operational +view of all managed workloads at the Terraform / GHE layer. But engineers also need to +locate and inspect those resources in the **AWS console** — CloudFormation stacks, +Service Catalog provisioned products, Lambda functions, S3 buckets, EKS clusters — +across all accounts simultaneously, without switching console sessions or holding +long-lived credentials for each account. + +--- + +## Credential Model — Hub-and-Spoke IAM Role Chain + +The UI server and any tooling that reads across accounts **never holds long-lived +credentials**. It uses `sts:AssumeRole` to obtain temporary credentials scoped to +each target account on demand. + +``` +csvd-dev (229685449397) — hub + └─> sc-fleet-ui-server role (instance profile / ECS task role) + └─> sts:AssumeRole ─────────────────────────────────────────────┐ + ▼ + Any spoke account + └─> sc-fleet-readonly role + └─> ReadOnlyAccess (AWS managed policy) +``` + +Temporary credentials are cached for up to 1 hour (the STS session duration). +Rotation is automatic. 
No keys are stored in environment variables, SSM, or Secrets Manager. + +--- + +## Infrastructure + +### 1. Spoke role — deployed to every target account via StackSet + +One role per account, deployed automatically via the existing +`CensusServiceCatalog-RoleAndAction` StackSet alongside the SC launch roles. + +**CFN role template** (`templates/role-templates/sc-fleet-readonly-role.yaml`): + +```yaml +Type: AWS::IAM::Role +Properties: + RoleName: sc-fleet-readonly + AssumeRolePolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: Allow + Principal: + AWS: !Sub "arn:${AWS::Partition}:iam::${HubAccountId}:role/sc-fleet-ui-server" + Action: sts:AssumeRole + Condition: + StringEquals: + "sts:ExternalId": !Ref ExternalId # optional but recommended + ManagedPolicyArns: + - !Sub "arn:${AWS::Partition}:iam::aws:policy/ReadOnlyAccess" + Tags: + - Key: managed-by + Value: sc-lambda-ghactions +``` + +**`roles.yaml.tftpl` entry** (census repo): + +```yaml +- template: sc-fleet-readonly-role.yaml + parameters: + - parameter: HubAccountId + value: "229685449397" + - parameter: ExternalId + value: "sc-fleet-ui" +``` + +This propagates to all OU-shared accounts automatically. New accounts joining the OU +receive the role via `auto_deployment { enabled = true }`. + +### 2. Hub role — deployed in csvd-dev + +Lives in `sc-lambda-ghactions/deploy/iam.tf`. This is the role assumed by the UI server +(ECS task, Lambda, or EC2 instance profile). 
+ +```hcl +resource "aws_iam_role" "sc_fleet_ui_server" { + name = "sc-fleet-ui-server" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { Service = "ecs-tasks.amazonaws.com" } + Action = "sts:AssumeRole" + }] + }) + + tags = { + managed-by = "sc-lambda-ghactions" + } +} + +resource "aws_iam_role_policy" "assume_spoke_roles" { + name = "assume-sc-fleet-readonly" + role = aws_iam_role.sc_fleet_ui_server.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = "sts:AssumeRole" + Resource = "arn:${data.aws_partition.current.partition}:iam::*:role/sc-fleet-readonly" + # Restrict to org accounts only + Condition = { + StringEquals = { + "aws:ResourceOrgID" = var.org_id + } + } + }] + }) +} +``` + +### 3. Python helper — per-account session factory + +Used by the fleet dashboard, `update_fleet.py`, and any other tooling that needs +cross-account AWS API access: + +```python +# scripts/aws_session.py +import boto3 +from functools import lru_cache + +READONLY_ROLE = "sc-fleet-readonly" +PARTITION = "aws-us-gov" +REGION = "us-gov-west-1" + +@lru_cache(maxsize=64) +def session_for(account_id: str) -> boto3.Session: + """Return a boto3 Session scoped to account_id via sts:AssumeRole. + Credentials are cached for the lifetime of the process. + For long-running processes, evict the cache before the 1-hour STS expiry. 
+ """ + sts = boto3.client("sts", region_name=REGION) + assumed = sts.assume_role( + RoleArn=f"arn:{PARTITION}:iam::{account_id}:role/{READONLY_ROLE}", + RoleSessionName="sc-fleet-ui", + ExternalId="sc-fleet-ui", + DurationSeconds=3600, + ) + creds = assumed["Credentials"] + return boto3.Session( + aws_access_key_id=creds["AccessKeyId"], + aws_secret_access_key=creds["SecretAccessKey"], + aws_session_token=creds["SessionToken"], + region_name=REGION, + ) + +def sc_client(account_id: str): + return session_for(account_id).client("servicecatalog") + +def cfn_client(account_id: str): + return session_for(account_id).client("cloudformation") +``` + +--- + +## Centralized UI Options + +Three options in order of implementation cost: + +### Option A — AWS Resource Explorer (recommended first step) + +Resource Explorer with a **multi-account aggregator index** provides a single search +across all accounts with built-in console deep-links. No custom UI to build or maintain. + +#### Setup + +Enable Resource Explorer org-wide with an aggregator in the management (or delegated +admin) account: + +```hcl +# In the management/delegated-admin account +resource "aws_resourceexplorer2_index" "aggregator" { + type = "AGGREGATOR" +} + +resource "aws_resourceexplorer2_view" "sc_fleet" { + name = "sc-fleet" + default_view = true + + filters { + filter_string = "tag:managed-by=sc-lambda-ghactions" + } +} +``` + +Each member account needs a local index (can be enabled via AWS Organizations policy +or Terraform deployed via StackSet): + +```hcl +resource "aws_resourceexplorer2_index" "local" { + type = "LOCAL" +} +``` + +#### Tagging convention + +Every resource provisioned through sc-lambda-ghactions must carry these tags so +Resource Explorer can surface them: + +| Tag key | Example value | Purpose | +|---------|--------------|---------| +| `managed-by` | `sc-lambda-ghactions` | Scope the aggregator view | +| `product-type` | `eks_cluster` | Filter by workload type | +| `workload-name` 
| `csvd-dev-mcm` | Find a specific workload | +| `team` | `csvd` | Filter by owning team | +| `lifecycle` | `dev` | Filter by environment tier | +| `account-repo` | `229685449397-csvd-dev-gov_apps-adsd-eks` | Trace back to GHE repo | + +The Proposer CodeBuild buildspec applies these tags when rendering HCL files that +create tagged resources. For resources that don't support tags (e.g. some IAM), the CFN +stack itself is tagged and the stack's console link is sufficient. + +#### Example Resource Explorer queries + +``` +# All sc-lambda-ghactions resources +tag:managed-by=sc-lambda-ghactions + +# All EKS provisioned products +tag:managed-by=sc-lambda-ghactions tag:product-type=eks_cluster + +# Specific workload across all resource types +tag:workload-name=csvd-dev-mcm + +# CloudFormation stacks managed by the system +# (Resource Explorer cannot filter by stack status; check for failures in the CloudFormation console) +resourcetype:cloudformation:stack tag:managed-by=sc-lambda-ghactions +``` + +Results include a direct "Open in console" link to each resource in its native account. + +--- + +### Option B — Custom Fleet Dashboard + +A lightweight read-only web app for when Resource Explorer is insufficient — e.g. you need +to show fleet diff state (pending PRs, last apply status, maintenance windows) alongside +AWS resource state. 
+ +#### Architecture + +``` +csvd-dev + └─> ECS Fargate task (or Lambda + Function URL) + ├─> Assumes sc-fleet-ui-server hub role + ├─> Reads terraform-sc-fleet workloads/** (GHE API) + ├─> Calls sts:AssumeRole per account → reads SC/CFN/resource state + └─> Renders HTML dashboard with console deep-links +``` + +#### Console deep-link construction + +Direct links into the GovCloud console for each resource type: + +```python +BASE = "https://console.amazonaws-us-gov.com" + +def cfn_stack_link(region: str, stack_name: str) -> str: + return f"{BASE}/cloudformation/home?region={region}#/stacks?filteringText={stack_name}" + +def sc_product_link(region: str, product_id: str) -> str: + return f"{BASE}/servicecatalog/home?region={region}#/provisioned-products/{product_id}" + +def lambda_link(region: str, function_name: str) -> str: + return f"{BASE}/lambda/home?region={region}#/functions/{function_name}" + +def eks_link(region: str, cluster_name: str) -> str: + return f"{BASE}/eks/home?region={region}#/clusters/{cluster_name}" +``` + +#### Fleet status aggregation + +```python +from scripts.aws_session import REGION, sc_client, cfn_client + +def fleet_status(accounts: list[str]) -> list[dict]: + """Return provisioned product status across all accounts.""" + results = [] + for account_id in accounts: + sc = sc_client(account_id) + products = sc.search_provisioned_products( + Filters={"SearchQuery": ["tag:managed-by:sc-lambda-ghactions"]} + )["ProvisionedProducts"] + for p in products: + # Tags is returned as a list of {"Key": ..., "Value": ...} dicts + tags = {t["Key"]: t["Value"] for t in p.get("Tags", [])} + results.append({ + "account_id": account_id, + "product_name": p["Name"], + "product_type": tags.get("product-type"), + "status": p["Status"], + "status_message": p.get("StatusMessage"), + "console_link": sc_product_link(REGION, p["Id"]), + }) + return results +``` + +--- + +### Option C — AWS Systems Manager Explorer + +SSM Fleet Manager and Explorer aggregate resource data, OpsItems, and compliance across +accounts out of the box — zero custom code, built-in 
console UI. Less flexible than +Options A/B but worth evaluating before building anything custom. + +Enable via AWS Organizations in the SSM console of the management account. No Terraform +changes needed beyond ensuring SSM is activated in all member accounts (already required +for StackSet operations). + +--- + +## Recommended Rollout + +| Phase | Work | Outcome | +|-------|------|---------| +| **1** | Add tags to all sc-lambda-ghactions provisioned resources (Proposer CodeBuild templates) | Every resource carries `managed-by`, `product-type`, `workload-name`, `team`, `lifecycle`, `account-repo` | +| **2** | Deploy `sc-fleet-readonly` spoke role via StackSet entry in census repo | CSVD hub can assume into any org account with one `sts:AssumeRole` call | +| **3** | Enable Resource Explorer aggregator index via management account | Single console search across all accounts with deep-links; zero custom UI | +| **4** | Add `aws_session.py` session factory to `terraform-sc-fleet/scripts/` | `update_fleet.py` and any future tooling can query any account with one helper call | +| **5** | *(optional)* Build fleet dashboard if Resource Explorer + GHE PR state is insufficient | Custom ECS task with per-account SC/CFN reads + console deep-link generation | + +Phases 1–3 are the minimum viable set. Phase 4 is a development convenience. Phase 5 +is only needed if the built-in console tools don't cover the operational queries CSVD +actually needs to make. 
+ +--- + +## Security Notes + +- The `sc-fleet-readonly` spoke role grants `ReadOnlyAccess` — it cannot create, modify, + or delete any resource in any spoke account +- The `ExternalId` condition on `sts:AssumeRole` prevents confused-deputy attacks — only + callers that know the external ID can assume the role +- The hub role `sc-fleet-ui-server` is scoped to `sts:AssumeRole` on `*/sc-fleet-readonly` + only — it cannot assume any other role in spoke accounts +- The org condition (`aws:ResourceOrgID`) on the hub policy prevents the server from + assuming the role name in accounts outside the Census org +- No long-lived credentials are stored anywhere; STS temporary credentials expire + automatically after at most 1 hour diff --git a/docs/decisions/001-webhook-auto-apply.md b/docs/decisions/001-webhook-auto-apply.md index 369a01e..3f86962 100644 --- a/docs/decisions/001-webhook-auto-apply.md +++ b/docs/decisions/001-webhook-auto-apply.md @@ -9,26 +9,30 @@ We want to change this so that when a PR is merged to the main branch, our syste This paper describes how this automatic process will work, what files and settings are needed, and what changes we have to make to our system. The goal is to make things smoother, faster, and less error-prone for everyone who uses our platform. -**Status:** Proposed +**Status:** Accepted **Date:** 2026-05-11 -**Branch:** feature/template-repo-rendering +**Supersedes:** the two-product model (proposer SC product + executor SC product) --- ## Context -The current two-product model requires a human to manually provision the -`tf-run-executor` Service Catalog product after a Proposer PR is reviewed and -merged. This adds unnecessary friction to the apply step: +An earlier design split the workflow into two Service Catalog products — a +**Proposer** product to render templates and open a PR, and a separate +**Executor** product to run `tf-run apply` after the PR was merged. 
While the +Proposer SC product is a natural fit for self-service provisioning (users fill +a form, get a PR URL back), the Executor SC product is not: it requires a +platform engineer to return to Service Catalog, find the product, re-enter the +same parameters already specified at propose time, and click Launch. -1. Platform engineer reviews and merges the PR opened by the Proposer -2. Platform engineer opens Service Catalog, finds the executor product, fills in - the same parameters they already specified during the Propose step, and - clicks Launch +This step is pure operational overhead with no review value — the review already +happened when the PR was merged to `main`. The information needed to start the +executor build (account repo, layer, region dir, target account) is already +recorded in `.sc-automation.yml` in the repo itself. -Step 2 is pure operational overhead. The information needed to start the executor -build (account repo, layer, region dir, target account) is already known at merge -time and could be stored in the repo itself. +**The Executor SC product is removed.** Apply is triggered automatically by a +GHE webhook on merge to `main`. The only user-facing Service Catalog product +remains the Proposer. --- @@ -137,8 +141,10 @@ in the PR history — no CloudWatch required. - **No polling.** The webhook handler starts builds and returns immediately. Build results are visible in CodeBuild logs and CloudWatch. There is no CFN stack to signal. -- **No CFN resource.** The executor product is still available for manual use, - but webhook-triggered runs bypass Service Catalog entirely. +- **No CFN resource.** Webhook-triggered executor runs bypass Service Catalog + entirely. For manual one-off runs (re-apply from a TAG, dry-run), the executor + build can be started directly via the CodeBuild console or AWS CLI — no SC + product is needed or maintained. - **Idempotent.** If GHE retries the webhook (network blip), a duplicate build is started. 
This is acceptable — `tf-run apply` on an already-applied state is a no-op. @@ -180,8 +186,8 @@ in the PR history — no CloudWatch required. - GHE commit status writeback gives teams ✅/❌ feedback directly on the merge commit - No new infrastructure services (no EventBridge, no SQS, no API Gateway) - No repo→callback URL map to maintain — repo identity comes from the webhook payload -- The executor SC product remains available for manual one-off runs and - day-2 operations (re-run from a specific tag, dry-run, etc.) +- Manual one-off executor runs (re-apply from a TAG, dry-run) are done directly + via `aws codebuild start-build` — no separate SC product is needed or maintained ### Trade-offs diff --git a/docs/decisions/002-vault-aws-secrets-engine.md b/docs/decisions/002-vault-aws-secrets-engine.md new file mode 100644 index 0000000..1d1bd4c --- /dev/null +++ b/docs/decisions/002-vault-aws-secrets-engine.md @@ -0,0 +1,345 @@ +# ADR-002: HashiCorp Vault AWS Secrets Engine for Dynamic Cross-Account Credentials + +## In Plain Language + +Right now, when our automation runs Terraform in an account repo it needs AWS +credentials to assume a role in the target account. Those credentials come from +a long-lived IAM role attached to the CodeBuild service role — a role that +exists permanently and can be used at any time. + +This document proposes replacing those static, always-on IAM credentials with +**short-lived, on-demand credentials** issued by a HashiCorp Vault cluster +running the [AWS Secrets Engine](https://developer.hashicorp.com/vault/docs/secrets/aws). +When a build starts, it authenticates to Vault (using its own AWS identity), +asks for credentials scoped to the target account and the specific Vault role +defined in the product workspace, gets back temporary AWS keys that expire in +minutes, and then runs Terraform. There are no long-lived keys to rotate or +accidentally expose. 
+ +Because the Vault role is a Terraform resource declared inside the product +workspace, the exact IAM permissions granted to any automation run are visible +as a reviewable diff in the same PR that makes the infrastructure change. Review +the code, review the access policy — one approval covers both. + +**Status:** Proposed +**Date:** 2026-05-19 + +--- + +## Context + +The current cross-account credential model works as follows: + +1. The CodeBuild service role (`sc-automation-codebuild-role` in csvd-dev) has + `sts:AssumeRole` permission for `*:role/sc-automation-codebuild-role`. +2. A matching role with the same name is pre-created in each target account and + trusts the csvd-dev CodeBuild role. +3. The executor buildspec calls `aws sts assume-role` and exports `AWS_*` env + vars before running Terraform. + +This works but has the following drawbacks: + +- **Static trust relationship.** The csvd-dev CodeBuild role can assume the + target-account role at any time, not just during a sanctioned automation run. + If the CodeBuild service role or its credentials were ever misused, an attacker + could assume any target-account role without any build being underway. +- **No per-run scope.** Every executor build gets the same level of access, + regardless of what the product workspace actually needs. There is no way to + restrict a build to, say, only VPC-layer permissions. +- **Permission review is disconnected.** The IAM role in the target account is + managed separately from the product workspace. A reviewer approving a product + PR has no visibility into what IAM permissions the automation will use. +- **Static role pre-creation.** Every new account requires a platform engineer + to pre-create the `sc-automation-codebuild-role` role before the first + automation run can succeed. + +The Vault AWS Secrets Engine addresses all four of these gaps. 
+ +--- + +## Decision + +Deploy a HashiCorp Vault cluster (or use an existing Census-managed Vault) with +the **AWS Secrets Engine** enabled. Each SC product workspace declares a +`vault_aws_secret_backend_role` Terraform resource specifying the exact IAM +permissions the automation run requires. The executor buildspec authenticates to +Vault using the **AWS auth method** (the CodeBuild task's own IAM identity) and +requests short-lived STS credentials scoped to that role before running Terraform. + +--- + +## Proposed Design + +### Vault AWS Secrets Engine — how it works + +``` +Vault cluster (csvd-dev or shared platform) + └── secrets/aws/ (AWS Secrets Engine mount) + └── roles/ + └── {vault_aws_role} + credential_type = assumed_role + role_arns = ["arn:aws-us-gov:iam::{account_id}:role/{role}"] + default_ttl = 900s + max_ttl = 1800s +``` + +When the executor build calls `vault read aws/creds/{vault_aws_role}`, Vault +calls `sts:AssumeRole` on its own behalf and returns temporary +`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` that +expire when the TTL elapses. The credentials are scoped to exactly the role ARNs +listed in the Vault role — nothing wider. + +### `.sc-automation.yml` — new field + +```yaml +apply_on_merge: + - layer: infrastructure + region_dir: west + target_account_id: "229685449397" + vault_aws_role: "sc-infra-west-229685449397" # ← new +``` + +The `vault_aws_role` value is the name of the Vault role to read credentials +from. It is written by the Proposer (derived from the product workspace) and +committed to the account repo alongside the rendered HCL files. + +### Product workspace — Vault role as a Terraform resource + +Each SC product workspace (e.g. 
a VPC product, an EKS product) declares the +Vault role it needs alongside its other infrastructure: + +```hcl +# vault_role.tf — committed inside the product workspace, reviewed in the PR + +resource "vault_aws_secret_backend_role" "automation" { + backend = "aws" + name = "sc-infra-west-${var.target_account_id}" + credential_type = "assumed_role" + + role_arns = [ + "arn:${var.aws_partition}:iam::${var.target_account_id}:role/sc-automation-infra-west" + ] + + default_ttl = 900 + max_ttl = 1800 +} +``` + +**Why this matters for review:** The Proposer PR diff includes `vault_role.tf`. +A reviewer can see exactly which IAM role the automation will assume and in which +account. Access policy and infrastructure change are approved in the same PR +— there is no separate IAM role PR to chase down or forget. + +### CodeBuild authentication to Vault — AWS auth method + +The executor CodeBuild task authenticates to Vault using the +[AWS auth method](https://developer.hashicorp.com/vault/docs/auth/aws). The +CodeBuild service role's IAM identity is used as the authentication credential +— no long-lived Vault token is stored anywhere. + +```bash +# executor buildspec — PRE_BUILD phase +vault login -method=aws \ + -path=auth/aws \ + role=sc-automation-executor \ + header_value=vault.example.census.gov + +# Read dynamic credentials for this specific run +CREDS=$(vault read -format=json aws/creds/${VAULT_AWS_ROLE}) +export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r .data.access_key) +export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r .data.secret_key) +export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r .data.security_token) +``` + +The CodeBuild task's IAM role is added to a Vault auth policy that permits only +`read` on `aws/creds/*` — it cannot create new Vault roles, modify policies, or +read credentials for roles it is not permitted to access. 
+ +### Vault Terraform resources — managed in deploy/ + +```hcl +# deploy/vault.tf + +resource "vault_aws_secret_backend" "aws" { + path = "aws" + # Vault's own IAM user/role that calls sts:AssumeRole on behalf of requestors + # must have sts:AssumeRole on the target roles. +} + +resource "vault_auth_backend" "aws" { + type = "aws" + path = "auth/aws" +} + +resource "vault_aws_auth_backend_role" "codebuild_executor" { + backend = vault_auth_backend.aws.path + role = "sc-automation-executor" + auth_type = "iam" + bound_iam_principal_arns = [aws_iam_role.codebuild_service_role.arn] + token_policies = ["sc-automation-executor"] + token_ttl = 900 +} + +resource "vault_policy" "codebuild_executor" { + name = "sc-automation-executor" + + policy = <<-EOT + path "aws/creds/*" { + capabilities = ["read"] + } + EOT +} +``` + +### Infrastructure summary + +| Component | Location | Purpose | +|---|---|---| +| Vault cluster | Census-managed or csvd-dev | Issues dynamic AWS credentials | +| AWS Secrets Engine | `aws/` mount on Vault | Calls `sts:AssumeRole` and returns short-lived keys | +| AWS auth method | `auth/aws/` mount on Vault | Lets CodeBuild authenticate using its own IAM identity | +| `vault_aws_secret_backend_role` | Product workspace Terraform | Per-product IAM scope, reviewed in the Proposer PR | +| Vault endpoint env var | `deploy/codebuild.tf` | `VAULT_ADDR` set on the executor CodeBuild project | +| Vault IAM user | `deploy/vault.tf` | Has `sts:AssumeRole` on all target-account roles | +| Target-account IAM roles | Per-account Terraform | Trust Vault IAM user; scoped to minimum permissions | + +--- + +## Integration with the Proposer Flow + +The key insight is that the Vault role declaration is **part of the product +workspace**, not managed out-of-band. + +When the Proposer CodeBuild build runs Terraform (`tf apply`) to render and +commit files to the account repo, it also applies `vault_role.tf`. The result: + +1. User fills SC product form → Proposer starts. 
+2. Proposer runs `terraform apply` in the product workspace → creates + `vault_aws_secret_backend_role` in Vault. +3. Proposer renders HCL templates → opens PR on the account repo. +4. PR includes `.sc-automation.yml` with `vault_aws_role: sc-infra-west-{account_id}`. +5. Reviewer merges PR. +6. Webhook fires executor build with `VAULT_AWS_ROLE=sc-infra-west-{account_id}`. +7. Executor authenticates to Vault, reads credentials for that role, runs Terraform. + +The Vault role and the target-account IAM role both exist by the time the +executor runs because the Proposer created them before the PR was even opened. + +### Account baseline prerequisite + +For the Proposer to create the target-account IAM role, it needs an initial +foothold in that account. A single **proposer-access role** must exist in each +target account before the first product is provisioned into it: + +```hcl +# Created once per account as part of account baseline / landing-zone +resource "aws_iam_role" "sc_automation_proposer" { + name = "sc-automation-proposer" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { AWS = "arn:${var.aws_partition}:iam::229685449397:role/tf-run-proposer-codebuild" } + Action = "sts:AssumeRole" + }] + }) +} + +# Inline policy (not an IAM permissions boundary): limits this role's IAM +# actions to roles in the sc-automation-* namespace. +# NOTE: the sc-automation-* pattern also matches this role itself; attach a real +# permissions boundary if self-modification must be prevented. +resource "aws_iam_role_policy" "sc_automation_proposer" { + role = aws_iam_role.sc_automation_proposer.name + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = ["iam:CreateRole", "iam:PutRolePolicy", "iam:AttachRolePolicy", + "iam:DeleteRole", "iam:DeleteRolePolicy", "iam:GetRole"] + Resource = "arn:${var.aws_partition}:iam::*:role/sc-automation-*" + }] + }) +} +``` + +This role is **not** a Vault-specific concept — it is the account-level trust +grant that allows the automation platform (csvd-dev) to manage its own 
IAM +footprint in a target account. It belongs in the account vending / landing-zone +baseline alongside other platform roles (e.g. Break-Glass, Config recorder, +SSO permission sets). Once created at account birth it never needs to change. + +--- + +## Consequences + +### Benefits + +- **Short-lived credentials.** Dynamic STS credentials expire within the TTL + (default 15 min). A leaked credential is useless after expiry. +- **Per-run scope.** Each executor build reads credentials for the specific + Vault role defined in `.sc-automation.yml`. A build cannot access credentials + for a role it was not explicitly given. +- **Review parity.** IAM permissions (`vault_role.tf`) are changed in the same + PR as infrastructure. No separate IAM PR; no forgotten permission cleanup. +- **No static cross-account trust.** The existing "CodeBuild role can assume + any `sc-automation-codebuild-role` at any time" is replaced with "CodeBuild + can only read credentials for Vault roles it is permitted to access, and only + during an active build." +- **Automatic Vault role and IAM role provisioning.** The Proposer's + `terraform apply` creates both the Vault role and the target-account IAM + role the Vault secrets engine will assume — in the same apply, before the + PR is opened. No manual per-product setup in the target account. +- **Audit log.** Vault logs every credential issuance with the requesting + entity, timestamp, and lease ID. Each executor build's credential request is + independently auditable in Vault audit logs, separate from CloudTrail. + +### Trade-offs + +- **Vault dependency.** The automation chain now requires a healthy Vault + cluster. If Vault is unavailable, executor builds cannot obtain credentials + and will fail. Mitigation: Vault HA, periodic health checks, runbook for + Vault outage. +- **Vault provider version pinning.** The product workspace requires the + `hashicorp/vault` Terraform provider. 
This must be available via the Census + proxy (or mirrored in the internal provider cache) and pinned to a tested + version. +- **One landing-zone role required per account.** The Proposer needs a + `sc-automation-proposer` role in each target account (see _Account baseline + prerequisite_ above) to create the per-product executor IAM role. This is a + one-time setup per account, lives in the account vending baseline, and is + narrower than today's equivalent (`iam:CreateRole` on `sc-automation-*` only). +- **Executor buildspec changes required.** `vault login` and `vault read` + calls must be added to the PRE_BUILD phase and the prior + `aws sts assume-role` pattern removed. + +### Out of scope for this ADR + +- Vault cluster sizing, HA topology, and DR strategy — tracked separately +- Census Vault namespace design (shared cluster vs. dedicated) — tracked separately +- Migration path for existing accounts already using the static-role model — tracked separately +- Slack / SNS notification on Vault credential issuance failures — tracked separately + +--- + +## Alternatives Considered + +**AWS IAM Roles Anywhere:** Lets workloads outside AWS obtain short-lived +credentials by presenting a certificate signed by a private CA. Requires +managing a private CA and distributing certificates to CodeBuild tasks. +More complex than Vault AWS auth (which reuses the existing IAM identity +already on the CodeBuild task) with no meaningful benefit in this context. +Rejected. + +**Keep static cross-account role assumption + add SCPs to restrict usage to +CodeBuild source IPs:** SCPs cannot restrict by source service (CodeBuild vs +an operator workstation with the same credentials), only by IP range. IP +ranges for CodeBuild are not stable or exclusive. Rejected. + +**AWS Secrets Manager dynamic secrets plugin:** AWS Secrets Manager does not +natively generate STS-assumed-role credentials on demand. The only supported +dynamic rotation pattern is for database passwords. Rejected. 
+ +**OIDC federation (GitHub Actions model):** GHE on-prem does not expose an +OIDC discovery endpoint compatible with the AWS IAM OIDC provider without +additional infrastructure. Vault AWS auth with the existing CodeBuild IAM +identity is simpler and requires no GHE configuration changes. Rejected. diff --git a/docs/fleet-governance-at-scale.md b/docs/fleet-governance-at-scale.md new file mode 100644 index 0000000..92941cc --- /dev/null +++ b/docs/fleet-governance-at-scale.md @@ -0,0 +1,403 @@ +# Infrastructure Fleet Governance at Scale + +**Ported and generalized from:** `lambda-template-repo-generator/design-docs/EKS_CLUSTER_GOVERNANCE_AT_SCALE.md` +**Generalized from:** EKS-only → any Terraform workload managed through sc-lambda-ghactions +**Date:** 2026-05-19 +**Status:** DRAFT + +--- + +## Summary + +This document defines the governance model and work plan for operating the +sc-lambda-ghactions system at scale — across many provisioned workloads, many account +repos, and many product types (EKS clusters, S3 buckets, RDS instances, VPCs, etc.). + +The three requirements that drive the design: + +1. **Individual workloads** can be modified and updated granularly, without touching others. +2. **All workloads** can be managed centrally by CSVD — CSVD retains governance even as + provisioning is self-service for customers. +3. **Workload state lives in the customer's account repo**, in a dedicated folder per workload, + using a consistent Terragrunt structure. + +The overarching constraint: customers cannot realistically manage complex Terraform +infrastructure themselves. If CSVD gives up governance, they will be called in to +remediate. The solution must scale to many workloads while keeping CSVD in control of +configuration correctness and lifecycle. 
+ +--- + +## The Fleet Repository: `terraform-sc-fleet` + +### Why a dedicated fleet repo + +The sc-lambda-ghactions Lambda and CodeBuild builds are the _provisioning plane_ — they +create repos and open initial PRs. GitHub Actions workflows are planned for a later +rollout phase and will replace the CodeBuild executor builds at that point. The _operations plane_ — applying ongoing changes, +fleet-wide version bumps, governance policy enforcement — belongs in a separate repo +that CSVD controls directly. + +**`SCT-Engineering/terraform-sc-fleet`** is this operations plane. It contains one +folder per managed workload instance, each of which is a Terraform module call pointing +at the relevant product workspace. + +### Fleet repo structure + +``` +terraform-sc-fleet/ +├── workloads/ +│ ├── eks_cluster/ +│ │ ├── dev/ +│ │ │ ├── csvd/ +│ │ │ │ ├── csvd-dev-mcm/main.tf +│ │ │ │ └── csvd-lab-dja/main.tf +│ │ │ └── adsd/ +│ │ │ └── adsd-tools-dev/main.tf +│ │ └── prod/ +│ │ └── ois/ +│ │ └── ois-cribl-prod/main.tf +│ ├── s3_bucket/ +│ │ ├── dev/ +│ │ │ └── csvd/ +│ │ │ └── csvd-artifacts/main.tf +│ │ └── prod/ +│ └── {product_type}/ +│ └── {lifecycle}/ +│ └── {team}/ +│ └── {workload-name}/main.tf +├── scripts/ +│ ├── update_fleet.py # Fleet-wide apply runner +│ ├── maintenance_check.py # Window-aware update eligibility +│ └── fleet_query.py # Structured inventory queries +├── .github/ +│ └── workflows/ +│ └── regenerate-workspace.yml # Auto-updates fleet.code-workspace on push +├── fleet.code-workspace # Auto-generated VS Code workspace (all managed repos) +└── README.md +``` + +The directory tree encodes two dimensions: +- **Product type** (`eks_cluster`, `s3_bucket`, etc.) — matches `product_type` in the SC form +- **Lifecycle / team** (`dev/csvd`, `prod/ois`, etc.) 
— controls blast radius of fleet operations + +--- + +## Per-Workload Entry Files + +Each `workloads/{product_type}/{lifecycle}/{team}/{name}/main.tf` calls the corresponding +Terraform product workspace as a versioned external module: + +```hcl +# workloads/eks_cluster/dev/csvd/csvd-dev-mcm/main.tf +module "workload" { + source = "github.e.it.census.gov/SCT-Engineering/terraform-eks-deployment///?ref=v1.2.0" + + repo_name = "229685449397-csvd-dev-gov_apps-adsd-eks" # account repo + cluster_name = "csvd-dev-mcm" # folder inside that repo + account_name = "csvd-dev-gov" + aws_account_id = "229685449397" + aws_region = "us-gov-west-1" + vpc_name = "csvd-dev-ew-vpc-01" + # ... cluster-specific overrides +} + +locals { + maintenance_window = { + allowed_days = ["Tuesday", "Wednesday"] + allowed_hours = { start = 2, end = 6 } # UTC + blackout_dates = [] + } +} +``` + +Each entry file serves two roles simultaneously: +1. **Workload metadata** — authoritative record of the configuration CSVD intends for + this workload instance (versions, account, region, VPC, overrides) +2. **Injection location map** — specifies which account repo this workload's rendered HCL + was written into, and under which subfolder + +The `workloads/` tree as a whole is the **fleet map**: every workload CSVD manages has +an entry here. No external database, no spreadsheet. The source files are the inventory. + +--- + +## Account Repo Layout + +Each provisioned workload writes its rendered HCL into a folder inside a per-account +GHE repo. The folder path follows the account repo layer conventions: + +``` +{account-id}-{account-name}_apps-{team}/ +└── {product_type}/ + └── {workload-name}/ + ├── .sc-automation.yml # Written by Proposer; drives webhook executor + ├── config.json # Workload metadata (product_type, version pinned) + └── {region}/ + ├── remote_state.yml + └── {rendered HCL files} +``` + +**One account repo per account per team prefix** (e.g. `_apps-adsd-eks`, `_apps-csvd-platform`). 
+Multiple workload types and multiple instances of the same type can coexist in the same +account repo in separate subdirectories. + +--- + +## Separation of Concerns + +| Layer | Owner | Purpose | +|-------|-------|---------| +| Account repo (`{account}_apps-{team}/`) | Tenant team (read), CSVD (write via PR) | Source of truth for workload HCL config | +| `terraform-sc-fleet/workloads/` | CSVD | Central manifest; drives `tf apply` per workload | +| Product workspace (`terraform-eks-deployment`, etc.) | CSVD | Shared rendering logic and version defaults per product type | +| sc-lambda-ghactions Lambda + CodeBuild | CSVD | Provisioning UI; creates repo + initial config; webhook executor (initial rollout) | + +--- + +## Fleet Operations + +### Single-workload update + +```bash +cd terraform-sc-fleet/workloads/eks_cluster/dev/csvd/csvd-dev-mcm +source ~/aws-creds && tf apply +``` + +Opens a PR in that workload's account repo with the updated rendered HCL. Zero blast +radius to other workloads. + +### Fleet-wide update (`update_fleet.py`) + +```bash +# All workloads (dry run first) +python scripts/update_fleet.py --dry-run + +# All EKS clusters, dev lifecycle only +python scripts/update_fleet.py --product-type eks_cluster --lifecycle dev + +# Production workloads (requires --force) +python scripts/update_fleet.py --lifecycle prod --force + +# Filter by team +python scripts/update_fleet.py --team adsd + +# Filter by name substring +python scripts/update_fleet.py --filter csvd-lab +``` + +The script: +1. Walks `workloads/**/**/main.tf` recursively +2. Applies `--product-type`, `--lifecycle`, `--team`, `--filter` selectors +3. Checks `maintenance_window` locals — skips workloads outside their window unless `--force` +4. Runs `tf apply` (or `tf plan` for `--dry-run`) per workload +5. Reports per-workload success/failure with PR URLs + +**A version bump across 20 clusters is a one-liner.** Every additional workload costs CSVD +zero marginal effort for fleet-wide operations. 
+ +### Maintenance windows + +Each entry file declares an optional `maintenance_window` local: + +```hcl +locals { + maintenance_window = { + allowed_days = ["Tuesday", "Wednesday"] + allowed_hours = { start = 2, end = 6 } # UTC + blackout_dates = ["2026-06-15", "2026-06-16"] + } +} +``` + +`update_fleet.py` reads this before each apply and skips out-of-window workloads. +Customers request a blackout window by opening a PR to their account repo modifying +`.sc-automation.yml` or by contacting CSVD to update the entry file. No out-of-band +emails or calendar coordination required. + +--- + +## Governance Controls + +### CODEOWNERS in provisioned account repos + +The Proposer build commits a `CODEOWNERS` file into every account repo it creates, +via `managed_extra_files` in the Terraform product workspace: + +``` +# CSVD owns all managed workload configuration +{product_type}/ @SCT-Engineering/csvd-platform-admins +``` + +Platform engineers in other teams may open PRs but cannot merge without CSVD approval. + +### Branch protection + +Branch protection (require PR, require CSVD review, no direct push to `main`) is set +at provisioning time via the `CSVD/terraform-github-repo` module call in each product +workspace. Every repo provisioned through sc-lambda-ghactions automatically gets these +rules at creation. + +### CODEOWNERS in `terraform-sc-fleet` + +The fleet repo itself uses a hierarchy-aware CODEOWNERS: + +``` +# Production workloads require senior review +workloads/*/prod/ @SCT-Engineering/csvd-senior-platform-admins + +# Dev/sandbox workloads can be approved by any CSVD engineer +workloads/*/dev/ @SCT-Engineering/csvd-platform-admins +``` + +--- + +## Fleet Workspace (`fleet.code-workspace`) + +A VS Code workspace file that includes all managed account repos and the fleet manifest +gives a CSVD engineer a full fleet view in a single editor window: + +```json +{ + "folders": [ + { "name": "fleet-manifest", "path": "." 
}, + { "name": "eks: csvd-dev-mcm", "path": "~/git/account-repos/229685449397-csvd-dev-gov_apps-adsd-eks" }, + { "name": "eks: adsd-tools-dev", "path": "~/git/account-repos/066884702657-ent-gov-shared-sa_apps-adsd-eks" }, + { "name": "s3: csvd-artifacts", "path": "~/git/account-repos/229685449397-csvd-dev-gov_apps-csvd-platform" } + // ... one entry per managed workload + ] +} +``` + +**This file is auto-generated** by a script in `terraform-sc-fleet` that is triggered +on every push to `main`. The script walks `workloads/**/**/main.tf`, extracts `repo_name` +and the workload name (e.g. `cluster_name`), and writes `fleet.code-workspace`. No operator ever edits it manually. + +> In the initial rollout this is a CodeBuild project triggered by a webhook. GHA +> workflows will replace it when the GHA executor rollout phase is complete. + +With this workspace open, a CSVD engineer can: +- See all workload configs side-by-side in the Explorer without navigating repos +- Ask Copilot fleet questions across all files at once: + _"Which EKS clusters are not on version 1.31?"_ + _"Show me all prod workloads and their maintenance windows"_ +- Grep across all workload configs simultaneously +- Open PRs to specific workload folders directly from the editor + +--- + +## AI Agents for Fleet Operations + +Because all workload config is declarative files in structured repos, AI agents can answer +operational questions without any custom database or API — **the workspace is the inventory**. + +### `sc-fleet` — Fleet Operator Agent + +Scoped to `fleet.code-workspace`. Answers operational questions across all managed workloads. + +Representative prompts: +- _"Which EKS clusters are not on version 1.31?"_ +- _"Show me all workloads in us-gov-east-1 and their account names"_ +- _"What's the maintenance window for adsd-tools-dev?"_ +- _"Which workloads have a pending update PR open right now?"_ + +### `sc-upgrade` — Version Bump Planning Agent + +Scoped to the relevant product workspace (e.g. 
`terraform-eks-deployment`). Plans and +validates fleet-wide or targeted version changes before applying. + +Representative prompts: +- _"Plan an upgrade of EKS to 1.31 for all dev clusters"_ +- _"Which workloads can receive an update today based on their maintenance windows?"_ +- _"Show me the tf plan diff for bumping the S3 module version fleet-wide"_ + +### `sc-pr-reviewer` — Customer PR Review Agent + +Injected into each account repo via `managed_extra_files` as a `.github/copilot-instructions.md`. +Automatically summarizes incoming customer PRs and flags governance violations before +a CSVD engineer reviews. + +Representative uses (triggered by a CodeBuild build on PR open, or invoked manually): +- Classifies all changed fields and flags any that are CSVD-owned +- Determines if the change requires a maintenance window +- Produces a one-sentence plain-English summary for the CSVD reviewer + +### `sc-provisioner` — Provisioning Debug Agent + +Scoped to `sc-lambda-ghactions`. Helps debug provisioning failures and validate SC inputs. + +Representative prompts: +- _"The SC product failed — here's the CFN error. 
What went wrong?"_ +- _"Validate these SC input parameters before I submit"_ +- _"What HCL files would be generated for this cluster config?"_ + +--- + +## Proposed Skills (for `~/.copilot/skills/`) + +| Skill | Trigger phrases | What it does | +|-------|----------------|-------------| +| `sc-fleet-query` | "fleet status", "which workloads", "show me all" | Parses `workloads/**/**/main.tf`, returns structured inventory; accepts `--product-type`, `--filter`, `--field` | +| `sc-maintenance-check` | "maintenance window", "can I update", "what's due today" | Reads `maintenance_window` locals, returns workloads eligible for update on a given date | +| `sc-upgrade-planner` | "plan upgrade", "bump version" | Calls `update_fleet.py --dry-run`, returns per-workload plan summary; flags closed maintenance windows | +| `sc-pr-summary` | "review PR", "summarize this diff" | Fetches PR diff via GHE API, classifies changed fields, returns one-sentence summary + governance flag list | + +--- + +## Work Plan + +### Phase 1 — Create `terraform-sc-fleet` repo + +- [ ] Create `SCT-Engineering/terraform-sc-fleet` +- [ ] Move existing `terraform-eks-deployment/clusters/` entries into + `workloads/eks_cluster/{lifecycle}/{team}/{name}/main.tf` +- [ ] Update module source paths from `../../` to versioned external module reference +- [ ] Add `README.md`, `scripts/update_fleet.py` skeleton +- [ ] Add CodeBuild project to regenerate `fleet.code-workspace` on push to `main` *(GHA workflow planned for later rollout)* + +### Phase 2 — Wire sc-lambda-ghactions Proposer to write fleet entries + +- [ ] After Proposer creates the account repo and opens the PR, also commit a new + `workloads/{product_type}/{lifecycle}/{team}/{name}/main.tf` entry to `terraform-sc-fleet` +- [ ] SC form adds optional `team` and `lifecycle` parameters (default: `dev` + name-prefix heuristic) +- [ ] Lambda threads `team` and `lifecycle` to the Proposer CodeBuild build as `environmentVariablesOverride` + +### Phase 3 — 
Governance controls at provisioning time + +- [ ] Add `CODEOWNERS` and branch protection to every provisioned account repo + via `managed_extra_files` in each product workspace +- [ ] Add CODEOWNERS to `terraform-sc-fleet` scoped by lifecycle + +### Phase 4 — Fleet-wide update automation + +- [ ] Complete `scripts/update_fleet.py` with `--product-type`, `--lifecycle`, `--team`, + `--filter`, `--dry-run`, `--force` flags +- [ ] Add maintenance window parsing (`maintenance_window` locals) +- [ ] Add `scripts/maintenance_check.py` for window-aware eligibility reporting +- [ ] Wire a CodeBuild project as headless fleet runner (optional) + +### Phase 5 — AI agents and skills + +- [ ] Add `fleet.code-workspace` auto-generation CodeBuild project *(GHA workflow planned for later rollout)* +- [ ] Add copilot instructions to `terraform-sc-fleet` scoped for fleet operator queries +- [ ] Define `sc-fleet-query` and `sc-maintenance-check` skills under `~/.copilot/skills/` +- [ ] Add `.github/copilot-instructions.md` to provisioned account repos via `managed_extra_files` + +--- + +## Open Questions + +| # | Question | Owner | +|---|----------|-------| +| 1 | One account repo per workload type, or one per account? | Manuel / Don | +| 2 | Auto-merge for fleet version bumps in dev lifecycle, or always require review? | Matthew / Manuel | +| 3 | Who is CODEOWNER on `main` for each product type — a team or named individuals? | Manuel | +| 4 | Fleet-wide updates: CodeBuild headless runner, or CSVD engineer runs `update_fleet.py` manually? 
| David / Matthew | + +--- + +## Non-Goals + +- Customers self-managing Terraform — CSVD owns all Terraform execution +- Per-customer forks of product workspaces — single central workspace per product type +- Moving workload config to a database or external registry — `workloads/**` is the registry diff --git a/docs/generalized-terraform-product-architecture.md b/docs/generalized-terraform-product-architecture.md new file mode 100644 index 0000000..71b9772 --- /dev/null +++ b/docs/generalized-terraform-product-architecture.md @@ -0,0 +1,246 @@ +# Generalized Terraform Product Architecture + +**Date:** 2026-05-19 +**Status:** Proposed +**Audience:** Platform Engineering stakeholders +**Context:** Expanding the Service Catalog automation system beyond EKS to support any arbitrary Terraform template repo + +--- + +## Summary + +The Service Catalog (SC) automation system was originally built to create EKS cluster +GitHub repositories. This document describes a path to generalize that system so that +**any Terraform workload** — S3 buckets, RDS databases, VPCs, IAM roles, etc. — can be +onboarded as a new SC product with minimal engineering effort. + +The core Lambda infrastructure, webhook handler, and CodeBuild executor are already +workload-agnostic. The changes required to support a new product type are scoped to: + +1. A **template repo** on GitHub Enterprise +2. A set of **Jinja2 HCL/YAML templates** for the rendered files +3. A **Pydantic config model** describing the product's inputs +4. A **CloudFormation product template** for the Service Catalog form +5. A **census config YAML** to register the product in the portfolio + +No changes to the Lambda runtime, CodeBuild projects, or webhook infrastructure +are needed after the initial generalization work is complete. 
+ +--- + +## Current State (EKS-only) + +``` +SC Console (user fills EKS form) + └─> CFN Stack (Custom::GitHubRepository) + └─> Lambda (eks-terragrunt-repo-gen-template-automation) + ├─> Validates EKS-specific inputs (Pydantic model) + ├─> Fetches GHE token from Secrets Manager + ├─> Triggers executor CodeBuild build + └─> Polls build → returns repo URL + PR URL to CFN +``` + +The Lambda and CodeBuild executor are tightly coupled to EKS field names +(`cluster_name`, `vpc_name`, `vpc_domain_name`, etc.) and the +`template-eks-cluster` template repo. + +--- + +## Target State (Any Terraform Workload) + +``` +SC Console (user fills product form — any workload type) + └─> CFN Stack (Custom::TerraformRepo) + └─> Lambda (sc-template-automation) [shared, central] + ├─> Reads product_type from CFN properties + ├─> Routes to the correct Pydantic model + template set + ├─> Triggers executor CodeBuild build + └─> Returns repo URL + PR URL to CFN + +GitHub Enterprise (any account repo) + └─> push to main + └─> Lambda webhook handler (existing, already generic) + └─> Reads .sc-automation.yml → starts executor build +``` + +The Lambda becomes a **dispatcher**: `product_type` is a single new field in the +CFN `Properties` block that routes the request to the correct handler. 
+ +--- + +## What Is Already Generic + +The following components require **no changes** to support new product types: + +| Component | Why it is already generic | +|---|---| +| Webhook handler | Reads `.sc-automation.yml` from any repo; no workload awareness | +| `.sc-automation.yml` schema | `layer`, `region_dir`, `target_account_id` are workload-agnostic | +| Executor CodeBuild project | Runs `tf apply` in any Terraform workspace; env vars are injected at build time | +| HMAC signature verification | Workload-agnostic GHE push event handling | +| GHE commit status writeback | Writes ✅/❌ to any repo's merge commit | +| Lambda Function URL | Single entry point; no per-product URLs needed | + +--- + +## What Changes for Each New Product + +### 1. Template repo on GHE + +Create a new repo under `SCT-Engineering/` (e.g. `template-s3-bucket`) that follows +the standard account repo directory layout. This repo is cloned by the executor +CodeBuild build and serves as the starting point for rendered files. + +The template repo must contain: +- Standard `.tf-control`, `.tf-control.tfrc`, `region.tf`, `credentials.d/`, `variables.d/` +- Layer directories (`common/`, `infrastructure/`, `vpc/`) as applicable +- `remote_state.yml` stubs that the Proposer build will populate + +### 2. Jinja2 templates + +Add a new subdirectory under `lambda/templates/{product_type}/` containing the +`.tf.j2` and `.hcl.j2` files that are rendered by the Proposer build before being +committed to the new repo branch. + +``` +lambda/templates/ +├── eks_cluster/ # existing +│ ├── infrastructure/west/cluster.tf.j2 +│ └── ... +├── s3_bucket/ # new +│ ├── infrastructure/west/s3.tf.j2 +│ └── ... +└── {future_product}/ # pattern +``` + +### 3. 
Pydantic config model + +Add a new model in `lambda/models/{product_type}.py`: + +```python +class S3BucketConfig(BaseModel): + """Input model for S3 bucket SC product.""" + bucket_name: str + account_name: str + aws_account_id: str + environment: Literal["dev", "test", "prod"] + aws_region: str = "us-gov-west-1" + versioning_enabled: bool = True + lifecycle_days: int = 90 + team: str + workload: str + tier: str + partition: str = "gov" +``` + +The model enforces required fields and default values before any CodeBuild build is started. + +### 4. Lambda dispatcher + +A single routing table maps `product_type` to the correct handler: + +```python +PRODUCT_HANDLERS = { + "eks_cluster": handle_eks, + "s3_bucket": handle_s3, + # future: "rds_postgres": handle_rds +} + +def handle_create(props: dict): + product_type = props.get("product_type", "eks_cluster") # default: backward-compat + handler = PRODUCT_HANDLERS.get(product_type) + if not handler: + raise ValueError(f"Unknown product_type: {product_type}") + return handler(props) +``` + +This is a **one-time change** to `lambda/app.py`. After it is in place, adding a new +product type requires only a new entry in the table and a new handler function — no +other Lambda changes. + +### 5. CloudFormation product template + +Create a new `service-catalog/{product_type}-product-template.yaml`. 
The template +follows the same pattern as the EKS product template: + +- Parameters for user-facing form fields +- A single `Custom::TerraformRepo` resource +- Properties passed in `snake_case` to avoid the PascalCase normalizer issue +- `product_type` included as a static string property +- `aws_account_id` and `aws_region` resolved via `!Sub` — not user-facing parameters + +```yaml +Properties: + ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:sc-template-automation" + product_type: s3_bucket + bucket_name: !Ref BucketName + account_name: !Ref AccountName + aws_account_id: !Sub "${AWS::AccountId}" + environment: !Ref Environment + team: !Ref Team + workload: !Ref Workload + tier: !Ref Tier +``` + +### 6. Census config YAML (portfolio registration) + +Add a new YAML file under `terraform-service-catalog-census/templates/products/{product_type}/` +to register the product in the SC portfolio. This follows the same structure as the +existing EKS product config. 
+ +--- + +## Onboarding Checklist for a New Product Type + +The following checklist can be handed to a product team or platform engineer to +onboard any new Terraform workload without CodeBuild project or webhook infrastructure changes (the only Lambda code change is registering the new handler): + +- [ ] Create `SCT-Engineering/template-{product_type}` repo from the standard account repo skeleton +- [ ] Add `lambda/templates/{product_type}/` with Jinja2 templates for each rendered file +- [ ] Add `lambda/models/{product_type}.py` with a Pydantic model defining required inputs +- [ ] Register the handler in `lambda/app.py` `PRODUCT_HANDLERS` table +- [ ] Create `service-catalog/{product_type}-product-template.yaml` CFN template +- [ ] Add census config YAML and SC portfolio registration in `terraform-service-catalog-census` +- [ ] Test end-to-end via `scripts/test_service_catalog.py` with the new product type +- [ ] Confirm `.sc-automation.yml` is written correctly by the Proposer build + +--- + +## Example: S3 Bucket Product + +An S3 bucket product would work as follows end-to-end: + +1. Platform engineer opens Service Catalog, selects **S3 Bucket Repository Creator** +2. Fills in: `bucket_name`, `team`, `workload`, `environment`, `tier` +3. CloudFormation creates a `Custom::TerraformRepo` stack with `product_type: s3_bucket` +4. Lambda validates inputs against `S3BucketConfig` and starts the Proposer CodeBuild build +5. Proposer CodeBuild clones `template-s3-bucket`, renders the S3 Jinja2 templates, commits rendered HCL, opens PR +6. CFN stack outputs: `repository_url`, `pull_request_url` +7. Platform engineer reviews and merges PR +8. Webhook fires → Lambda reads `.sc-automation.yml` → starts executor build +9. Executor applies S3 Terragrunt config; posts ✅ commit status on merge commit + +The platform engineer never leaves GitHub or Service Catalog — there is no manual executor step. + +--- + +## Migration Path for Existing EKS Product + +The EKS product continues to work without modification. 
The `product_type` field defaults +to `eks_cluster` when absent, preserving backward compatibility with any existing +CloudFormation stacks or SC provisioned products. + +--- + +## Infrastructure Cost of Generalization + +| Resource | Current | After generalization | +|---|---|---| +| Lambda functions | 1 (EKS-only) | 1 (shared dispatcher) | +| CodeBuild projects | 2 (builder + creator) | 2 (no change) | +| Secrets Manager secrets | 2 (GHE tokens) + 1 (webhook) | No change | +| Lambda Function URL | 1 | No change | +| ECR repositories | 1 | No change | + +There is **no additional AWS infrastructure cost** to add new product types. Each new +product type is purely a code and configuration change. diff --git a/docs/repo-vars-and-secrets.md b/docs/repo-vars-and-secrets.md new file mode 100644 index 0000000..96686fc --- /dev/null +++ b/docs/repo-vars-and-secrets.md @@ -0,0 +1,250 @@ +# Repository Variables and Secrets Management + +**Ported from:** `lambda-template-repo-generator/design-docs/REPO_VARS_AND_SECRETS.md` +**Updated for:** sc-lambda-ghactions (CodeBuild-based initial rollout; GHA planned for later) + +This document describes how environment variables and secrets are made available +to CodeBuild builds started by the sc-lambda-ghactions Lambda. + +In the initial CodeBuild-based rollout, secrets and configuration values are +injected directly as CodeBuild environment variable overrides at build-start time +(via `environmentVariablesOverride` in the `StartBuild` API call). AWS Parameter +Store and Secrets Manager values are fetched by the Lambda and passed through, or +read directly by the CodeBuild buildspec at runtime. + +> **Later rollout (GHA):** When GitHub Actions workflows replace CodeBuild as the +> executor, the mechanism shifts to GitHub Actions secrets and variables set via +> the GHE API. The SSM/Secrets Manager parameter structure described below is +> designed to support both models. 
+ +--- + +## Overview + +The Proposer CodeBuild build has access to: + +1. **Secrets** — read from AWS Secrets Manager; injected as CodeBuild env var overrides at build-start time or fetched in the buildspec via `aws secretsmanager get-secret-value` +2. **Configuration values** — read from AWS Parameter Store; fetched in the buildspec via `aws ssm get-parameter` + +Both are scoped by: +- **Global** — applied to every account repo regardless of product type +- **By product type** — applied only to repos of a specific `product_type` + +--- + +## AWS Parameter Store Structure + +``` +/sc-template-automation/ + ├── variables/ + │ ├── global/ # Variables set on every new repo + │ │ ├── AWS_REGION # e.g. us-gov-west-1 + │ │ └── TERRAFORM_VERSION # e.g. 1.9.1 + │ └── by-type/ # Variables by product_type + │ ├── eks_cluster/ + │ │ ├── CLUSTER_VERSION + │ │ └── NODE_TYPE + │ └── s3_bucket/ + │ └── ... +``` + +## AWS Secrets Manager Structure + +``` +sc-template-automation/ + ├── secrets/global/ # Secrets set on every new repo + │ └── AWS_ACCESS_KEY_ID # (if needed by CodeBuild buildspec) + └── secrets/by-type/ # Secrets by product_type + ├── eks_cluster/ + │ └── KUBECONFIG + └── s3_bucket/ + └── ... 
+``` + +--- + +## Lambda Infrastructure + +### IAM Permissions + +The Lambda execution role requires: + +```hcl +data "aws_iam_policy_document" "secrets_access" { + statement { + effect = "Allow" + actions = ["secretsmanager:GetSecretValue", "secretsmanager:ListSecrets"] + resources = [ + "arn:${data.aws_partition.current.partition}:secretsmanager:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:secret:sc-template-automation/*" + ] + } +} + +data "aws_iam_policy_document" "ssm_access" { + statement { + effect = "Allow" + actions = ["ssm:GetParameter", "ssm:GetParameters", "ssm:GetParametersByPath"] + resources = [ + "arn:${data.aws_partition.current.partition}:ssm:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:parameter/sc-template-automation/*" + ] + } +} +``` + +### Lambda Environment Variables + +```hcl +environment { + variables = { + PARAM_STORE_PREFIX = "/sc-template-automation" + SECRETS_PREFIX = "sc-template-automation" + } +} +``` + +--- + +## Implementation — Building CodeBuild `environmentVariablesOverride` + +> **Note:** In the CodeBuild-based rollout, there is **no GHE repo secrets/variables API involved**. +> Secrets and configuration values are fetched by the Lambda at invocation time and passed +> directly to CodeBuild as `environmentVariablesOverride`. The GHE repo secrets approach +> is only relevant to the planned later GHA-based rollout. 
+ +The helper `build_env_overrides()` in `lambda/env_builder.py` assembles the override list: + +```python +import boto3 + +ssm = boto3.client("ssm", region_name="us-gov-west-1") +secretsmanager = boto3.client("secretsmanager", region_name="us-gov-west-1") + +PARAM_PREFIX = "/sc-template-automation" +SECRET_PREFIX = "sc-template-automation" + + +def _get_ssm_path(path: str) -> dict[str, str]: + """Return {name: value} for all SSM parameters under the given path.""" + paginator = ssm.get_paginator("get_parameters_by_path") + result = {} + for page in paginator.paginate(Path=f"{PARAM_PREFIX}/{path}", WithDecryption=True): + for p in page["Parameters"]: + name = p["Name"].split("/")[-1] + result[name] = p["Value"] + return result + + +def _get_secrets_arns(path: str) -> dict[str, str]: + """Return {name: arn} for all Secrets Manager secrets under the given prefix.""" + paginator = secretsmanager.get_paginator("list_secrets") + result = {} + for page in paginator.paginate(Filters=[{"Key": "name", "Values": [f"{SECRET_PREFIX}/{path}"]}]): + for s in page["SecretList"]: + name = s["Name"].split("/")[-1] + arn = s["ARN"]  # reference only; the secret value is never fetched by the Lambda + result[name] = arn + return result + + +def build_env_overrides(product_type: str) -> list[dict]: + """ + Return a list of CodeBuild environmentVariablesOverride dicts containing: + - All global SSM variables + - All product-type SSM variables + - All global Secrets Manager secrets (type=SECRETS_MANAGER passed by ref) + - All product-type Secrets Manager secrets + """ + overrides = [] + + # Plain-text variables from SSM (fetched by Lambda, passed as PLAINTEXT) + for name, value in { + **_get_ssm_path("variables/global"), + **_get_ssm_path(f"variables/by-type/{product_type}"), + }.items(): + overrides.append({"name": name, "value": value, "type": "PLAINTEXT"}) + + # Secrets — passed as a Secrets Manager ARN reference so CodeBuild fetches at build time + # This avoids the Lambda ever holding 
plaintext secret values in memory beyond SSM calls. + for name, arn in { + **_get_secrets_arns("secrets/global"), + **_get_secrets_arns(f"secrets/by-type/{product_type}"), + }.items(): + overrides.append({"name": name, "value": arn, "type": "SECRETS_MANAGER"}) + + return overrides +``` + +> **`SECRETS_MANAGER` type:** When CodeBuild receives an env var with `type=SECRETS_MANAGER`, +> it resolves the value (an ARN) at build-start time using the CodeBuild service role — +> the Lambda never sees the plaintext secret value. + +### Integration in the Lambda Handler + +```python +def handle_create(props: dict): + product_type = props["product_type"] + # ... validate inputs (Pydantic), identify template repo ... + + # Build env var overrides from SSM + Secrets Manager + env_overrides = build_env_overrides(product_type) + + # Add per-invocation values from CFN properties + env_overrides += [ + {"name": "PRODUCT_TYPE", "value": product_type, "type": "PLAINTEXT"}, + {"name": "REPO_NAME", "value": props["project_name"], "type": "PLAINTEXT"}, + {"name": "ENVIRONMENT", "value": props["environment"], "type": "PLAINTEXT"}, + {"name": "AWS_ACCOUNT_ID","value": props["aws_account_id"],"type": "PLAINTEXT"}, + {"name": "AWS_REGION", "value": props["aws_region"], "type": "PLAINTEXT"}, + ] + + codebuild.start_build( + projectName=PROPOSER_PROJECT, + environmentVariablesOverride=env_overrides, + ) +``` + +--- + +## Populating Secrets and Variables + +### Add a global variable (all repos) + +```bash +export AWS_DEFAULT_REGION=us-gov-west-1 +aws ssm put-parameter \ + --name "/sc-template-automation/variables/global/TERRAFORM_VERSION" \ + --value "1.9.1" \ + --type "String" +``` + +### Add a product-type-specific secret + +```bash +export AWS_DEFAULT_REGION=us-gov-west-1 +aws secretsmanager create-secret \ + --name "sc-template-automation/secrets/by-type/eks_cluster/KUBECONFIG" \ + --secret-string "..." 
+``` + +--- + +## Security Considerations + +- **Encryption at rest:** All secrets are encrypted at rest by Secrets Manager using AWS-managed keys +- **Least privilege:** Lambda role scoped to `sc-template-automation/*` prefix only +- **Audit trail:** CloudTrail records all `GetSecretValue` and `GetParameter` calls +- **Repository isolation:** Secrets are passed to CodeBuild by Secrets Manager reference; they are not + stored in the Lambda or committed to the repo +- **No plaintext in Lambda env:** Secrets are fetched at runtime, not baked into + the container image or Lambda environment variables + +--- + +## Future Enhancements + +- **Secret rotation:** Implement automatic rotation for long-lived credentials +- **Environment-scoped secrets:** Dev/test/prod variants of secrets per repo +- **Organization-level variables (later GHA rollout):** Push shared variables once to org level instead + of per-repo, reducing GHE API call volume +- **Validation rules:** Reject variable names that conflict with CodeBuild reserved + names (e.g. `CODEBUILD_*`, `AWS_*` built-ins) diff --git a/docs/service-catalog-census-integration.md b/docs/service-catalog-census-integration.md new file mode 100644 index 0000000..3502c7a --- /dev/null +++ b/docs/service-catalog-census-integration.md @@ -0,0 +1,312 @@ +# Service Catalog Census Integration + +**Ported and generalized from:** `lambda-template-repo-generator/design-docs/SERVICE_CATALOG_CENSUS_INTEGRATION.md` +**Updated for:** sc-lambda-ghactions (CodeBuild-based initial rollout; GHA planned for later) +**Date:** 2026-05-19 +**Status:** DRAFT + +--- + +## Executive Summary + +This document covers how sc-lambda-ghactions products are registered in the +`terraform-service-catalog-census` repo, which manages all enterprise Service Catalog +portfolios and products via Terragrunt. Each new product type (EKS cluster, S3 bucket, +RDS instance, etc.) requires entries in the census repo to become available in the SC +console org-wide. 
+ +The census integration is designed for **enterprise-wide deployment from the outset**. +Every resource is classified by deployment scope — central (Lambda, ECR), StackSet (launch +roles), or census-managed (portfolios, products, constraints) — and handled accordingly. + +--- + +## System Layout + +### sc-lambda-ghactions system (4 repos) + +``` +sc-lambda-ghactions/ ← Lambda + CodeBuild buildspecs + SC product templates +├── lambda/app.py ← Lambda handler (dispatcher by product_type) +├── lambda/models/{product_type}.py ← Pydantic input models per product type +├── lambda/templates/{product_type}/ ← Jinja2 HCL templates per product type +├── service-catalog/{product_type}-product-template.yaml ← CFN product template +└── deploy/ ← Terraform: Lambda, ECR, IAM, Function URL + +terraform-sc-fleet/ ← Fleet operations manifest (all managed workloads) +packer-pipeline/ ← Container build CLI +template-{product_type}/ ← Template repos (one per product type) +``` + +### `terraform-service-catalog-census` (census repo) + +``` +terraform-service-catalog-census/ +├── main-modules/service-catalog/ ← Main Terraform module +├── modules/ +│ ├── sc-portfolio/ ← Portfolio + principal association +│ ├── sc-product/ ← Product + S3 upload + versioning +│ └── cfn-roles-actions/ ← Launch roles via CFN StackSets +├── templates/ +│ ├── products/ +│ │ ├── eks-terragrunt-repo/ ← CFN product template (versioned YAMLs) +│ │ ├── s3-bucket-repo/ ← (planned) +│ │ └── {product-type}-repo/ ← pattern +│ └── role-templates/ ← IAM launch role CFN snippets +├── non-prod/csvd-dev/west/ +│ ├── configurations/ +│ │ ├── portfolios/*.yaml.tftpl ← Portfolio definitions +│ │ └── products/**/*.yaml.tftpl ← Product definitions +│ └── service-catalog/ +└── prod/operations-gov/ ← Prod (shares to org) +``` + +--- + +## Resource Classification + +Every resource falls into one of three deployment tiers: + +| Tier | What | Deployment mechanism | Scope | +|------|------|---------------------|-------| +| **Central** 
| Lambda, ECR, Secrets Manager, GHE token, Function URL | `sc-lambda-ghactions/deploy/` (`tf apply`) | csvd-dev only | +| **StackSet** | IAM launch role per product type | `cfn-roles-actions` StackSet via census repo | All OU-shared accounts | +| **Census-managed** | SC portfolio, product, provisioning artifact, constraints | YAML config in census repo → `terragrunt apply` | SC admin account + shared OUs | + +--- + +## Step 1 — Central Infrastructure (`sc-lambda-ghactions/deploy/`) + +The Lambda is centralized in csvd-dev. CloudFormation in any org account invokes +it cross-account via the `ServiceToken` ARN. + +**Lambda resource policy** — allows any account in the org: +```hcl +resource "aws_lambda_permission" "cloudformation_org" { + statement_id = "AllowCloudFormationOrgInvoke" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.sc_automation.function_name + principal = "cloudformation.amazonaws.com" + condition { + test = "StringEquals" + variable = "aws:PrincipalOrgID" + values = [var.org_id] + } +} +``` + +No per-account Lambda deployment is needed. Provisioners never need the Lambda locally — +their CloudFormation stack calls it cross-account via the `ServiceToken`. + +--- + +## Step 2 — IAM Launch Roles (StackSet) + +One IAM launch role is required **per product type** in every account that will +provision the product via SC. These are deployed via the `cfn-roles-actions` StackSet, +which auto-deploys to all accounts in shared OUs. 
+ +### Launch role template (per product type) + +Add a file to `templates/role-templates/`: + +```yaml +# templates/role-templates/sc-{product_type}-launch-role.yaml +Type: AWS::IAM::Role +Properties: + RoleName: !Sub "r-ent-servicecatalog-${ProductType}-sc-launch-role" + AssumeRolePolicyDocument: + Statement: + - Effect: Allow + Principal: { Service: servicecatalog.amazonaws.com } + Action: sts:AssumeRole + Policies: + - PolicyName: InvokeCentralLambda + PolicyDocument: + Statement: + - Effect: Allow + Action: lambda:InvokeFunction + Resource: !Sub "arn:${AWS::Partition}:lambda:${LambdaRegion}:${CentralAccountId}:function:sc-template-automation" + - Effect: Allow + Action: [cloudformation:*, s3:GetObject] + Resource: "*" +``` + +### Registering in `roles.yaml.tftpl` + +```yaml +# non-prod/csvd-dev/west/configurations/roles.yaml.tftpl +- template: sc-eks-cluster-launch-role.yaml # existing + parameters: + - parameter: CentralAccountId + value: "229685449397" + - parameter: LambdaRegion + value: us-gov-west-1 + - parameter: ProductType + value: eks_cluster + +- template: sc-s3-bucket-launch-role.yaml # new product type + parameters: + - parameter: CentralAccountId + value: "229685449397" + - parameter: LambdaRegion + value: us-gov-west-1 + - parameter: ProductType + value: s3_bucket +``` + +**Only one `terragrunt apply`** is needed after adding a new role entry. The StackSet +propagates to all shared accounts automatically via `auto_deployment { enabled = true }`. + +--- + +## Step 3 — Census Portfolio and Product Config + +### Portfolio YAML + +Portfolios are defined in `configurations/portfolios/`. The sc-lambda-ghactions products +belong in a single shared portfolio (or alongside existing census portfolios): + +```yaml +# configurations/portfolios/sc-automation.yaml.tftpl +sc_automation: + name: "Service Catalog Automation Portfolio" + description: >- + Self-service infrastructure provisioning via sc-lambda-ghactions. + Supports any Terraform workload type. 
+ provider_name: CSVD + products: + - eks_cluster_repo + - s3_bucket_repo + user_roles: + - /census/*/sc-end-user-role + share_ous: + - name: census-workload-accounts +``` + +### Product YAML (per product type) + +```yaml +# configurations/products/eks-cluster-repo/EKS_CLUSTER_REPO.yaml.tftpl +eks_cluster_repo: + name: "EKS Cluster Repository Creator" + description: >- + Creates a GitHub Enterprise repository with Terragrunt EKS cluster + configuration and opens a review PR. + type: CLOUD_FORMATION_TEMPLATE + distributor: CSVD + support_email: csvd.aws.service.catalog.team.list@census.gov + launch_role: r-ent-servicecatalog-eks-cluster-sc-launch-role + template_constraints: + Parameters: + # Lock the Lambda ARN — users cannot redirect to a different Lambda + ServiceToken: "arn:${Partition}:lambda:us-gov-west-1:229685449397:function:sc-template-automation" + versions: + - name: "1.0.0" + description: "Initial CodeBuild-based version" + file_path: products/eks-cluster-repo/1-0-0.yaml +``` + +### Product template location + +The CFN product template lives at: +``` +templates/products/{product_type}-repo/{version}.yaml +``` + +This is a copy of (or symlink to) `sc-lambda-ghactions/service-catalog/{product_type}-product-template.yaml`. +When a new version of the product template is released, add a new versioned file here +and bump the `versions` list in the product YAML. + +--- + +## Step 4 — Moving the Lambda to a Different Account + +If the central Lambda needs to move to a different AWS account, the following must be +updated. 
**All other components are account-agnostic.** + +| Resource | Location | What changes | +|----------|----------|-------------| +| Lambda + all central infra | `sc-lambda-ghactions/deploy/` | Re-deploy in new account | +| Launch role `lambda:InvokeFunction` ARN | `roles.yaml.tftpl` → `CentralAccountId` parameter | Update to new account ID — one change propagates to all shared accounts via StackSet | +| Template constraint `ServiceToken` | Product YAML `template_constraints` | Update ARN value | +| GitHub token secrets | Secrets Manager in new account | Recreate manually | + +**Migration order:** Update StackSet launch roles (Step 2) → wait for propagation → update +template constraint (Step 3). Reversing the order causes a `lambda:InvokeFunction` permission +denial window. + +### Why parameterizing `CentralAccountId` matters + +The account ID is only in `roles.yaml.tftpl` under the `CentralAccountId` parameter. The +role template YAML itself is static and account-agnostic. A single value change propagates +to all shared accounts via the StackSet — no role template file needs updating. 
+ +--- + +## Adding a New Product Type to the Census Portfolio + +Checklist for each new product type: + +- [ ] Add CFN product template at `templates/products/{product_type}-repo/1-0-0.yaml` +- [ ] Add product YAML at `configurations/products/{product_type}-repo/{PRODUCT}.yaml.tftpl` +- [ ] Add launch role template at `templates/role-templates/sc-{product_type}-launch-role.yaml` +- [ ] Add launch role entry in `roles.yaml.tftpl` +- [ ] Add product key to portfolio YAML `products:` list +- [ ] Run `terragrunt apply` in `non-prod/csvd-dev/west/service-catalog/` +- [ ] Validate: product appears in SC console; end-to-end test from a workload account + +--- + +## Validation Checklist + +### After central Lambda deploy: +- [ ] Lambda resource policy allows org-wide CloudFormation invocation +- [ ] Cross-account test: invoke Lambda from a different account via CFN Custom Resource + +### After StackSet launch role deploy: +- [ ] StackSet instances show `CURRENT` in CloudFormation console for target OUs +- [ ] Launch role exists in at least 2-3 workload accounts (spot check) +- [ ] Role trust policy allows `servicecatalog.amazonaws.com` + +### After census product deploy: +- [ ] Portfolio visible in SC console in the admin account +- [ ] Portfolio shared to target OUs (verify in a workload account) +- [ ] Product associated with portfolio; launch constraint attached +- [ ] Template constraint locks `ServiceToken` to correct Lambda ARN +- [ ] End-to-end test: provision from a **workload account** (not csvd-dev) + +--- + +## Appendix: Census Config Format Reference + +### Portfolio YAML schema + +```yaml +<portfolio_key>: + name: string + description: string + provider_name: string + products: [<product_key>, ...] 
+ user_roles: [/path/pattern/*] + tags: {} + share_ous: [] # OU names; empty = inherit from terraform.tfvars +``` + +### Product YAML schema + +```yaml +<product_key>: + name: string + description: string + type: CLOUD_FORMATION_TEMPLATE + launch_role: string # IAM role NAME (not ARN) — must exist in every target account + distributor: string + template_constraints: + Parameters: + ParamName: locked-value + versions: + - name: "1.0.0" + file_path: products/{product-dir}/{version}.yaml + actions: [] +``` diff --git a/docs/template-management.md b/docs/template-management.md new file mode 100644 index 0000000..5a20ec4 --- /dev/null +++ b/docs/template-management.md @@ -0,0 +1,271 @@ +# Template Management + +**Ported from:** `lambda-template-repo-generator/design-docs/CUSTOM_TEMPLATES.MD` +**Updated for:** sc-lambda-ghactions (CodeBuild-based initial rollout; GHA planned for later) + +This document describes how template repositories are structured and consumed by +the sc-lambda-ghactions system to create new account repos for any Terraform workload. + +--- + +## Template Sources + +### Full Repository Templates + +The standard approach: a GHE repository is used as the template. When the Lambda +Proposer build runs, it clones the template repo verbatim and renders Jinja2 +configuration files on top of it before committing to the new account repo branch. + +**Convention:** template repos are named `template-{product_type}` under `SCT-Engineering/`. + +| Product type | Template repo | +|---|---| +| `eks_cluster` | `SCT-Engineering/template-eks-cluster` | +| `s3_bucket` | `SCT-Engineering/template-s3-bucket` *(planned)* | +| `{any_type}` | `SCT-Engineering/template-{any_type}` | + +### Subdirectory Templates + +For product families that share significant infrastructure (e.g. multiple tiers +of the same workload), a single template repo can contain multiple subdirectory +templates. 
The Proposer build accepts a `source_path` parameter to clone only +the relevant subdirectory into the new account repo. + +Example: a `template-terraform-workloads` repo with: + +``` +template-terraform-workloads/ +├── eks-cluster/ # Standard EKS cluster template +├── eks-cluster-minimal/ # Reduced-footprint cluster variant +├── s3-standard/ # Standard S3 bucket configuration +└── s3-encrypted/ # S3 with custom KMS key configuration +``` + +A product that specifies `source_path: eks-cluster-minimal` will clone only that +subdirectory, stripped of the parent path prefix. + +--- + +## CFN Product Template Usage + +### Full repository (no source_path) + +```yaml +Resources: + MyAccountRepo: + Type: Custom::TerraformRepo + Properties: + ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:sc-template-automation" + product_type: eks_cluster + project_name: !Ref ProjectName + environment: !Ref Environment + aws_account_id: !Sub "${AWS::AccountId}" + aws_region: !Sub "${AWS::Region}" +``` + +### Subdirectory template + +```yaml +Properties: + ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:sc-template-automation" + product_type: s3_bucket + source_path: s3-encrypted # ← subdirectory within the template repo + project_name: !Ref ProjectName + environment: !Ref Environment + aws_account_id: !Sub "${AWS::AccountId}" + aws_region: !Sub "${AWS::Region}" +``` + +--- + +## Template Repository Structure + +Every template repo must follow the standard account repo layout so the rendered +output is compatible with the `tf-run` toolchain and `tf-directory-setup.py`: + +``` +template-{product_type}/ +├── .tf-control # tf-run toolchain version pin +├── .tf-control.tfrc # Terraform provider cache config +├── region.tf # locals { region = var.region } +├── credentials.d/ +│ ├── us-gov-east-1.credentials.tf +│ └── us-gov-west-1.credentials.tf +├── variables.d/ +│ ├── variables.common.tf +│ └── 
variables.tfstate.tf +├── infrastructure/ +│ ├── east/ +│ │ ├── remote_state.yml.j2 # ← Jinja2: rendered by Proposer +│ │ └── {workload}.tf.j2 # ← Jinja2: rendered by Proposer +│ └── west/ +│ ├── remote_state.yml.j2 +│ └── {workload}.tf.j2 +└── README.md +``` + +Files ending in `.j2` are Jinja2 templates. The Proposer CodeBuild build renders +them using the product inputs and commits the rendered result (without the `.j2` +extension) to the new account repo branch. + +--- + +## Jinja2 Template Organization in the Lambda + +Rendered templates are stored in the Lambda image under `lambda/templates/{product_type}/`: + +``` +lambda/templates/ +├── eks_cluster/ +│ ├── infrastructure/west/cluster.tf.j2 +│ ├── infrastructure/east/cluster.tf.j2 +│ └── ... +├── s3_bucket/ # ← new product type: add a directory here +│ ├── infrastructure/west/s3.tf.j2 +│ └── ... +└── {product_type}/ # ← pattern for future types +``` + +The Lambda dispatcher maps `product_type` → template directory automatically. +Adding a new product type requires only adding a new subdirectory here, a +Pydantic model, and a CFN product template — no Lambda plumbing changes. + +--- + +## Proposer Build — Template Copying Logic + +The Proposer CodeBuild build (started by the Lambda via `codebuild:StartBuild`) performs these steps: + +1. Clone the template repo (full repo or `source_path` subdirectory) +2. For each `.j2` file found: + - Render it using `jinja2.Environment` with the product input variables + - Write the rendered output alongside the source file (without `.j2` extension) + - Remove the `.j2` source file from the working tree +3. Add rendered `remote_state.yml` files using actual account/bucket values +4. Write `.sc-automation.yml` to the repo root if it does not already exist on `main` +5. Commit all rendered files to a new branch (`proposal/{timestamp}`) and open a PR + +The PR is reviewed by a platform engineer before merging. 
On merge, the webhook +handler reads `.sc-automation.yml` and automatically starts the executor CodeBuild build. + +--- + +## `.sc-automation.yml` — Automation Config File + +Every account repo that participates in sc-lambda-ghactions automation must have a +`.sc-automation.yml` file at the repo root. The Proposer writes this file when it +creates the initial PR if it does not already exist on `main`. + +### Schema + +```yaml +# .sc-automation.yml +product_type: eks_cluster # Must match a registered PRODUCT_HANDLERS key +executor_project: sc-executor # CodeBuild project name for the Executor build +dry_run: true # If true, Executor runs tf plan only (no apply) +template_repo: SCT-Engineering/template-eks-cluster # Source template repo +template_source_path: "" # Subdirectory within template repo; empty = root +fleet_entry: workloads/eks_cluster/prod/my-cluster/main.tf # Path in terraform-sc-fleet +variables: # Extra key/value pairs injected as CodeBuild env vars + CLUSTER_VERSION: "1.29" + NODE_TYPE: m5.xlarge +``` + +| Field | Required | Description | +|---|---|---| +| `product_type` | ✅ | Routes to the correct Pydantic model and template directory | +| `executor_project` | ✅ | CodeBuild project started by the webhook on PR merge | +| `dry_run` | ✅ | `true` → `tf plan` only; `false` → `tf apply` | +| `template_repo` | ✅ | GHE repo used as the Jinja2 template source | +| `template_source_path` | ❌ | Subdirectory within `template_repo`; omit for whole-repo templates | +| `fleet_entry` | ❌ | Relative path of this workload's entry in `terraform-sc-fleet` | +| `variables` | ❌ | Product-type-specific overrides; merged with SSM global defaults | + +> **Versioning:** The Executor reads `.sc-automation.yml` from `main` at build time, not from the +> PR branch, so changes to it take effect on the next automation run without requiring a re-render. 
+ +--- + +## Executor Build — Injecting into an Existing Account Repo + +After a platform engineer merges the Proposer PR into `main`, the sc-lambda-ghactions +webhook fires and starts the **Executor** CodeBuild build. The Executor handles +both the initial `tf plan`/`tf apply` run and any subsequent re-render of existing repos. + +### What the Executor Does + +``` +webhook (PR merged to main) + └─> Lambda reads .sc-automation.yml from main + └─> Lambda starts Executor CodeBuild build via StartBuild + environmentVariablesOverride: + REPO_NAME, PRODUCT_TYPE, DRY_RUN, TEMPLATE_REPO, ... + +Executor buildspec: + INSTALL: + - Install Terraform from S3 assets bucket + - Install Census CA cert, set HTTPS_PROXY + - git clone {account_repo} (GHE token from Secrets Manager) + PRE_BUILD: + - Read .sc-automation.yml from cloned repo + - git clone {template_repo} into /tmp/template + BUILD: + - For each .j2 file in /tmp/template: + Render with Jinja2 using env vars as context + Write to account_repo at same relative path (no .j2 extension) + - git checkout -b update/{timestamp} + - git add -A && git commit + - git push + - gh pr create --title "Automated update: {product_type} {timestamp}" + - If dry_run == false: + tf init && tf apply -auto-approve + POST_BUILD: + - POST commit status to GHE (success/failure with CodeBuild log URL) +``` + +### Fleet Update (re-rendering an existing repo) + +When a **template repo itself changes** — for example, an upstream HCL pattern is +updated — the fleet update flow (Flow 3) re-renders all account repos of that +`product_type`: + +1. `terraform-sc-fleet` lists all `workloads/{product_type}/*/main.tf` entries +2. Lambda starts one Executor build **per account repo** (fan-out) +3. Each Executor clones its account repo, re-renders all `.j2` files from the + updated template, commits to a new branch, and opens a PR +4. 
Platform engineers review and merge the PRs individually + +The Executor **never force-pushes to `main`** — every change goes through a PR, +preserving review gates regardless of whether `dry_run` is set. + +### Idempotency + +The Executor is safe to re-run. If the rendered output is identical to `main` +(`git diff --quiet`), it exits with no PR opened and reports a `SKIPPED` status +back to the Lambda. + +--- + +## Security Considerations + +- **Source path validation:** The Proposer validates that `source_path` (if provided) + exists in the template repo before proceeding. Path traversal (`../`) is rejected. +- **File type restrictions:** Only `.tf`, `.hcl`, `.yml`, `.yaml`, `.md`, `.j2`, + and standard dotfiles are copied. Binary files and executables are rejected. +- **Template repo access:** The GHE token injected into the CodeBuild environment + has read-only access to `SCT-Engineering/template-*` repos and read-write access + only to the target account repo. + +--- + +## Adding a New Template Repository + +Checklist when onboarding a new product type: + +- [ ] Create `SCT-Engineering/template-{product_type}` with standard account repo layout +- [ ] Add `.j2` files for each rendered configuration file +- [ ] Add `lambda/templates/{product_type}/` with corresponding Jinja2 templates +- [ ] Add a Pydantic model in `lambda/models/{product_type}.py` +- [ ] Register the handler in `lambda/app.py` `PRODUCT_HANDLERS` table +- [ ] Create a CFN product template in `service-catalog/{product_type}-product-template.yaml` +- [ ] Add the product to `terraform-service-catalog-census` (see [service-catalog-census-integration.md](service-catalog-census-integration.md)) diff --git a/docs/workflow-flowcharts.md b/docs/workflow-flowcharts.md new file mode 100644 index 0000000..ef860d4 --- /dev/null +++ b/docs/workflow-flowcharts.md @@ -0,0 +1,135 @@ +# Service Catalog Automation — Workflow Flowcharts + +**Ported and updated from:** 
`lambda-template-repo-generator/docs/DEMO_FLOWCHART.md` +**Updated for:** sc-lambda-ghactions (CodeBuild-based initial rollout; GHA planned for later) + +Generic overview of all end-to-end flows for any Service Catalog product built +on the sc-lambda-ghactions pattern. Intended for stakeholder demos and onboarding +conversations. + +--- + +## Flow 1 — Provisioning (SC Form → New Account Repo + PR) + +```mermaid +flowchart TD + A([👤 Engineer]) -->|Fills out form & clicks Launch| B[AWS Service Catalog] + + B -->|Creates CloudFormation Stack| C[CloudFormation\nCustom Resource] + + C -->|Cross-account invocation\nvia ServiceToken| D[Lambda Function\ncsvd-dev] + + D -->|Fetches GHE token| E[(Secrets Manager\ncsvd-dev)] + + D -->|Starts CodeBuild build\nproduct_type + inputs as env vars| F[CodeBuild\nProposer — csvd-dev] + + F -->|Clones template repo| G[SCT-Engineering/template-{product_type}] + F -->|Renders Jinja2 templates\nCommits rendered HCL| H[New Branch\nproposal/timestamp] + F -->|Opens| I[Pull Request\nproposal → main] + F -->|Commits entry to| K[terraform-sc-fleet\nworkloads/{type}/{name}/main.tf] + + D -->|Polls CodeBuild build\nevery 20s until complete| F + D -->|Returns repo URL + PR URL| C + + C -->|Stack outputs| B + B -->|Status: AVAILABLE\n+ repo & PR links| A + + style A fill:#4a90d9,color:#fff + style B fill:#f5a623,color:#fff + style C fill:#f5a623,color:#fff + style D fill:#7ed321,color:#fff + style E fill:#9b59b6,color:#fff + style F fill:#27ae60,color:#fff + style G fill:#2c3e50,color:#fff + style H fill:#2c3e50,color:#fff + style I fill:#e74c3c,color:#fff + style K fill:#8e44ad,color:#fff + +``` + +--- + +## Flow 2 — Apply on Merge (Webhook → Auto-Executor) + +After a platform engineer reviews and merges the Proposer PR, the webhook handler +automatically starts the executor build — no manual SC provisioning step required. 
+ +```mermaid +flowchart TD + A([👤 Platform Engineer]) -->|Reviews & merges PR| B[GitHub Enterprise\nmain branch] + + B -->|Push event| C[Lambda Function URL\nPOST /webhook] + + C -->|Verifies HMAC signature| C + C -->|Reads .sc-automation.yml\nfrom merged commit| D{layer / region_dir\nconfigured?} + + D -->|Yes| E[Starts CodeBuild build\nexecutor — csvd-dev] + D -->|No| Z([Skip — no automation config]) + + E -->|Reads .sc-automation.yml\nvia buildspec env var| G{dry_run: true?} + G -->|Yes| H[terraform plan only] + G -->|No| I[terraform apply] + + E -->|POST commit status via GitHub API| B + B -->|✅ or ❌ on merge commit| A + + style A fill:#4a90d9,color:#fff + style B fill:#2c3e50,color:#fff + style C fill:#7ed321,color:#fff + style D fill:#f5a623,color:#fff + style E fill:#27ae60,color:#fff + style G fill:#f5a623,color:#fff + style H fill:#9b59b6,color:#fff + style I fill:#e74c3c,color:#fff + style Z fill:#95a5a6,color:#fff +``` + +--- + +## Flow 3 — Fleet-Wide Update (CSVD Operations) + +CSVD-initiated update applied across all managed workloads — e.g. a version bump. +No Service Catalog involvement; runs directly from `terraform-sc-fleet`. 
+ +```mermaid +flowchart TD + A([👤 CSVD Engineer]) -->|python update_fleet.py\n--product-type eks_cluster --lifecycle dev| B[terraform-sc-fleet\nscripts/update_fleet.py] + + B -->|Walks workloads/eks_cluster/dev/**| C{maintenance\nwindow open?} + + C -->|Yes| D[tf apply\nper workload folder] + C -->|No| E([Skip workload\nlog window info]) + + D -->|Starts CodeBuild build\nexecutor — csvd-dev| F[CodeBuild\nExecutor — csvd-dev] + + F -->|Renders + commits\nupdated HCL| G[Account Repo\nNew branch] + G -->|Opens| H[Pull Request\nfor CSVD or customer review] + + F -->|POST commit status via GitHub API| H + + B -->|Summary: N applied\nM skipped| A + + style A fill:#4a90d9,color:#fff + style B fill:#8e44ad,color:#fff + style C fill:#f5a623,color:#fff + style D fill:#7ed321,color:#fff + style E fill:#95a5a6,color:#fff + style F fill:#27ae60,color:#fff + style G fill:#2c3e50,color:#fff + style H fill:#e74c3c,color:#fff +``` + +--- + +## Key Design Points + +| # | Point | +|---|-------| +| 1 | **Self-service provisioning** — engineer fills a form; no CSVD involvement for the create path | +| 2 | **Centralized compute** — Lambda, CodeBuild projects, and GHE tokens all live in csvd-dev; the provisioner's account only sees a CFN stack with output URLs | +| 3 | **Lambda as thin orchestrator** — validates inputs, starts CodeBuild build, polls for completion, returns URLs to CFN | +| 4 | **CodeBuild runs the Terraform** — actual repo creation and HCL rendering logic lives in CodeBuild buildspecs, not bespoke Lambda Python. GHA workflows are planned for a later rollout phase. 
| +| 5 | **Auto-apply on merge** — webhook handler eliminates the manual executor step; merge = apply | +| 6 | **Fleet operations separate from provisioning** — `terraform-sc-fleet` + `update_fleet.py` give CSVD a single command for fleet-wide changes | +| 7 | **Works for any product type** — swap `product_type` in the SC form and the entire chain routes to a different template repo, Pydantic model, and Jinja2 templates, with no Lambda plumbing changes | +| 8 | **Governance via GHE** — branch protection and CODEOWNERS are baked into every provisioned repo at creation time; customers can propose changes but cannot merge without CSVD approval | From 9cc46c62896ba2afc45fa030c6ceec0dad30b52e Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Wed, 20 May 2026 12:20:55 -0400 Subject: [PATCH 12/27] docs: remove stale GHA executor references - repo-vars-and-secrets.md: remove 'Later rollout (GHA)' callout block; the executor is CodeBuild triggered by webhook, not GitHub Actions - fleet-governance-at-scale.md: remove 'GHA executor rollout phase' note; replace with accurate CodeBuild+webhook description --- docs/fleet-governance-at-scale.md | 4 ++-- docs/repo-vars-and-secrets.md | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/docs/fleet-governance-at-scale.md b/docs/fleet-governance-at-scale.md index 92941cc..0ac2483 100644 --- a/docs/fleet-governance-at-scale.md +++ b/docs/fleet-governance-at-scale.md @@ -273,8 +273,8 @@ gives a CSVD engineer a full fleet view in a single editor window: on every push to `main`. The script walks `workloads/**/**/main.tf`, extracts `repo_name` and `workload_name`, and writes `fleet.code-workspace`. No operator ever edits it manually. -> In the initial rollout this is a CodeBuild project triggered by a webhook. GHA -> workflows will replace it when the GHA executor rollout phase is complete. +> In the initial rollout this is a CodeBuild project triggered by a webhook on +> push to `main` in the `terraform-sc-fleet` repo. 
With this workspace open, a CSVD engineer can: - See all workload configs side-by-side in the Explorer without navigating repos diff --git a/docs/repo-vars-and-secrets.md b/docs/repo-vars-and-secrets.md index 96686fc..85a2153 100644 --- a/docs/repo-vars-and-secrets.md +++ b/docs/repo-vars-and-secrets.md @@ -12,11 +12,6 @@ injected directly as CodeBuild environment variable overrides at build-start tim Store and Secrets Manager values are fetched by the Lambda and passed through, or read directly by the CodeBuild buildspec at runtime. -> **Later rollout (GHA):** When GitHub Actions workflows replace CodeBuild as the -> executor, the mechanism shifts to GitHub Actions secrets and variables set via -> the GHE API. The SSM/Secrets Manager parameter structure described below is -> designed to support both models. - --- ## Overview From dc71d5793dad3075b7e96e9359f98f0a1780ef74 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Wed, 20 May 2026 13:15:11 -0400 Subject: [PATCH 13/27] docs: add CodeBuild Projects Reference section to HOW-IT-WORKS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a dedicated section clearly describing what each CodeBuild project does, how it is triggered, what env vars it receives, and what it does/does not do — so stakeholders have a single place to understand the two-build model. --- docs/HOW-IT-WORKS.md | 98 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md index 4be7d99..bc42ba3 100644 --- a/docs/HOW-IT-WORKS.md +++ b/docs/HOW-IT-WORKS.md @@ -101,6 +101,104 @@ See [ADR-001](decisions/001-webhook-auto-apply.md) for the full decision record. --- +## CodeBuild Projects Reference + +There are exactly **two** CodeBuild projects. They are both in csvd-dev and are +never invoked directly by end users. 
+ +--- + +### `tf-run-proposer` + +**Triggered by:** The Proposer SC product (user fills SC form → CFN Custom +Resource → Lambda starts this build and polls it). + +**What it does:** + +1. Clones the target account repo from GHE +2. Checks out (or creates) a proposal branch (default: `propose/sc-automation`) +3. Renders Jinja2 (`.j2`) template files from `TEMPLATE_REPO` using `TEMPLATE_VARS` +4. Writes any `EXTRA_FILES` directly into the repo tree +5. `git commit && git push --force-with-lease` +6. Opens a pull request (`proposal branch` → `main`) via `gh pr create`; skips + if a PR already exists for that branch (idempotent) +7. Emits `PR_URL=` in POST_BUILD so the Lambda can return it to CloudFormation + +**Does NOT run Terraform.** No infrastructure is touched during this build. +The only changes are committed files in a GHE branch. + +**Key env vars (injected per-build):** + +| Variable | Source | +|---|---| +| `ACCOUNT_REPO` | SC form → CFN → Lambda | +| `LAYER` | SC form → CFN → Lambda | +| `REGION_DIR` | SC form → CFN → Lambda | +| `GIT_BRANCH` | SC form → CFN → Lambda | +| `TEMPLATE_REPO` | SC form → CFN → Lambda | +| `TEMPLATE_VARS` | SC form → CFN → Lambda | +| `EXTRA_FILES` | SC form → CFN → Lambda | +| `GITHUB_TOKEN` | Lambda reads from Secrets Manager `ghe-runner/github-token` | + +**Build definition:** `buildspec-proposer.yml` + +--- + +### `tf-run-executor` + +**Triggered by:** The webhook Lambda — automatically on every push to `main` in +a watched account repo. Never triggered by a user or SC product. + +**What it does:** + +1. Reads target parameters from the per-build env vars (set by the webhook Lambda + from `.sc-automation.yml`) +2. Clones the account repo at `main` (the post-merge state) +3. If `TARGET_ACCOUNT_ID` is set: calls `aws sts assume-role` to obtain + temporary credentials in the target account +4. `cd ${LAYER}/${REGION_DIR}` +5. 
Runs `tf-run apply` (or `tf-run plan` if `DRY_RUN=true`), optionally starting + from a specific `TAG` step +6. In POST_BUILD: calls the GHE commit status API to write ✅ `success` or + ❌ `failure` on the merge commit — visible directly on the PR timeline + +**This is the only build that runs Terraform and changes real infrastructure.** + +**Key env vars (injected per-build):** + +| Variable | Source | +|---|---| +| `ACCOUNT_REPO` | Webhook Lambda reads from `.sc-automation.yml` | +| `LAYER` | Webhook Lambda reads from `.sc-automation.yml` | +| `REGION_DIR` | Webhook Lambda reads from `.sc-automation.yml` | +| `TARGET_ACCOUNT_ID` | Webhook Lambda reads from `.sc-automation.yml` | +| `VAULT_AWS_ROLE` | Webhook Lambda reads from `.sc-automation.yml` | +| `DRY_RUN` | Webhook Lambda reads from `.sc-automation.yml` | +| `TF_RUN_START_TAG` | Webhook Lambda reads from `.sc-automation.yml` | +| `COMMIT_SHA` | Webhook Lambda reads from the GHE push payload | +| `GITHUB_TOKEN` | Lambda reads from Secrets Manager `ghe-runner/github-token` | + +**Build definition:** `buildspec-executor.yml` + +--- + +### Relationship between the two projects + +``` +User (SC form) + └─> tf-run-proposer ← renders files, opens PR, touches nothing in AWS + ↓ + Human reviews diff and merges PR + ↓ + GHE push webhook ─> tf-run-executor ← runs Terraform, changes infrastructure +``` + +They share the same account repo and the same GHE PAT, but have completely +separate IAM roles, buildspecs, and trigger paths. The proposer build never has +Terraform installed; the executor build never opens GitHub PRs. + +--- + ## Step-by-Step: Propose Flow ### 1. 
User fills the SC form From 77f9a49d8417579f183028328321299209306a89 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Wed, 20 May 2026 13:32:47 -0400 Subject: [PATCH 14/27] feat: Proposer generates all workspace files (REMOTE-STATE + tf-directory-setup.py) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - buildspec-proposer.yml: install tf-directory-setup.py from terraform/support in INSTALL phase; add python-dateutil + pyyaml pip deps. BUILD phase: after template rendering, run Python bootstrap step that: 1. Processes REMOTE-STATE directives in tf-run.data files — derives workspace remote_state.yml from layer-level file (identical to tf-run.sh behavior) 2. Runs tf-directory-setup.py --link none in each workspace with remote_state.yml — generates remote_state.backend.tf + .tf.s3/.local/.none variant files + symlink - buildspec-executor.yml: add note that REMOTE-STATE and tf-directory-setup.py steps are idempotent — files already exist from Proposer PR, no new files created - docs/HOW-IT-WORKS.md: expand BUILD phase step 5 to document the full file generation sequence including REMOTE-STATE and tf-directory-setup.py; add rationale explaining why all generation must happen in the Proposer - docs/template-management.md: fix template repo structure diagram — workspace remote_state.yml.j2 files removed (wrong); layer-level remote_state.yml.j2 shown; workspace tf-run.data with REMOTE-STATE directive shown; add layout rules for auto.tfvars profile/region requirement and .j2 source file handling. Expand Proposer Build steps to cover REMOTE-STATE + tf-directory-setup.py. Add principle callout: PR diff is the complete truth. 
--- buildspec-executor.yml | 7 ++++ buildspec-proposer.yml | 71 +++++++++++++++++++++++++++++++++++-- docs/HOW-IT-WORKS.md | 23 ++++++++++-- docs/template-management.md | 45 +++++++++++++++++------ 4 files changed, 130 insertions(+), 16 deletions(-) diff --git a/buildspec-executor.yml b/buildspec-executor.yml index 8bf5620..e9d6629 100644 --- a/buildspec-executor.yml +++ b/buildspec-executor.yml @@ -111,6 +111,13 @@ phases: # --- Run Terraform in target layer/region directory --- # tf-run auto-proceeds on non-TTY stdin (read -t timeout defaults to "y") + # + # NOTE on file-generating tf-run.data directives: + # REMOTE-STATE — generates workspace remote_state.yml from parent + # COMMAND tf-directory-setup.py — generates remote_state.backend.tf + variant files + # The Proposer already ran both of these and committed the results in the PR. + # When tf-run hits these steps here they are idempotent: they overwrite files + # that already exist with identical content. No new files are created at apply time. - cd "${LAYER}/${REGION_DIR}" - | if [ "${DRY_RUN}" = "true" ]; then diff --git a/buildspec-proposer.yml b/buildspec-proposer.yml index ddccbaa..624710b 100644 --- a/buildspec-proposer.yml +++ b/buildspec-proposer.yml @@ -47,8 +47,13 @@ phases: - aws s3 cp "$CENSUS_CA_S3" /etc/pki/ca-trust/source/anchors/census-ca.pem - update-ca-trust extract - # --- Python deps for template rendering --- - - pip3 install --quiet jinja2 + # --- tf-directory-setup.py (generates remote_state.backend.tf + variant files) --- + # Must be available in Proposer because ALL file generation happens here, not in Executor. 
+ - cp /tmp/tf-support/local-app/tf-directory-setup/tf-directory-setup.py /usr/local/bin/tf-directory-setup.py + - chmod +x /usr/local/bin/tf-directory-setup.py + + # --- Python deps for template rendering + tf-directory-setup.py --- + - pip3 install --quiet jinja2 python-dateutil pyyaml # --- gh CLI (from S3, version pinned in terraform/support) --- - aws s3 cp "${GH_CLI_S3_PREFIX}/gh_${GH_VERSION}_linux_amd64.tar.gz" /tmp/gh.tar.gz @@ -122,6 +127,68 @@ phases: print(f'Wrote {len(files)} extra file(s)') " + # --- Bootstrap workspace state files (REMOTE-STATE + tf-directory-setup.py) --- + # tf-run.sh's REMOTE-STATE directive generates workspace remote_state.yml at apply time. + # tf-run.data COMMAND steps run tf-directory-setup.py to generate remote_state.backend.tf + # and the three variant files (.tf.s3 / .tf.local / .tf.none) + activate the symlink. + # + # ALL of this must happen in the Proposer so every generated file appears in the PR diff. + # The Executor must not silently create files; it inherits what the PR committed. + - | + python3 - <<'PYEOF' + import os, re, subprocess, sys, pathlib + + repo_root = pathlib.Path('.') + + for tfrun_data in sorted(repo_root.rglob('tf-run.data')): + ws_dir = tfrun_data.parent + # Skip .git internals + if any(p.startswith('.git') for p in ws_dir.parts): + continue + + content = tfrun_data.read_text() + lines = [l.strip() for l in content.splitlines() if l.strip() and not l.startswith('#')] + + # ── Step 1: REMOTE-STATE ────────────────────────────────────────────────── + # Mirrors tf-run.sh: read ../remote_state.yml, append /{workspace_name} to + # the directory field, write workspace-level remote_state.yml. 
+ if any(l.startswith('REMOTE-STATE') for l in lines): + parent_rs = ws_dir.parent / 'remote_state.yml' + if not parent_rs.exists(): + print(f'WARNING: {ws_dir}: REMOTE-STATE in tf-run.data but no ' + f'parent remote_state.yml found — skipping', flush=True) + continue + parent_text = parent_rs.read_text() + subdir = ws_dir.name + # Replicate: sed -E s#(^directory.*)\"\'#\1/{subdir}\" + ws_rs_text = re.sub( + r'^(directory\s*:\s*")([^"]+)(")', + lambda m: m.group(1) + m.group(2).rstrip('/') + '/' + subdir + m.group(3), + parent_text, count=1, flags=re.MULTILINE + ) + ws_rs = ws_dir / 'remote_state.yml' + ws_rs.write_text(ws_rs_text) + print(f'REMOTE-STATE: wrote {ws_rs} (directory += /{subdir})', flush=True) + + # ── Step 2: tf-directory-setup.py ──────────────────────────────────────── + # Run whenever the workspace has a remote_state.yml (just written or from + # the template). Generates remote_state.backend.tf + 3 variant files. + # --link none: initial state; the Executor will re-link to s3 after first apply. + rs_file = ws_dir / 'remote_state.yml' + if rs_file.exists(): + result = subprocess.run( + [sys.executable, '/usr/local/bin/tf-directory-setup.py', '--link', 'none'], + cwd=str(ws_dir), capture_output=True, text=True + ) + print(result.stdout, end='', flush=True) + if result.returncode != 0: + print(f'ERROR: tf-directory-setup.py failed in {ws_dir}:\n{result.stderr}', + file=sys.stderr, flush=True) + sys.exit(result.returncode) + + print('Bootstrap complete.', flush=True) + PYEOF + # --- Commit and push --- - git add -A - | diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md index bc42ba3..5bb5b5b 100644 --- a/docs/HOW-IT-WORKS.md +++ b/docs/HOW-IT-WORKS.md @@ -246,9 +246,26 @@ TEMPLATE_REPO, TEMPLATE_VARS, EXTRA_FILES, GITHUB_TOKEN - Copy rendered + non-template files into account repo at same relative paths 4. If `EXTRA_FILES` is non-empty: - Parse the JSON dict; write each `path → content` entry directly (overrides templates) -5. 
`git add -A && git commit -m "feat: sc-automation propose" --allow-empty` -6. `git push origin ${GIT_BRANCH} --force-with-lease` -7. `gh pr create --base main --head ${GIT_BRANCH} --title "..." --body "..."` (idempotent — skips if PR already exists) +5. **Bootstrap workspace state files** (all file generation must be in the Proposer PR): + - For every `tf-run.data` containing a `REMOTE-STATE` directive: + - Read `../remote_state.yml` (layer-level) and append `/{workspace_name}` to the `directory` field + - Write the result as `remote_state.yml` in the workspace directory + - This mirrors exactly what `tf-run.sh`'s `REMOTE-STATE` handler does at apply time + - For every workspace directory that now has a `remote_state.yml`: + - Run `tf-directory-setup.py --link none` to generate: + - `remote_state.backend.tf` — the S3 backend block + - `remote_state.{dir}.tf.s3` — production variant + - `remote_state.{dir}.tf.local` — local-state variant + - `remote_state.{dir}.tf.none` — empty stub (activated by `--link none`) + - Symlink `remote_state.{dir}.tf → remote_state.{dir}.tf.none` (bootstrap state) + - `--link none` is the correct bootstrap choice: state does not exist yet; the Executor will re-link to `.s3` after the first successful apply + > **Why here and not in the Executor?** `tf-run.sh` generates these files at apply time via + > `REMOTE-STATE` directive and `COMMAND tf-directory-setup.py` steps. If the Executor generates + > them, they are invisible to reviewers. By running this in the Proposer, every generated file + > appears in the PR diff and is subject to human review before any infrastructure changes. +6. `git add -A && git commit -m "feat: sc-automation propose" --allow-empty` +7. `git push origin ${GIT_BRANCH} --force-with-lease` +8. `gh pr create --base main --head ${GIT_BRANCH} --title "..." --body "..."` (idempotent — skips if PR already exists) ### 6. 
CodeBuild - POST_BUILD phase diff --git a/docs/template-management.md b/docs/template-management.md index 5a20ec4..3cbcb93 100644 --- a/docs/template-management.md +++ b/docs/template-management.md @@ -94,19 +94,24 @@ template-{product_type}/ ├── variables.d/ │ ├── variables.common.tf │ └── variables.tfstate.tf +│ └── {region}.variables.common.auto.tfvars.j2 # ← must emit profile + region keys ├── infrastructure/ +│ ├── remote_state.yml.j2 # ← layer-level; Proposer renders to remote_state.yml │ ├── east/ -│ │ ├── remote_state.yml.j2 # ← Jinja2: rendered by Proposer +│ │ ├── tf-run.data # ← must contain REMOTE-STATE directive │ │ └── {workload}.tf.j2 # ← Jinja2: rendered by Proposer │ └── west/ -│ ├── remote_state.yml.j2 -│ └── {workload}.tf.j2 +│ ├── tf-run.data # ← must contain REMOTE-STATE directive +│ └── {workload}.tf.j2 # ← Jinja2: rendered by Proposer └── README.md ``` -Files ending in `.j2` are Jinja2 templates. The Proposer CodeBuild build renders -them using the product inputs and commits the rendered result (without the `.j2` -extension) to the new account repo branch. +**Key layout rules:** + +- `remote_state.yml.j2` lives at the **layer level** (`infrastructure/`, `common/`, `vpc/`), **not** inside workspace subdirectories. The Proposer's REMOTE-STATE processor derives each workspace's `remote_state.yml` from the layer-level file by appending `/{workspace_name}` to the `directory` field — identical to what `tf-run.sh` does at apply time. +- Each workspace directory (`east/`, `west/`, `global/`) **must** include a `tf-run.data` file with a `REMOTE-STATE` directive so the Proposer knows to generate its `remote_state.yml`. +- The `.auto.tfvars.j2` file must render `profile = "..."` and `region = "..."` entries at the top level — `tf-run.sh` auto-discovers profile and region by grepping `*.tfvars`, so these values must be present for placeholder substitution (`%%REGION%%`, `%%PROFILE%%`, etc.) to work correctly. 
+- Files ending in `.j2` are Jinja2 templates. The Proposer renders them using the product input variables and commits the result (without the `.j2` extension) to the work branch. The `.j2` source files are **not** committed. --- @@ -139,11 +144,29 @@ The Proposer CodeBuild build (started by the Lambda via `codebuild:StartBuild`) 1. Clone the template repo (full repo or `source_path` subdirectory) 2. For each `.j2` file found: - Render it using `jinja2.Environment` with the product input variables - - Write the rendered output alongside the source file (without `.j2` extension) - - Remove the `.j2` source file from the working tree -3. Add rendered `remote_state.yml` files using actual account/bucket values -4. Write `.sc-automation.yml` to the repo root if it does not already exist on `main` -5. Commit all rendered files to a new branch (`proposal/{timestamp}`) and open a PR + - Write the rendered output to the same relative path (without `.j2` extension) +3. Write any `EXTRA_FILES` entries (direct path → content map; overrides template output) +4. **REMOTE-STATE processing** — for every `tf-run.data` with a `REMOTE-STATE` directive: + - Read the layer-level `remote_state.yml` (e.g. `infrastructure/remote_state.yml`) + - Append `/{workspace_basename}` to the `directory` field via regex substitution + - Write the result as `remote_state.yml` in the workspace directory (e.g. `infrastructure/west/remote_state.yml`) + - This is the same transformation `tf-run.sh` performs at apply time for the `REMOTE-STATE` directive +5. 
**`tf-directory-setup.py` bootstrap** — for every workspace directory that now has a `remote_state.yml`: + - Run `tf-directory-setup.py --link none` to generate: + - `remote_state.backend.tf` — the S3 backend configuration block + - `remote_state.{dir}.tf.s3` — S3-backed remote state variant + - `remote_state.{dir}.tf.local` — local state file variant + - `remote_state.{dir}.tf.none` — empty no-op stub (active on first propose) + - Symlink `remote_state.{dir}.tf → remote_state.{dir}.tf.none` + - `--link none` is the correct bootstrap value: Terraform state does not exist yet for a new workspace + - After a successful `tf apply` in the Executor, the `tf-run.data` `COMMAND tf-directory-setup.py --link s3` step re-links to `.s3` +6. Write `.sc-automation.yml` to the repo root if it does not already exist on `main` +7. Commit all files (rendered templates + generated state bootstrap files) to a work branch and open a PR + +> **Principle: the PR diff is the complete truth.** Every file the Executor will find at +> apply time must already be committed in the Proposer PR. Neither `REMOTE-STATE` nor +> `tf-directory-setup.py` should create new files during `tf-run apply` — those steps become +> idempotent re-generations of files already in the repo. The PR is reviewed by a platform engineer before merging. On merge, the webhook handler reads `.sc-automation.yml` and automatically starts the executor CodeBuild build. 
From da7cc4e4d8e54be6780c4127672d212ca42886cc Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Wed, 20 May 2026 13:37:54 -0400 Subject: [PATCH 15/27] feat: executor commit-back; terraform_latest; plugin cache; lock file; plan gitignore buildspec-executor.yml: - INSTALL: create terraform_latest symlink -> terraform (account repos use TFCOMMAND=terraform_latest) - INSTALL: mkdir /data/terraform/terraform.d/plugin-cache + providers (required by .tf-control.tfrc) - BUILD: after tf-run apply, git add symlink re-link + .terraform.lock.hcl and push directly to main with [skip ci] to prevent webhook re-trigger - Add CodeBuild cache block for /data/terraform/terraform.d/plugin-cache (persists provider archives across builds via S3) - Add log note: logs/ is ephemeral, must be in .gitignore docs/HOW-IT-WORKS.md: - INSTALL phase: document terraform_latest alias and /data/terraform dir creation - BUILD phase step 5: document symlink re-link + lock file commit-back with rationale docs/template-management.md: - Template structure: add .gitignore and .terraform.lock.hcl to workspace dirs - Layout rules: add .gitignore required entries (logs/, .terraform/, tfstate*) - Layout rules: explain .terraform.lock.hcl lifecycle (committed, Executor updates + pushes back) - Layout rules: explain terraform_latest alias and plugin cache/.tf-control.tfrc behavior --- buildspec-executor.yml | 45 +++++++++++++++++++++++++++++++++++++ docs/HOW-IT-WORKS.md | 7 ++++++ docs/template-management.md | 14 ++++++++++++ 3 files changed, 66 insertions(+) diff --git a/buildspec-executor.yml b/buildspec-executor.yml index e9d6629..49e1d50 100644 --- a/buildspec-executor.yml +++ b/buildspec-executor.yml @@ -64,6 +64,16 @@ phases: for action in init plan apply destroy refresh output validate import state fmt taint console; do ln -sf /usr/local/bin/tf-control.sh /usr/local/bin/tf-${action}; done + # Account repo .tf-control files set TFCOMMAND=terraform_latest (the Census workstation alias). 
+ # In CodeBuild the binary is just 'terraform'; create the alias so tf-control.sh resolves it. + - ln -sf /usr/local/bin/terraform /usr/local/bin/terraform_latest + + # --- Plugin cache directory (referenced by .tf-control.tfrc in every account repo) --- + # .tf-control.tfrc sets plugin_cache_dir = "/data/terraform/terraform.d/plugin-cache" + # and filesystem_mirror path = "/data/terraform/terraform.d/providers". + # Create both so Terraform does not error on init; the mirror is empty so Terraform + # falls through to the 'direct' block in the tfrc (via Census proxy to registry.terraform.io). + - mkdir -p /data/terraform/terraform.d/plugin-cache /data/terraform/terraform.d/providers # --- Python deps for tf-directory-setup.py --- - pip3 install --quiet python-dateutil pyyaml @@ -118,6 +128,9 @@ phases: # The Proposer already ran both of these and committed the results in the PR. # When tf-run hits these steps here they are idempotent: they overwrite files # that already exist with identical content. No new files are created at apply time. + # + # NOTE on logs/: tf-control.sh writes every plan/apply to logs/{action}.{timestamp}.log. + # This directory is ephemeral (never committed). Ensure logs/ is in .gitignore. - cd "${LAYER}/${REGION_DIR}" - | if [ "${DRY_RUN}" = "true" ]; then @@ -129,8 +142,40 @@ phases: TFARGS="-auto-approve" tf-run apply fi + # --- Commit post-apply file changes back to main --- + # After a successful apply tf-run.data typically runs: + # COMMAND tf-directory-setup.py --link s3 + # which re-links remote_state.{dir}.tf from .tf.none → .tf.s3. + # terraform init also generates/updates .terraform.lock.hcl. + # Both of these changes must be committed back to main so: + # (a) the repo reflects actual state for future Proposer re-renders + # (b) subsequent tf-init on main does not re-download all providers + # [skip ci] prevents the push from re-triggering the webhook executor. 
+ - cd "${CODEBUILD_SRC_DIR}/repo" + - | + git add -A -- "${LAYER}/${REGION_DIR}/remote_state."* \ + "${LAYER}/${REGION_DIR}/.terraform.lock.hcl" 2>/dev/null || true + if ! git diff --cached --quiet; then + git -c user.email="sc-automation@census.gov" \ + -c user.name="SC Automation" \ + commit -m "chore: executor post-apply update ${LAYER}/${REGION_DIR} [skip ci]" + git push \ + "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" \ + HEAD:main + echo "Committed and pushed post-apply changes to main" + else + echo "No post-apply file changes to commit" + fi + post_build: commands: - echo "BUILD_RESULT=${CODEBUILD_BUILD_SUCCEEDING}" - echo "ACCOUNT_REPO=${ACCOUNT_REPO}" - echo "LAYER=${LAYER} REGION_DIR=${REGION_DIR}" + +cache: + paths: + # Cache the provider plugin cache across builds for faster tf-init. + # Providers downloaded via Census proxy are stored here; subsequent builds + # skip re-downloading providers that haven't changed. + - /data/terraform/terraform.d/plugin-cache/**/* diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md index 5bb5b5b..b507b94 100644 --- a/docs/HOW-IT-WORKS.md +++ b/docs/HOW-IT-WORKS.md @@ -320,6 +320,8 @@ The Lambda (webhook handler mode): - Clones `github.e.it.census.gov/terraform/support` for version governance - Downloads Terraform binary from S3 (version governed by `VERSION_TF`) - Installs tf-run toolchain scripts from the support repo +- Creates `terraform_latest` symlink → `terraform` (account repos set `TFCOMMAND=terraform_latest` in `.tf-control`) +- Creates `/data/terraform/terraform.d/plugin-cache/` and `/data/terraform/terraform.d/providers/` (required by `.tf-control.tfrc` `plugin_cache_dir` and `filesystem_mirror` directives) - Downloads and installs Census CA cert - Downloads and installs `gh` CLI - `pip3 install python-dateutil pyyaml` @@ -333,6 +335,11 @@ The Lambda (webhook handler mode): 3. `cd ${LAYER}/${REGION_DIR}` 4. 
If `DRY_RUN=true`: `tf-run plan`; else: `tf-run apply` (with optional `--start-tag ${TF_RUN_START_TAG}`) +5. **Commit post-apply changes back to `main`** — two categories of files change after a successful apply: + - **Symlink re-link**: `tf-run.data` typically contains `COMMAND tf-directory-setup.py --link s3` which changes the `remote_state.{dir}.tf` symlink from `.tf.none` → `.tf.s3`. This must be pushed back so future Proposer re-renders see the correct active variant. + - **Lock file update**: `tf-init` generates or updates `.terraform.lock.hcl` if provider constraints change. This must be pushed back so subsequent runs do not re-resolve providers. + - These are committed directly to `main` with `[skip ci]` in the message to prevent the webhook from re-triggering the Executor. No PR is needed: these are operational metadata, not infrastructure config changes. + - If `git diff --cached` is empty (DRY_RUN or no changes), the commit step is skipped cleanly. ### 6. CodeBuild - POST_BUILD phase diff --git a/docs/template-management.md b/docs/template-management.md index 3cbcb93..d1092c4 100644 --- a/docs/template-management.md +++ b/docs/template-management.md @@ -85,6 +85,7 @@ output is compatible with the `tf-run` toolchain and `tf-directory-setup.py`: ``` template-{product_type}/ +├── .gitignore # must exclude logs/ .terraform/ terraform.tfstate* ├── .tf-control # tf-run toolchain version pin ├── .tf-control.tfrc # Terraform provider cache config ├── region.tf # locals { region = var.region } @@ -99,9 +100,11 @@ template-{product_type}/ │ ├── remote_state.yml.j2 # ← layer-level; Proposer renders to remote_state.yml │ ├── east/ │ │ ├── tf-run.data # ← must contain REMOTE-STATE directive +│ │ ├── .terraform.lock.hcl # ← committed; Executor updates and pushes back to main │ │ └── {workload}.tf.j2 # ← Jinja2: rendered by Proposer │ └── west/ │ ├── tf-run.data # ← must contain REMOTE-STATE directive +│ ├── .terraform.lock.hcl # ← committed; Executor updates and 
pushes back to main │ └── {workload}.tf.j2 # ← Jinja2: rendered by Proposer └── README.md ``` @@ -111,6 +114,17 @@ template-{product_type}/ - `remote_state.yml.j2` lives at the **layer level** (`infrastructure/`, `common/`, `vpc/`), **not** inside workspace subdirectories. The Proposer's REMOTE-STATE processor derives each workspace's `remote_state.yml` from the layer-level file by appending `/{workspace_name}` to the `directory` field — identical to what `tf-run.sh` does at apply time. - Each workspace directory (`east/`, `west/`, `global/`) **must** include a `tf-run.data` file with a `REMOTE-STATE` directive so the Proposer knows to generate its `remote_state.yml`. - The `.auto.tfvars.j2` file must render `profile = "..."` and `region = "..."` entries at the top level — `tf-run.sh` auto-discovers profile and region by grepping `*.tfvars`, so these values must be present for placeholder substitution (`%%REGION%%`, `%%PROFILE%%`, etc.) to work correctly. +- `.gitignore` **must** contain at minimum: + ``` + logs/ + .terraform/ + terraform.tfstate + terraform.tfstate.backup + ``` + `logs/` is where `tf-control.sh` writes every plan/apply log. These are ephemeral and must never be committed. `.terraform/` caches the provider plugins locally during a run and must not be committed (only `.terraform.lock.hcl` is committed). +- `.terraform.lock.hcl` is the [dependency lock file](https://developer.hashicorp.com/terraform/language/files/dependency-lock) and **must be committed**. The template should include an initial lock file generated from the workspace's required providers. The Executor runs `tf-init` which updates it if providers change, then commits the update directly back to `main` (bypassing the PR flow, tagged `[skip ci]`). +- `.tf-control` sets `TFCOMMAND=terraform_latest` (the Census workstation alias). The Executor buildspec creates a `terraform_latest` symlink pointing to the installed `terraform` binary so `tf-control.sh` resolves it correctly. 
+- `.tf-control.tfrc` sets `plugin_cache_dir = "/data/terraform/terraform.d/plugin-cache"` and a `filesystem_mirror` at `/data/terraform/terraform.d/providers`. The Executor buildspec creates both directories. The `filesystem_mirror` path starts empty so Terraform falls through to the `direct {}` block — providers are fetched via the Census proxy and then cached in `plugin_cache_dir` for the remainder of the build. The plugin cache directory is also configured as a CodeBuild S3 cache path so provider archives persist across builds. - Files ending in `.j2` are Jinja2 templates. The Proposer renders them using the product input variables and commits the result (without the `.j2` extension) to the work branch. The `.j2` source files are **not** committed. --- From ca5193124896b0e0a81fc8c8cf5da922f77907db Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Wed, 20 May 2026 13:57:10 -0400 Subject: [PATCH 16/27] docs: template repos are delta overlays, not full account repo scaffolds Core principle: account repos already carry .tf-control, .tf-control.tfrc, region.tf, credentials.d/, variables.d/ from initial setup. Template repos provide only the workload-specific delta (new .tf.j2 files + tf-run.data). 
Changes: - Rewrite template-management.md opening to explain delta-overlay model and why duplicating standard files would break reusability - Minimal real example: template-s3-bucket is 3 files total - New-layer case: layer-level remote_state.yml provided via EXTRA_FILES (Lambda Pydantic model builds it from SC form inputs), not from template - Remove .tf-control, .tf-control.tfrc, region.tf, credentials.d/, variables.d/ from template structure diagram (wrong/environment-specific) - Remove outdated Lambda template organization section (old EKS-only model) - Replace stale Executor section (was: renders templates + opens PRs) with correct model (runs tf-run apply only, commits lock+symlink back) - Fix Adding checklist: delta files only + EXTRA_FILES note for new layers --- docs/template-management.md | 389 ++++++++++++++++++------------------ 1 file changed, 189 insertions(+), 200 deletions(-) diff --git a/docs/template-management.md b/docs/template-management.md index d1092c4..4a08803 100644 --- a/docs/template-management.md +++ b/docs/template-management.md @@ -1,189 +1,215 @@ # Template Management -**Ported from:** `lambda-template-repo-generator/design-docs/CUSTOM_TEMPLATES.MD` -**Updated for:** sc-lambda-ghactions (CodeBuild-based initial rollout; GHA planned for later) - This document describes how template repositories are structured and consumed by -the sc-lambda-ghactions system to create new account repos for any Terraform workload. +the sc-lambda-ghactions system to add new workloads to existing account repos. --- -## Template Sources +## Core Principle: Templates are Delta Overlays + +Template repos do **not** contain a full account repo scaffold. 
Account repos +already carry all of the standard boilerplate from their initial setup: + +``` +{account-id}-{alias}/ +├── .tf-control # already there — toolchain version pin +├── .tf-control.tfrc # already there — plugin cache / provider mirror +├── .gitignore # already there +├── region.tf # already there +├── credentials.d/ # already there — per-region AWS credential files +├── variables.d/ # already there — profile + region auto.tfvars +├── common/ # existing layer with remote_state.yml, variables, etc. +├── infrastructure/ # existing layer ... +│ ├── remote_state.yml # already there — account-specific bucket/profile/account_id +│ ├── variables.common.tf # already there +│ └── west/ # existing workspace ... +└── vpc/ # existing layer ... +``` -### Full Repository Templates +A template repo provides **only the new files** the Proposer writes into +that existing structure. If the template were to include `.tf-control`, +`region.tf`, `credentials.d/`, or `variables.d/`, it would: -The standard approach: a GHE repository is used as the template. When the Lambda -Proposer build runs, it clones the template repo verbatim and renders Jinja2 -configuration files on top of it before committing to the new account repo branch. +- **Overwrite working account-specific values** with placeholders or wrong defaults +- Be **non-reusable** across accounts (different profiles, regions, account IDs) +- Duplicate governance already managed by the `terraform/support` repo -**Convention:** template repos are named `template-{product_type}` under `SCT-Engineering/`. +--- -| Product type | Template repo | -|---|---| -| `eks_cluster` | `SCT-Engineering/template-eks-cluster` | -| `s3_bucket` | `SCT-Engineering/template-s3-bucket` *(planned)* | -| `{any_type}` | `SCT-Engineering/template-{any_type}` | +## What Belongs in a Template Repo -### Subdirectory Templates +A template repo contains only the workload-specific delta: -For product families that share significant infrastructure (e.g. 
multiple tiers -of the same workload), a single template repo can contain multiple subdirectory -templates. The Proposer build accepts a `source_path` parameter to clone only -the relevant subdirectory into the new account repo. +``` +template-{product_type}/ +├── {layer}/ +│ └── {workspace}/ +│ ├── {workload}.tf.j2 # workload resources — rendered by Proposer +│ └── tf-run.data # apply step sequence for this workspace +└── .sc-automation.yml.j2 # optional: Proposer writes this if absent +``` -Example: a `template-terraform-workloads` repo with: +### Minimal real example — `template-s3-bucket` ``` -template-terraform-workloads/ -├── eks-cluster/ # Standard EKS cluster template -├── eks-cluster-minimal/ # Reduced-footprint cluster variant -├── s3-standard/ # Standard S3 bucket configuration -└── s3-encrypted/ # S3 with custom KMS key configuration +template-s3-bucket/ +├── infrastructure/ +│ └── west/ +│ ├── INF.s3-standard.tf.j2 # S3 bucket + policy resources +│ └── tf-run.data # REMOTE-STATE + tf-directory-setup + ALL +└── .sc-automation.yml.j2 ``` -A product that specifies `source_path: eks-cluster-minimal` will clone only that -subdirectory, stripped of the parent path prefix. +That is the entire template. Nothing else. The account repo already provides +the execution context: Terraform binary version, plugin cache, proxy settings, +provider config, region, credentials, and the layer-level `remote_state.yml` +from which the workspace `remote_state.yml` is derived. + +### When the target layer does not yet exist + +If the workload requires adding a **brand-new layer** to the account repo +(e.g. adding `infrastructure/` to an account that only has `common/`), the +template still does not provide the layer-level `remote_state.yml`. 
Instead, +the Lambda's Pydantic model builds it from SC form inputs and passes it via +`EXTRA_FILES`: + +```python +# Inside the Lambda handler for this product type: +extra_files = { + f"{layer}/remote_state.yml": render_remote_state_yml( + directory=layer, + account_id=req.aws_account_id, + account_alias=req.account_alias, + bucket=f"inf-tfstate-{req.aws_account_id}", + bucket_region="us-gov-east-1", + profile=f"{req.aws_account_id}-{req.account_alias}", + region=req.aws_region, + aws_environment="gov", + ) +} +``` + +`EXTRA_FILES` are written by the Proposer **after** template rendering, so they +can never be accidentally provided by the template repo. The account-specific +values come from the validated Pydantic model, not from a `.j2` file. --- -## CFN Product Template Usage +## Template Repository Conventions -### Full repository (no source_path) +### `tf-run.data` — required in every workspace the template touches + +Every workspace directory added by the template must include a `tf-run.data` +with at minimum: -```yaml -Resources: - MyAccountRepo: - Type: Custom::TerraformRepo - Properties: - ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:sc-template-automation" - product_type: eks_cluster - project_name: !Ref ProjectName - environment: !Ref Environment - aws_account_id: !Sub "${AWS::AccountId}" - aws_region: !Sub "${AWS::Region}" +``` +VERSION 1.0 +REMOTE-STATE +COMMAND tf-directory-setup.py --link none +TAG apply-start +ALL ``` -### Subdirectory template +- `REMOTE-STATE` instructs the Proposer to derive the workspace `remote_state.yml` + from the layer-level one (appending `/{workspace_name}` to `directory`). +- `COMMAND tf-directory-setup.py --link none` causes the Proposer to generate + `remote_state.backend.tf` + the three variant files. `--link none` is the + bootstrap state; the Executor re-links to `--link s3` after first apply. 
+- `TAG apply-start` lets an operator re-run from this point without re-running + the setup directives. -```yaml -Properties: - ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:sc-template-automation" - product_type: s3_bucket - source_path: s3-encrypted # ← subdirectory within the template repo - project_name: !Ref ProjectName - environment: !Ref Environment - aws_account_id: !Sub "${AWS::AccountId}" - aws_region: !Sub "${AWS::Region}" -``` +### `.sc-automation.yml.j2` — optional ---- +If the template includes `.sc-automation.yml.j2`, the Proposer renders and +commits it. If absent, the Proposer writes a default `.sc-automation.yml` +using the product type and executor project from the Lambda's `TfRunRequest` +model. Either way, the file ends up at the repo root on `main` after merge. -## Template Repository Structure +### `.terraform.lock.hcl` — include if possible -Every template repo must follow the standard account repo layout so the rendered -output is compatible with the `tf-run` toolchain and `tf-directory-setup.py`: +If the template is authored for a known provider set (e.g. `hashicorp/aws`), +include a pre-generated `.terraform.lock.hcl` in each workspace directory. +This avoids a from-scratch provider resolution on first `tf-init` and gives +reviewers visibility into the locked provider versions. 
-``` -template-{product_type}/ -├── .gitignore # must exclude logs/ .terraform/ terraform.tfstate* -├── .tf-control # tf-run toolchain version pin -├── .tf-control.tfrc # Terraform provider cache config -├── region.tf # locals { region = var.region } -├── credentials.d/ -│ ├── us-gov-east-1.credentials.tf -│ └── us-gov-west-1.credentials.tf -├── variables.d/ -│ ├── variables.common.tf -│ └── variables.tfstate.tf -│ └── {region}.variables.common.auto.tfvars.j2 # ← must emit profile + region keys -├── infrastructure/ -│ ├── remote_state.yml.j2 # ← layer-level; Proposer renders to remote_state.yml -│ ├── east/ -│ │ ├── tf-run.data # ← must contain REMOTE-STATE directive -│ │ ├── .terraform.lock.hcl # ← committed; Executor updates and pushes back to main -│ │ └── {workload}.tf.j2 # ← Jinja2: rendered by Proposer -│ └── west/ -│ ├── tf-run.data # ← must contain REMOTE-STATE directive -│ ├── .terraform.lock.hcl # ← committed; Executor updates and pushes back to main -│ └── {workload}.tf.j2 # ← Jinja2: rendered by Proposer -└── README.md +If omitted, the Executor generates it on first `tf-init` and commits it back +to `main` (tagged `[skip ci]`). + +--- + +## CFN Product Template Usage + +```yaml +Resources: + WorkloadRepo: + Type: Custom::TfRunPropose + Properties: + ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:sc-template-automation" + product_type: s3_bucket + account_repo: !Ref AccountRepo # e.g. 229685449397-csvd-dev-platform-dev-gov + layer: infrastructure + region_dir: west + aws_account_id: !Sub "${AWS::AccountId}" + aws_region: !Sub "${AWS::Region}" + # product-type-specific inputs (vary by Pydantic model): + bucket_name: !Ref BucketName + versioning_enabled: "true" ``` -**Key layout rules:** - -- `remote_state.yml.j2` lives at the **layer level** (`infrastructure/`, `common/`, `vpc/`), **not** inside workspace subdirectories. 
The Proposer's REMOTE-STATE processor derives each workspace's `remote_state.yml` from the layer-level file by appending `/{workspace_name}` to the `directory` field — identical to what `tf-run.sh` does at apply time. -- Each workspace directory (`east/`, `west/`, `global/`) **must** include a `tf-run.data` file with a `REMOTE-STATE` directive so the Proposer knows to generate its `remote_state.yml`. -- The `.auto.tfvars.j2` file must render `profile = "..."` and `region = "..."` entries at the top level — `tf-run.sh` auto-discovers profile and region by grepping `*.tfvars`, so these values must be present for placeholder substitution (`%%REGION%%`, `%%PROFILE%%`, etc.) to work correctly. -- `.gitignore` **must** contain at minimum: - ``` - logs/ - .terraform/ - terraform.tfstate - terraform.tfstate.backup - ``` - `logs/` is where `tf-control.sh` writes every plan/apply log. These are ephemeral and must never be committed. `.terraform/` caches the provider plugins locally during a run and must not be committed (only `.terraform.lock.hcl` is committed). -- `.terraform.lock.hcl` is the [dependency lock file](https://developer.hashicorp.com/terraform/language/files/dependency-lock) and **must be committed**. The template should include an initial lock file generated from the workspace's required providers. The Executor runs `tf-init` which updates it if providers change, then commits the update directly back to `main` (bypassing the PR flow, tagged `[skip ci]`). -- `.tf-control` sets `TFCOMMAND=terraform_latest` (the Census workstation alias). The Executor buildspec creates a `terraform_latest` symlink pointing to the installed `terraform` binary so `tf-control.sh` resolves it correctly. -- `.tf-control.tfrc` sets `plugin_cache_dir = "/data/terraform/terraform.d/plugin-cache"` and a `filesystem_mirror` at `/data/terraform/terraform.d/providers`. The Executor buildspec creates both directories. 
The `filesystem_mirror` path starts empty so Terraform falls through to the `direct {}` block — providers are fetched via the Census proxy and then cached in `plugin_cache_dir` for the remainder of the build. The plugin cache directory is also configured as a CodeBuild S3 cache path so provider archives persist across builds. -- Files ending in `.j2` are Jinja2 templates. The Proposer renders them using the product input variables and commits the result (without the `.j2` extension) to the work branch. The `.j2` source files are **not** committed. +The Lambda's Pydantic model for `s3_bucket` validates the product-specific +inputs and builds `TEMPLATE_VARS` + any `EXTRA_FILES` (e.g. a new +`remote_state.yml` if the layer doesn't exist). The template repo supplies +only the generic `.tf.j2` and `tf-run.data`; the Lambda supplies all +environment-specific values. --- -## Jinja2 Template Organization in the Lambda +## Subdirectory Templates -Rendered templates are stored in the Lambda image under `lambda/templates/{product_type}/`: +A single template repo can contain multiple product variants as subdirectories. +The Lambda passes `source_path` to the Proposer to clone only the relevant subtree: ``` -lambda/templates/ -├── eks_cluster/ -│ ├── infrastructure/west/cluster.tf.j2 -│ ├── infrastructure/east/cluster.tf.j2 -│ └── ... -├── s3_bucket/ # ← new product type: add a directory here -│ ├── infrastructure/west/s3.tf.j2 -│ └── ... -└── {product_type}/ # ← pattern for future types +template-s3/ +├── standard/ +│ └── infrastructure/west/INF.s3-standard.tf.j2 +└── encrypted/ + └── infrastructure/west/INF.s3-encrypted.tf.j2 ``` -The Lambda dispatcher maps `product_type` → template directory automatically. -Adding a new product type requires only adding a new subdirectory here, a -Pydantic model, and a CFN product template — no Lambda plumbing changes. 
+A product that specifies `source_path: encrypted` copies only +`infrastructure/west/INF.s3-encrypted.tf.j2` into the account repo. --- -## Proposer Build — Template Copying Logic - -The Proposer CodeBuild build (started by the Lambda via `codebuild:StartBuild`) performs these steps: - -1. Clone the template repo (full repo or `source_path` subdirectory) -2. For each `.j2` file found: - - Render it using `jinja2.Environment` with the product input variables - - Write the rendered output to the same relative path (without `.j2` extension) -3. Write any `EXTRA_FILES` entries (direct path → content map; overrides template output) -4. **REMOTE-STATE processing** — for every `tf-run.data` with a `REMOTE-STATE` directive: - - Read the layer-level `remote_state.yml` (e.g. `infrastructure/remote_state.yml`) - - Append `/{workspace_basename}` to the `directory` field via regex substitution - - Write the result as `remote_state.yml` in the workspace directory (e.g. `infrastructure/west/remote_state.yml`) - - This is the same transformation `tf-run.sh` performs at apply time for the `REMOTE-STATE` directive -5. **`tf-directory-setup.py` bootstrap** — for every workspace directory that now has a `remote_state.yml`: - - Run `tf-directory-setup.py --link none` to generate: - - `remote_state.backend.tf` — the S3 backend configuration block - - `remote_state.{dir}.tf.s3` — S3-backed remote state variant - - `remote_state.{dir}.tf.local` — local state file variant - - `remote_state.{dir}.tf.none` — empty no-op stub (active on first propose) - - Symlink `remote_state.{dir}.tf → remote_state.{dir}.tf.none` - - `--link none` is the correct bootstrap value: Terraform state does not exist yet for a new workspace - - After a successful `tf apply` in the Executor, the `tf-run.data` `COMMAND tf-directory-setup.py --link s3` step re-links to `.s3` -6. Write `.sc-automation.yml` to the repo root if it does not already exist on `main` -7. 
Commit all files (rendered templates + generated state bootstrap files) to a work branch and open a PR - -> **Principle: the PR diff is the complete truth.** Every file the Executor will find at -> apply time must already be committed in the Proposer PR. Neither `REMOTE-STATE` nor -> `tf-directory-setup.py` should create new files during `tf-run apply` — those steps become -> idempotent re-generations of files already in the repo. - -The PR is reviewed by a platform engineer before merging. On merge, the webhook -handler reads `.sc-automation.yml` and automatically starts the executor CodeBuild build. +## Proposer Build — What It Does + +The Proposer CodeBuild build clones the **existing account repo** and writes the +template delta on top of it. Steps in order: + +1. `git clone` the account repo; `git checkout -B ${GIT_BRANCH}` +2. If `TEMPLATE_REPO` is set: clone it (optionally at `source_path`), render all `.j2` + files with Jinja2 `StrictUndefined`, copy non-`.j2` files as-is, all at the same + relative paths. Account repo files that the template does not touch are left unchanged. +3. Write any `EXTRA_FILES` entries (path → content map from the Lambda model; overrides + template output). Typical use: new layer-level `remote_state.yml` when the target + layer does not yet exist in the account repo. +4. **REMOTE-STATE bootstrap** — for every `tf-run.data` found that contains a `REMOTE-STATE` + directive: read the layer-level `remote_state.yml` already present in the account repo + (or just written via `EXTRA_FILES`), append `/{workspace_name}` to the `directory` field, + write the result as `remote_state.yml` in the workspace directory. This mirrors exactly + what `tf-run.sh` does at apply time. +5. 
**`tf-directory-setup.py --link none`** — for every workspace directory that now has a + `remote_state.yml`, run `tf-directory-setup.py --link none` to generate: + - `remote_state.backend.tf` — S3 backend block + - `remote_state.{dir}.tf.s3` / `.local` / `.none` variant files + - Symlink `remote_state.{dir}.tf → remote_state.{dir}.tf.none` (bootstrap state) +6. Write `.sc-automation.yml` at the repo root if absent on `main`. +7. `git add -A && git commit && git push && gh pr create` + +> **Principle: the PR diff is the complete truth.** Every file the Executor will see +> at apply time is committed in the Proposer PR. The Executor never silently creates +> files; its `REMOTE-STATE` and `tf-directory-setup.py` steps are idempotent overwrites. --- @@ -223,63 +249,25 @@ variables: # Extra key/value pairs injected as CodeBuild --- -## Executor Build — Injecting into an Existing Account Repo - -After a platform engineer merges the Proposer PR into `main`, the sc-lambda-ghactions -webhook fires and starts the **Executor** CodeBuild build. The Executor handles -both the initial `tf plan`/`tf apply` run and any subsequent re-render of existing repos. - -### What the Executor Does - -``` -webhook (PR merged to main) - └─> Lambda reads .sc-automation.yml from main - └─> Lambda starts Executor CodeBuild build via StartBuild - environmentVariablesOverride: - REPO_NAME, PRODUCT_TYPE, DRY_RUN, TEMPLATE_REPO, ... 
- -Executor buildspec: - INSTALL: - - Install Terraform from S3 assets bucket - - Install Census CA cert, set HTTPS_PROXY - - git clone {account_repo} (GHE token from Secrets Manager) - PRE_BUILD: - - Read .sc-automation.yml from cloned repo - - git clone {template_repo} into /tmp/template - BUILD: - - For each .j2 file in /tmp/template: - Render with Jinja2 using env vars as context - Write to account_repo at same relative path (no .j2 extension) - - git checkout -b update/{timestamp} - - git add -A && git commit - - git push - - gh pr create --title "Automated update: {product_type} {timestamp}" - - If dry_run == false: - tf init && tf apply -auto-approve - POST_BUILD: - - POST commit status to GHE (success/failure with CodeBuild log URL) -``` - -### Fleet Update (re-rendering an existing repo) +## Executor Build — What It Does -When a **template repo itself changes** — for example, an upstream HCL pattern is -updated — the fleet update flow (Flow 3) re-renders all account repos of that -`product_type`: +The Executor does **not** render templates or open PRs. It only runs Terraform. -1. `terraform-sc-fleet` lists all `workloads/{product_type}/*/main.tf` entries -2. Lambda starts one Executor build **per account repo** (fan-out) -3. Each Executor clones its account repo, re-renders all `.j2` files from the - updated template, commits to a new branch, and opens a PR -4. Platform engineers review and merge the PRs individually +After a platform engineer merges the Proposer PR to `main`: +1. GHE push webhook → Lambda reads `.sc-automation.yml` → starts `tf-run-executor` +2. Executor clones the account repo at `main` (all files already committed by the Proposer PR) +3. Optionally assumes cross-account IAM role (`TARGET_ACCOUNT_ID`) +4. `cd ${LAYER}/${REGION_DIR}`; runs `tf-run plan` or `tf-run apply` +5. 
After successful apply: commits `remote_state.{dir}.tf` symlink re-link + + `.terraform.lock.hcl` updates directly to `main` with `[skip ci]` -The Executor **never force-pushes to `main`** — every change goes through a PR, -preserving review gates regardless of whether `dry_run` is set. +The Executor does not touch any file that wasn't already committed in the PR. +It carries no template-repo knowledge and no Jinja2 dependencies. ### Idempotency -The Executor is safe to re-run. If the rendered output is identical to `main` -(`git diff --quiet`), it exits with no PR opened and reports a `SKIPPED` status -back to the Lambda. +The Executor is safe to re-run. If `tf-run apply` produces no infrastructure +changes and the post-apply file diff is empty, the commit step is skipped. --- @@ -295,14 +283,15 @@ back to the Lambda. --- -## Adding a New Template Repository +## Adding a New Product Type Checklist when onboarding a new product type: -- [ ] Create `SCT-Engineering/template-{product_type}` with standard account repo layout -- [ ] Add `.j2` files for each rendered configuration file -- [ ] Add `lambda/templates/{product_type}/` with corresponding Jinja2 templates -- [ ] Add a Pydantic model in `lambda/models/{product_type}.py` +- [ ] Create `SCT-Engineering/template-{product_type}` containing **only** the workload + delta: `{layer}/{workspace}/{workload}.tf.j2` + `tf-run.data` +- [ ] Add a Pydantic model in `lambda/models/{product_type}.py` that validates + product-specific inputs and builds `TEMPLATE_VARS` + any `EXTRA_FILES` + (e.g. 
layer-level `remote_state.yml` if the target layer may not exist yet) - [ ] Register the handler in `lambda/app.py` `PRODUCT_HANDLERS` table - [ ] Create a CFN product template in `service-catalog/{product_type}-product-template.yaml` - [ ] Add the product to `terraform-service-catalog-census` (see [service-catalog-census-integration.md](service-catalog-census-integration.md)) From a16101ceaacdbbefbba8aedcbc22fcde61e0b820 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Wed, 20 May 2026 14:03:32 -0400 Subject: [PATCH 17/27] feat: flat template repos; Proposer injects into LAYER/REGION_DIR Template repos no longer encode layer/workspace as directory nesting. LAYER and REGION_DIR are already known env vars - the Proposer uses them to determine the destination path in the account repo. buildspec-proposer.yml: - Add TEMPLATE_SOURCE_PATH env var (selects subdirectory variant within repo) - Rewrite template rendering: dst_root = LAYER/REGION_DIR/ instead of '.' - Dotfiles at template root (e.g. 
.sc-automation.yml) go to account repo root - Document flat layout convention in comments docs/template-management.md: - Rewrite What Belongs section: flat structure, show where files land - template-s3-bucket example is now 3 flat files (not nested infrastructure/west/) - TEMPLATE_SOURCE_PATH explained inline with multi-variant example - Remove old Subdirectory Templates section (replaced with inline example) - tf-run.data, .sc-automation.yml.j2, .terraform.lock.hcl notes updated docs/HOW-IT-WORKS.md: - BUILD phase step 3: document flat layout + dotfile root exception --- buildspec-proposer.yml | 36 +++++++++++--- docs/HOW-IT-WORKS.md | 8 ++-- docs/template-management.md | 95 +++++++++++++++++++++---------------- 3 files changed, 89 insertions(+), 50 deletions(-) diff --git a/buildspec-proposer.yml b/buildspec-proposer.yml index 624710b..28e457d 100644 --- a/buildspec-proposer.yml +++ b/buildspec-proposer.yml @@ -15,7 +15,8 @@ version: 0.2 # # Optional env-var overrides: # GIT_BRANCH - branch to commit/PR from (default: propose/sc-automation) -# TEMPLATE_REPO - GHE repo containing Jinja2/.tf template files +# TEMPLATE_REPO - GHE repo containing workload template files (flat layout) +# TEMPLATE_SOURCE_PATH - subdirectory within TEMPLATE_REPO to use as root (empty = whole repo) # TEMPLATE_VARS - JSON map of Jinja2 variables for template rendering # EXTRA_FILES - JSON map {"relative/path": "content"} written after template rendering # --------------------------------------------------------------------------- @@ -32,6 +33,7 @@ env: # Per-build defaults (overridden via environmentVariablesOverride in Lambda) GIT_BRANCH: "propose/sc-automation" TEMPLATE_REPO: "" + TEMPLATE_SOURCE_PATH: "" # subdirectory within TEMPLATE_REPO to use as root (empty = whole repo) TEMPLATE_VARS: "{}" EXTRA_FILES: "{}" @@ -74,8 +76,11 @@ phases: - git checkout -B "${GIT_BRANCH}" # --- Render template repo (if specified) --- - # Clone TEMPLATE_REPO, render .j2 files with TEMPLATE_VARS via 
Jinja2 StrictUndefined, - # copy non-template files as-is. Results land at the same relative paths in the account repo. + # Clone TEMPLATE_REPO; render .j2 files with TEMPLATE_VARS via Jinja2 StrictUndefined; + # copy non-template files as-is. + # Template files are FLAT (no layer/workspace nesting inside the template repo). + # They are written into ${LAYER}/${REGION_DIR}/ in the account repo, which is + # already known from the env vars supplied by the Lambda. - | if [ -n "${TEMPLATE_REPO}" ]; then git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${TEMPLATE_REPO}.git" /tmp/template-repo @@ -84,8 +89,19 @@ phases: from jinja2 import Environment, FileSystemLoader, StrictUndefined template_vars = json.loads(os.environ.get('TEMPLATE_VARS', '{}')) + layer = os.environ['LAYER'] + region_dir = os.environ['REGION_DIR'] + src_root = pathlib.Path('/tmp/template-repo') - dst_root = pathlib.Path('.') # already inside cloned account repo + # Flat template files land at LAYER/REGION_DIR/ in the account repo. + # source_path lets a single template repo hold multiple product variants + # as subdirectories; only that subdirectory is used as the source root. + source_path = os.environ.get('TEMPLATE_SOURCE_PATH', '').strip('/') + if source_path: + src_root = src_root / source_path + + dst_root = pathlib.Path('.') / layer / region_dir + dst_root.mkdir(parents=True, exist_ok=True) rendered = 0 copied = 0 @@ -93,8 +109,14 @@ phases: if src.is_dir() or any(part.startswith('.git') for part in src.parts): continue rel = src.relative_to(src_root) + # Files starting with '.' at the template root are written to the account + # repo root (e.g. .sc-automation.yml), not into LAYER/REGION_DIR/. 
+ if len(rel.parts) == 1 and rel.name.startswith('.'): + dst_base = pathlib.Path('.') + else: + dst_base = dst_root if src.suffix == '.j2': - dst = dst_root / rel.with_suffix('') + dst = dst_base / rel.with_suffix('') dst.parent.mkdir(parents=True, exist_ok=True) env = Environment( loader=FileSystemLoader(str(src.parent)), @@ -105,11 +127,11 @@ phases: dst.write_text(content) rendered += 1 else: - dst = dst_root / rel + dst = dst_base / rel dst.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dst) copied += 1 - print(f'Template repo: rendered {rendered} .j2 file(s), copied {copied} file(s)') + print(f'Template repo: rendered {rendered} .j2 file(s), copied {copied} file(s) -> {layer}/{region_dir}/') PYEOF else echo 'No TEMPLATE_REPO specified — skipping template rendering' diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md index b507b94..e81925c 100644 --- a/docs/HOW-IT-WORKS.md +++ b/docs/HOW-IT-WORKS.md @@ -241,9 +241,11 @@ TEMPLATE_REPO, TEMPLATE_VARS, EXTRA_FILES, GITHUB_TOKEN 1. Rewrite git remote URLs (`ssh://` → `https://`) using the GHE PAT 2. `git clone` the account repo; `git checkout -B ${GIT_BRANCH}` 3. If `TEMPLATE_REPO` is set: - - Clone the template repo - - Walk all files; render `.j2` files with Jinja2 (`StrictUndefined`) - - Copy rendered + non-template files into account repo at same relative paths + - Clone the template repo (at `TEMPLATE_SOURCE_PATH` subdirectory if set) + - Template files are **flat** — no `layer/workspace/` nesting inside the repo + - Render `.j2` files with Jinja2 (`StrictUndefined`); copy non-template files as-is + - All files land in `${LAYER}/${REGION_DIR}/` in the account repo + - Exception: dotfiles (`.sc-automation.yml`, etc.) go to the account repo root 4. If `EXTRA_FILES` is non-empty: - Parse the JSON dict; write each `path → content` entry directly (overrides templates) 5. 
**Bootstrap workspace state files** (all file generation must be in the Proposer PR): diff --git a/docs/template-management.md b/docs/template-management.md index 4a08803..4213fdd 100644 --- a/docs/template-management.md +++ b/docs/template-management.md @@ -38,32 +38,55 @@ that existing structure. If the template were to include `.tf-control`, ## What Belongs in a Template Repo -A template repo contains only the workload-specific delta: +A template repo is **flat**. It contains only the files that will be written +into `${LAYER}/${REGION_DIR}/` in the target account repo. The Proposer +already knows the destination path from the `LAYER` and `REGION_DIR` env vars +passed by the Lambda — there is no need to encode that in the template structure. ``` template-{product_type}/ -├── {layer}/ -│ └── {workspace}/ -│ ├── {workload}.tf.j2 # workload resources — rendered by Proposer -│ └── tf-run.data # apply step sequence for this workspace -└── .sc-automation.yml.j2 # optional: Proposer writes this if absent +├── {workload}.tf.j2 # workload resources — rendered into LAYER/REGION_DIR/ +├── tf-run.data # apply step sequence — copied into LAYER/REGION_DIR/ +└── .sc-automation.yml.j2 # optional — written to repo root (dotfiles are special-cased) ``` ### Minimal real example — `template-s3-bucket` ``` template-s3-bucket/ -├── infrastructure/ -│ └── west/ -│ ├── INF.s3-standard.tf.j2 # S3 bucket + policy resources -│ └── tf-run.data # REMOTE-STATE + tf-directory-setup + ALL +├── INF.s3-standard.tf.j2 # S3 bucket + policy resources +├── tf-run.data # REMOTE-STATE + tf-directory-setup + ALL └── .sc-automation.yml.j2 ``` -That is the entire template. Nothing else. The account repo already provides -the execution context: Terraform binary version, plugin cache, proxy settings, -provider config, region, credentials, and the layer-level `remote_state.yml` -from which the workspace `remote_state.yml` is derived. 
+When the Proposer runs with `LAYER=infrastructure REGION_DIR=west`, these three +files land at: +``` +{account-repo}/infrastructure/west/INF.s3-standard.tf +{account-repo}/infrastructure/west/tf-run.data +{account-repo}/.sc-automation.yml ← dotfiles go to repo root +``` + +The same template repo works unchanged for any account, any region, any layer. +No account-specific values. No directory nesting. + +### Multiple variants in one repo (`TEMPLATE_SOURCE_PATH`) + +When a template repo holds more than one product variant, use subdirectories +and set `TEMPLATE_SOURCE_PATH` to select the one to use: + +``` +template-s3/ +├── standard/ +│ ├── INF.s3-standard.tf.j2 +│ └── tf-run.data +└── encrypted/ + ├── INF.s3-encrypted.tf.j2 + └── tf-run.data +``` + +With `TEMPLATE_SOURCE_PATH=encrypted`, the Proposer uses `encrypted/` as the +root and the nesting is stripped — files still land flat in `LAYER/REGION_DIR/`. ### When the target layer does not yet exist @@ -97,10 +120,10 @@ values come from the validated Pydantic model, not from a `.j2` file. ## Template Repository Conventions -### `tf-run.data` — required in every workspace the template touches +### `tf-run.data` — required, placed at the workspace root -Every workspace directory added by the template must include a `tf-run.data` -with at minimum: +The template must include a `tf-run.data` at its root (it lands in +`LAYER/REGION_DIR/` after the Proposer copies it). Minimum content: ``` VERSION 1.0 @@ -118,19 +141,19 @@ ALL - `TAG apply-start` lets an operator re-run from this point without re-running the setup directives. -### `.sc-automation.yml.j2` — optional +### `.sc-automation.yml.j2` — optional, written to repo root -If the template includes `.sc-automation.yml.j2`, the Proposer renders and -commits it. If absent, the Proposer writes a default `.sc-automation.yml` -using the product type and executor project from the Lambda's `TfRunRequest` -model. 
Either way, the file ends up at the repo root on `main` after merge. +Files whose names start with `.` at the template root are written to the +**account repo root**, not into `LAYER/REGION_DIR/`. This is how +`.sc-automation.yml.j2` ends up at the right place without a separate +mechanism. If absent, the Proposer writes a default `.sc-automation.yml` +from the Lambda's `TfRunRequest` model. ### `.terraform.lock.hcl` — include if possible -If the template is authored for a known provider set (e.g. `hashicorp/aws`), -include a pre-generated `.terraform.lock.hcl` in each workspace directory. -This avoids a from-scratch provider resolution on first `tf-init` and gives -reviewers visibility into the locked provider versions. +Include a pre-generated `.terraform.lock.hcl` at the template root; it is meant to land +in `LAYER/REGION_DIR/` (note: the root-level dotfile rule above sends every template-root file whose name starts with `.` to the account repo root, so confirm the Proposer exempts the lock file before relying on this). This avoids a from-scratch provider resolution on +first `tf-init` and gives reviewers visibility into locked provider versions. If omitted, the Executor generates it on first `tf-init` and commits it back to `main` (tagged `[skip ci]`). --- @@ -164,21 +187,13 @@ environment-specific values. --- -## Subdirectory Templates - -A single template repo can contain multiple product variants as subdirectories. -The Lambda passes `source_path` to the Proposer to clone only the relevant subtree: - -``` -template-s3/ -├── standard/ -│ └── infrastructure/west/INF.s3-standard.tf.j2 -└── encrypted/ - └── infrastructure/west/INF.s3-encrypted.tf.j2 -``` +## Subdirectory Templates (`TEMPLATE_SOURCE_PATH`) -A product that specifies `source_path: encrypted` copies only -`infrastructure/west/INF.s3-encrypted.tf.j2` into the account repo. +See the [Multiple variants](#multiple-variants-in-one-repo-template_source_path) +section above. The `TEMPLATE_SOURCE_PATH` env var tells the Proposer which +subdirectory of the template repo to treat as the root. The selected subtree +is still rendered flat into `LAYER/REGION_DIR/` — the subdirectory path is +stripped entirely.
--- From 6728094d0733532229860be2143bfc448f5bc609 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Wed, 20 May 2026 14:05:36 -0400 Subject: [PATCH 18/27] docs: update generalized architecture to reflect flat delta-overlay template model - Template repos are flat: just .tf.j2 + tf-run.data, no nested layer/region dirs - LAYER and REGION_DIR are Proposer env vars; files are written to the correct path at copy time, not encoded in template directory structure - Remove lambda/templates/{product_type}/ tree (templates live in the template repo) - Layer-level remote_state.yml built by Lambda Pydantic model extra_files() from validated SC form inputs, not stored in template repo - Pydantic model example updated with account_alias field + extra_files() method - Onboarding checklist updated: no skeleton clone, no lambda/templates/ step --- ...eralized-terraform-product-architecture.md | 89 ++++++++++++++----- 1 file changed, 67 insertions(+), 22 deletions(-) diff --git a/docs/generalized-terraform-product-architecture.md b/docs/generalized-terraform-product-architecture.md index 71b9772..944c6ac 100644 --- a/docs/generalized-terraform-product-architecture.md +++ b/docs/generalized-terraform-product-architecture.md @@ -87,32 +87,54 @@ The following components require **no changes** to support new product types: ### 1. Template repo on GHE -Create a new repo under `SCT-Engineering/` (e.g. `template-s3-bucket`) that follows -the standard account repo directory layout. This repo is cloned by the executor -CodeBuild build and serves as the starting point for rendered files. +Create a new repo under `SCT-Engineering/` (e.g. `template-s3-bucket`) containing +**only the workload-specific files** — nothing else. 
-The template repo must contain: -- Standard `.tf-control`, `.tf-control.tfrc`, `region.tf`, `credentials.d/`, `variables.d/` -- Layer directories (`common/`, `infrastructure/`, `vpc/`) as applicable -- `remote_state.yml` stubs that the Proposer build will populate +Account repos already carry all standard scaffolding from initial setup: +`.tf-control`, `.tf-control.tfrc`, `region.tf`, `credentials.d/`, `variables.d/`, +and layer-level `remote_state.yml` files with account-specific values. +Duplicating any of that in a template repo would overwrite working values with +stubs and make the template non-reusable across accounts. + +A minimal template repo looks like: + +``` +template-s3-bucket/ +├── INF.s3-standard.tf.j2 # S3 bucket + policy resources +├── tf-run.data # REMOTE-STATE + tf-directory-setup + ALL steps +└── .sc-automation.yml.j2 # optional; Proposer writes a default if absent +``` + +The files are **flat**. `LAYER` and `REGION_DIR` are env vars already known to the +Proposer build — it writes the rendered files into `${ACCOUNT_REPO}/${LAYER}/${REGION_DIR}/` +at copy time. There is no reason to encode layer or region as directory structure +inside the template repo. + +If the target layer does not yet exist in the account repo, the Lambda Pydantic +model constructs the layer-level `remote_state.yml` from SC form inputs +(`account_id`, `account_alias`, `bucket`, `profile`, `region`) and passes it to +the Proposer via `EXTRA_FILES`. The template repo never carries this file. ### 2. Jinja2 templates -Add a new subdirectory under `lambda/templates/{product_type}/` containing the -`.tf.j2` and `.hcl.j2` files that are rendered by the Proposer build before being -committed to the new repo branch. +Jinja2 template files (`.tf.j2`) live **in the template repo itself** — flat, alongside +`tf-run.data`. There is no separate `lambda/templates/` directory tree. 
The Proposer +build clones the template repo and renders every `.j2` file it finds, writing the +result (minus the `.j2` extension) into `${LAYER}/${REGION_DIR}/` in the account repo. + +Example for an S3 product: ``` -lambda/templates/ -├── eks_cluster/ # existing -│ ├── infrastructure/west/cluster.tf.j2 -│ └── ... -├── s3_bucket/ # new -│ ├── infrastructure/west/s3.tf.j2 -│ └── ... -└── {future_product}/ # pattern +template-s3-bucket/ +├── INF.s3-standard.tf.j2 # rendered → infrastructure/west/INF.s3-standard.tf +├── tf-run.data +└── .sc-automation.yml.j2 ``` +Subdirectory variants (e.g. `standard/` vs `encrypted/`) are supported via the +`source_path` parameter — the Proposer copies only the named subdirectory's contents, +stripped of the subdirectory prefix. + ### 3. Pydantic config model Add a new model in `lambda/models/{product_type}.py`: @@ -123,7 +145,10 @@ class S3BucketConfig(BaseModel): bucket_name: str account_name: str aws_account_id: str + account_alias: str # used to build remote_state.yml profile field environment: Literal["dev", "test", "prod"] + layer: str = "infrastructure" + region_dir: str = "west" aws_region: str = "us-gov-west-1" versioning_enabled: bool = True lifecycle_days: int = 90 @@ -131,9 +156,27 @@ class S3BucketConfig(BaseModel): workload: str tier: str partition: str = "gov" + + def extra_files(self) -> dict[str, str]: + """Layer-level remote_state.yml — only needed if layer doesn't exist yet.""" + return { + f"{self.layer}/remote_state.yml": render_remote_state_yml( + directory=self.layer, + account_id=self.aws_account_id, + account_alias=self.account_alias, + bucket=f"inf-tfstate-{self.aws_account_id}", + bucket_region="us-gov-east-1", + profile=f"{self.aws_account_id}-{self.account_alias}", + region=self.aws_region, + aws_environment="gov", + ) + } ``` -The model enforces required fields and default values before any CodeBuild build is started. 
+The model enforces required fields and default values before any CodeBuild build +is started. The `extra_files()` method produces the layer-level `remote_state.yml` +from validated inputs — account-specific values stay in the Lambda model, not in +the template repo. ### 4. Lambda dispatcher @@ -195,9 +238,11 @@ existing EKS product config. The following checklist can be handed to a product team or platform engineer to onboard any new Terraform workload without Lambda or CodeBuild changes: -- [ ] Create `SCT-Engineering/template-{product_type}` repo from the standard account repo skeleton -- [ ] Add `lambda/templates/{product_type}/` with Jinja2 templates for each rendered file -- [ ] Add `lambda/models/{product_type}.py` with a Pydantic model defining required inputs +- [ ] Create `SCT-Engineering/template-{product_type}` containing **only** the workload + delta: flat `.tf.j2` file(s) + `tf-run.data` (+ optional `.sc-automation.yml.j2`) +- [ ] Add `lambda/models/{product_type}.py` with a Pydantic model defining required + inputs and an `extra_files()` method that builds the layer-level `remote_state.yml` + from validated SC form inputs - [ ] Register the handler in `lambda/app.py` `PRODUCT_HANDLERS` table - [ ] Create `service-catalog/{product_type}-product-template.yaml` CFN template - [ ] Add census config YAML and SC portfolio registration in `terraform-service-catalog-census` From 7f32318e4207cabe9b2abb5e1c88dea4cb43a7ee Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Wed, 20 May 2026 14:21:45 -0400 Subject: [PATCH 19/27] docs: handler.py lives in template repo; Lambda fetches at runtime --- ...eralized-terraform-product-architecture.md | 158 ++++++++++++------ docs/service-catalog-census-integration.md | 11 +- docs/template-management.md | 14 +- 3 files changed, 122 insertions(+), 61 deletions(-) diff --git a/docs/generalized-terraform-product-architecture.md b/docs/generalized-terraform-product-architecture.md index 944c6ac..4f8d663 100644 --- 
a/docs/generalized-terraform-product-architecture.md +++ b/docs/generalized-terraform-product-architecture.md @@ -135,71 +135,129 @@ Subdirectory variants (e.g. `standard/` vs `encrypted/`) are supported via the `source_path` parameter — the Proposer copies only the named subdirectory's contents, stripped of the subdirectory prefix. -### 3. Pydantic config model +### 3. `handler.py` in the template repo -Add a new model in `lambda/models/{product_type}.py`: +`handler.py` lives **at the root of the template repo** alongside the Jinja2 templates. +It is the single place that defines everything the Lambda needs to know about the product: +required inputs, defaults, and any computed `EXTRA_FILES`. No files inside the Lambda +repository are created or modified. + +#### Contract + +| Symbol | Type | Purpose | +|--------|------|---------| +| `PRODUCT_TYPE` | `str` | Unique key; must match the `product_type` field in the CFN Properties block | +| `handle(props: dict) -> dict` | callable | Receives normalized CFN props; returns (possibly modified) props ready for `TfRunRequest` | + +`handle()` is called before `TfRunRequest` is constructed. It should: +- Apply product-specific defaults (`layer`, `region_dir`, `template_repo`, …) +- Validate required inputs (via a Pydantic model or plain assertions) +- Inject computed `extra_files` entries (e.g. 
a layer-level `remote_state.yml`) + +#### Example — complete `handler.py` for an S3 bucket product ```python -class S3BucketConfig(BaseModel): - """Input model for S3 bucket SC product.""" +# template-s3-bucket/handler.py +from __future__ import annotations +from pydantic import BaseModel +from typing import Literal + +PRODUCT_TYPE = "s3_bucket" + + +class _Config(BaseModel): bucket_name: str account_name: str aws_account_id: str - account_alias: str # used to build remote_state.yml profile field + account_alias: str environment: Literal["dev", "test", "prod"] layer: str = "infrastructure" region_dir: str = "west" aws_region: str = "us-gov-west-1" - versioning_enabled: bool = True - lifecycle_days: int = 90 team: str workload: str tier: str - partition: str = "gov" - - def extra_files(self) -> dict[str, str]: - """Layer-level remote_state.yml — only needed if layer doesn't exist yet.""" - return { - f"{self.layer}/remote_state.yml": render_remote_state_yml( - directory=self.layer, - account_id=self.aws_account_id, - account_alias=self.account_alias, - bucket=f"inf-tfstate-{self.aws_account_id}", - bucket_region="us-gov-east-1", - profile=f"{self.aws_account_id}-{self.account_alias}", - region=self.aws_region, - aws_environment="gov", - ) - } + + +def handle(props: dict) -> dict: + cfg = _Config(**{k: v for k, v in props.items() if k in _Config.model_fields}) + props.setdefault("layer", cfg.layer) + props.setdefault("region_dir", cfg.region_dir) + props.setdefault("template_repo", "template-s3-bucket") + # Inject layer-level remote_state.yml if the layer is new + props.setdefault("extra_files", {}) + props["extra_files"].setdefault( + f"{cfg.layer}/remote_state.yml", + _render_remote_state(cfg), + ) + return props + + +def _render_remote_state(cfg: _Config) -> str: + return ( + f"directory: \"{cfg.layer}\"\n" + f"profile: \"{cfg.aws_account_id}-{cfg.account_alias}\"\n" + f"bucket: \"inf-tfstate-{cfg.aws_account_id}\"\n" + f"bucket_region: \"us-gov-east-1\"\n" + 
f"region: \"{cfg.aws_region}\"\n" + f"account_id: \"{cfg.aws_account_id}\"\n" + f"account_alias: \"{cfg.account_alias}\"\n" + f"aws_environment: \"gov\"\n" + ) ``` -The model enforces required fields and default values before any CodeBuild build -is started. The `extra_files()` method produces the layer-level `remote_state.yml` -from validated inputs — account-specific values stay in the Lambda model, not in -the template repo. +Because `handler.py` is versioned in the template repo, the Pydantic model and defaults +evolve alongside the templates — no Lambda redeploy required. + +### 4. Lambda dispatcher — runtime fetch from template repo -### 4. Lambda dispatcher +The Lambda has no `handlers/` directory and no handler registry. Instead, it fetches +`handler.py` directly from the template repo via the GHE API at request time and loads +it dynamically. The template repo is identified by the `template_repo` field already +present in the CFN Properties. -A single routing table maps `product_type` to the correct handler: +``` +lambda/ +└── app.py ← one-time change: fetch + exec handler.py, then call handle(props) + no lambda/handlers/ directory, no lambda/models/ directory +``` + +#### How it works (design intent for `app.py`) ```python -PRODUCT_HANDLERS = { - "eks_cluster": handle_eks, - "s3_bucket": handle_s3, - # future: "rds_postgres": handle_rds -} - -def handle_create(props: dict): - product_type = props.get("product_type", "eks_cluster") # default: backward-compat - handler = PRODUCT_HANDLERS.get(product_type) - if not handler: - raise ValueError(f"Unknown product_type: {product_type}") - return handler(props) +# 1. Read template_repo from CFN props (before TfRunRequest is constructed) +template_repo = normalized.get("template_repo") # e.g. "template-s3-bucket" +if not template_repo: + raise ValueError("template_repo is required") + +# 2. 
Fetch handler.py from GHE via the repository contents API +github_org = os.environ.get("GITHUB_ORG_NAME", "SCT-Engineering") +github_api = os.environ.get("GITHUB_API", "https://github.e.it.census.gov/api/v3") +handler_url = f"{github_api}/repos/{github_org}/{template_repo}/contents/handler.py" +# ...fetch with Authorization header, base64-decode the content... + +# 3. Load the module dynamically +import types, importlib +mod = types.ModuleType("_handler") +exec(compile(handler_source, "handler.py", "exec"), mod.__dict__) + +# 4. Validate the contract and dispatch +if not (callable(getattr(mod, "handle", None)) and getattr(mod, "PRODUCT_TYPE", None)): + raise ValueError(f"{template_repo}/handler.py must define PRODUCT_TYPE and handle()") +normalized = mod.handle(normalized) +tf_req = TfRunRequest(**normalized) ``` -This is a **one-time change** to `lambda/app.py`. After it is in place, adding a new -product type requires only a new entry in the table and a new handler function — no -other Lambda changes. +#### Security boundary + +The Lambda only fetches `handler.py` from repos whose name matches an allow-listed prefix +(`template-*` within `SCT-Engineering`). The GHE token used has **read-only** scope on +template repos, so a compromised template repo cannot write to account repos via this +path. Handler execution is the only place arbitrary code runs — this is intentional and +auditable (every template repo change is a PR in the SCT-Engineering org). + +**Adding a new product type requires only creating a new template repo with a `handler.py`. +No Lambda code changes, no Lambda redeployment, no registry entries.** ### 5. CloudFormation product template @@ -238,12 +296,12 @@ existing EKS product config.
The following checklist can be handed to a product team or platform engineer to onboard any new Terraform workload without Lambda or CodeBuild changes: -- [ ] Create `SCT-Engineering/template-{product_type}` containing **only** the workload - delta: flat `.tf.j2` file(s) + `tf-run.data` (+ optional `.sc-automation.yml.j2`) -- [ ] Add `lambda/models/{product_type}.py` with a Pydantic model defining required - inputs and an `extra_files()` method that builds the layer-level `remote_state.yml` - from validated SC form inputs -- [ ] Register the handler in `lambda/app.py` `PRODUCT_HANDLERS` table +- [ ] Create `SCT-Engineering/template-{product_type}` containing: + - `handler.py` — `PRODUCT_TYPE`, Pydantic model, `handle()` function + - flat `.tf.j2` file(s) rendered by the Proposer + - `tf-run.data` + - `.sc-automation.yml.j2` (optional; Proposer writes a default if absent) + **No files in the Lambda repository need to be created or modified.** - [ ] Create `service-catalog/{product_type}-product-template.yaml` CFN template - [ ] Add census config YAML and SC portfolio registration in `terraform-service-catalog-census` - [ ] Test end-to-end via `scripts/test_service_catalog.py` with the new product type diff --git a/docs/service-catalog-census-integration.md b/docs/service-catalog-census-integration.md index 3502c7a..575be1a 100644 --- a/docs/service-catalog-census-integration.md +++ b/docs/service-catalog-census-integration.md @@ -27,15 +27,18 @@ roles), or census-managed (portfolios, products, constraints) — and handled ac ``` sc-lambda-ghactions/ ← Lambda + CodeBuild buildspecs + SC product templates -├── lambda/app.py ← Lambda handler (dispatcher by product_type) -├── lambda/models/{product_type}.py ← Pydantic input models per product type -├── lambda/templates/{product_type}/ ← Jinja2 HCL templates per product type +├── lambda/app.py ← Lambda handler (fetches + runs handler.py from template repo at runtime) ├── service-catalog/{product_type}-product-template.yaml 
← CFN product template └── deploy/ ← Terraform: Lambda, ECR, IAM, Function URL +SCT-Engineering/template-{product_type}/ ← one repo per product type; fully self-contained +├── handler.py ← PRODUCT_TYPE + Pydantic model + handle() +├── {workload}.tf.j2 ← Jinja2 HCL templates (flat) +├── tf-run.data ← tf-run steps +└── .sc-automation.yml.j2 ← optional webhook config template + terraform-sc-fleet/ ← Fleet operations manifest (all managed workloads) packer-pipeline/ ← Container build CLI -template-{product_type}/ ← Template repos (one per product type) ``` ### `terraform-service-catalog-census` (census repo) diff --git a/docs/template-management.md b/docs/template-management.md index 4213fdd..ddb8038 100644 --- a/docs/template-management.md +++ b/docs/template-management.md @@ -238,7 +238,7 @@ creates the initial PR if it does not already exist on `main`. ```yaml # .sc-automation.yml -product_type: eks_cluster # Must match a registered PRODUCT_HANDLERS key +product_type: eks_cluster # Must match PRODUCT_TYPE in template repo's handler.py executor_project: sc-executor # CodeBuild project name for the Executor build dry_run: true # If true, Executor runs tf plan only (no apply) template_repo: SCT-Engineering/template-eks-cluster # Source template repo @@ -302,11 +302,11 @@ changes and the post-apply file diff is empty, the commit step is skipped. Checklist when onboarding a new product type: -- [ ] Create `SCT-Engineering/template-{product_type}` containing **only** the workload - delta: `{layer}/{workspace}/{workload}.tf.j2` + `tf-run.data` -- [ ] Add a Pydantic model in `lambda/models/{product_type}.py` that validates - product-specific inputs and builds `TEMPLATE_VARS` + any `EXTRA_FILES` - (e.g. 
layer-level `remote_state.yml` if the target layer may not exist yet) -- [ ] Register the handler in `lambda/app.py` `PRODUCT_HANDLERS` table +- [ ] Create `SCT-Engineering/template-{product_type}` containing: + - `handler.py` — `PRODUCT_TYPE`, Pydantic model, `handle()` function + - flat `.tf.j2` file(s) (rendered into `${LAYER}/${REGION_DIR}/` by the Proposer) + - `tf-run.data` + - `.sc-automation.yml.j2` (optional) + **No files in the Lambda repository need to be created or modified.** - [ ] Create a CFN product template in `service-catalog/{product_type}-product-template.yaml` - [ ] Add the product to `terraform-service-catalog-census` (see [service-catalog-census-integration.md](service-catalog-census-integration.md)) From 86f549b0fc5c4bc3f29d24463f2039f47a96dce0 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Wed, 20 May 2026 14:29:23 -0400 Subject: [PATCH 20/27] docs: SC product registration via Terraform for_each, not manual steps --- ...eralized-terraform-product-architecture.md | 64 +++++++++++++++---- docs/template-management.md | 5 +- 2 files changed, 54 insertions(+), 15 deletions(-) diff --git a/docs/generalized-terraform-product-architecture.md b/docs/generalized-terraform-product-architecture.md index 4f8d663..aa13c04 100644 --- a/docs/generalized-terraform-product-architecture.md +++ b/docs/generalized-terraform-product-architecture.md @@ -259,15 +259,15 @@ auditable (every template repo change is a PR in the SCT-Engineering org). **Adding a new product type requires only creating a new template repo with a `handler.py`. No Lambda code changes, no Lambda redeployment, no registry entries.** -### 5. CloudFormation product template +### 5. CloudFormation product template + Terraform registration -Create a new `service-catalog/{product_type}-product-template.yaml`. 
The template -follows the same pattern as the EKS product template: +The CFN template for a product type lives in `service-catalog/{product_type}-product-template.yaml` +inside the `sc-lambda-ghactions` repo. It follows the same pattern as the existing EKS template: - Parameters for user-facing form fields - A single `Custom::TerraformRepo` resource -- Properties passed in `snake_case` to avoid the PascalCase normalizer issue -- `product_type` included as a static string property +- Properties in `snake_case` (avoids PascalCase normalizer edge cases) +- `product_type` as a static string property - `aws_account_id` and `aws_region` resolved via `!Sub` — not user-facing parameters ```yaml @@ -283,11 +283,48 @@ Properties: tier: !Ref Tier ``` -### 6. Census config YAML (portfolio registration) +**The template is never registered manually.** The `deploy/service_catalog.tf` workspace +manages all SC infrastructure — S3 upload, portfolio, product, provisioning artifact, +launch role, launch constraint, and principal associations — using `for_each` over a +`var.sc_products` map. This is the same proven pattern as `deploy_product/main.tf` in +`lambda-template-repo-generator`. + +Adding a new product type requires only two steps: + +1. Add an entry to `var.sc_products` in `deploy/terraform.tfvars`: + +```hcl +sc_products = { + ekscluster = { + name = "EKS Cluster Repository Creator" + description = "Creates an EKS cluster account repo from template-eks-cluster." + template = "service-catalog/eks-cluster-product-template.yaml" + version = "2.0.0" + } + s3bucket = { + name = "S3 Bucket Repository Creator" + description = "Creates an S3 bucket account repo from template-s3-bucket." + template = "service-catalog/s3-bucket-product-template.yaml" + version = "1.0.0" + } +} +``` + +2. Run `tf apply` in `deploy/` — Terraform creates (or updates) all SC resources for the new entry. 
+ +No AWS Console clicks, no manual artifact uploads, no census pipeline PR required to +test a new product type in the dev/lab environment. + +### 6. Promotion to production via the census pipeline + +The `deploy/` workspace is the **development and testing path**. Once a new product type +has been validated end-to-end, it is promoted to production by adding the corresponding +entry to `terraform-service-catalog-census/templates/products/{product_type}/` and opening +a PR in `terraform-service-catalog-census`. The census pipeline applies this to all shared +accounts via the standard portfolio-sharing mechanism. -Add a new YAML file under `terraform-service-catalog-census/templates/products/{product_type}/` -to register the product in the SC portfolio. This follows the same structure as the -existing EKS product config. +The CFN template file (`service-catalog/{product_type}-product-template.yaml`) is the +single source of truth — both the `deploy/` workspace and the census pipeline read from it. 
--- @@ -302,10 +339,11 @@ onboard any new Terraform workload without Lambda or CodeBuild changes: - `tf-run.data` - `.sc-automation.yml.j2` (optional; Proposer writes a default if absent) **No files in the Lambda repository need to be created or modified.** -- [ ] Create `service-catalog/{product_type}-product-template.yaml` CFN template -- [ ] Add census config YAML and SC portfolio registration in `terraform-service-catalog-census` -- [ ] Test end-to-end via `scripts/test_service_catalog.py` with the new product type -- [ ] Confirm `.sc-automation.yml` is written correctly by the Proposer build +- [ ] Add `service-catalog/{product_type}-product-template.yaml` CFN template to `sc-lambda-ghactions` +- [ ] Add an entry to `var.sc_products` in `deploy/terraform.tfvars` and run `tf apply` in `deploy/` — + Terraform registers the SC portfolio, product, artifact, launch role, constraint, and principals +- [ ] Validate end-to-end in the dev/lab account via `scripts/test_service_catalog.py` +- [ ] Promote to production: add entry to `terraform-service-catalog-census/templates/products/{product_type}/` and open a PR --- diff --git a/docs/template-management.md b/docs/template-management.md index ddb8038..d52408a 100644 --- a/docs/template-management.md +++ b/docs/template-management.md @@ -308,5 +308,6 @@ Checklist when onboarding a new product type: - `tf-run.data` - `.sc-automation.yml.j2` (optional) **No files in the Lambda repository need to be created or modified.** -- [ ] Create a CFN product template in `service-catalog/{product_type}-product-template.yaml` -- [ ] Add the product to `terraform-service-catalog-census` (see [service-catalog-census-integration.md](service-catalog-census-integration.md)) +- [ ] Add `service-catalog/{product_type}-product-template.yaml` to `sc-lambda-ghactions` +- [ ] Add entry to `var.sc_products` in `deploy/terraform.tfvars` and run `tf apply` in `deploy/` +- [ ] Promote to production via `terraform-service-catalog-census` PR From 
4f67dd8bc584e1ee9f8869b11ed64f3ecf69d8e0 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Wed, 20 May 2026 14:35:10 -0400 Subject: [PATCH 21/27] docs: deploy_products/ workspace replaces census pipeline dependency; OU sharing + StackSet --- ...eralized-terraform-product-architecture.md | 81 ++-- docs/service-catalog-census-integration.md | 393 +++++++----------- docs/template-management.md | 5 +- 3 files changed, 211 insertions(+), 268 deletions(-) diff --git a/docs/generalized-terraform-product-architecture.md b/docs/generalized-terraform-product-architecture.md index aa13c04..de17343 100644 --- a/docs/generalized-terraform-product-architecture.md +++ b/docs/generalized-terraform-product-architecture.md @@ -259,7 +259,7 @@ auditable (every template repo change is a PR in the SCT-Engineering org). **Adding a new product type requires only creating a new template repo with a `handler.py`. No Lambda code changes, no Lambda redeployment, no registry entries.** -### 5. CloudFormation product template + Terraform registration +### 5. CloudFormation product template The CFN template for a product type lives in `service-catalog/{product_type}-product-template.yaml` inside the `sc-lambda-ghactions` repo. It follows the same pattern as the existing EKS template: @@ -283,48 +283,61 @@ Properties: tier: !Ref Tier ``` -**The template is never registered manually.** The `deploy/service_catalog.tf` workspace -manages all SC infrastructure — S3 upload, portfolio, product, provisioning artifact, -launch role, launch constraint, and principal associations — using `for_each` over a -`var.sc_products` map. This is the same proven pattern as `deploy_product/main.tf` in -`lambda-template-repo-generator`. +### 6. 
`deploy_products/` — dedicated Terraform workspace for SC product management -Adding a new product type requires only two steps: +SC portfolio and product registration lives in a **dedicated `deploy_products/` workspace**, +separate from `deploy/` (which manages the Lambda and CodeBuild engine). This mirrors the +pattern established by `lambda-template-repo-generator`, which has `deploy/` for the +Lambda infrastructure and `deploy_product/` for the SC product registration. -1. Add an entry to `var.sc_products` in `deploy/terraform.tfvars`: +``` +sc-lambda-ghactions/ +├── deploy/ ← Lambda, ECR, CodeBuild, IAM (engine — rarely touched) +└── deploy_products/ ← SC portfolio, products, S3, launch roles, OU sharing +``` + +The workspace is driven by two variables: ```hcl +# deploy_products/terraform.tfvars + sc_products = { - ekscluster = { + eks_cluster = { name = "EKS Cluster Repository Creator" description = "Creates an EKS cluster account repo from template-eks-cluster." - template = "service-catalog/eks-cluster-product-template.yaml" + template = "${path.module}/../service-catalog/eks-cluster-product-template.yaml" version = "2.0.0" } - s3bucket = { + s3_bucket = { name = "S3 Bucket Repository Creator" description = "Creates an S3 bucket account repo from template-s3-bucket." - template = "service-catalog/s3-bucket-product-template.yaml" + template = "${path.module}/../service-catalog/s3-bucket-product-template.yaml" version = "1.0.0" } } -``` - -2. Run `tf apply` in `deploy/` — Terraform creates (or updates) all SC resources for the new entry. -No AWS Console clicks, no manual artifact uploads, no census pipeline PR required to -test a new product type in the dev/lab environment. +# AWS Organizations OU IDs to share the portfolio with. +# Every account in these OUs will see the portfolio in their SC console automatically. +share_ous = [ + "ou-xxxx-xxxxxxxx", # platform-engineering + "ou-xxxx-yyyyyyyy", # app-teams +] +``` -### 6. 
Promotion to production via the census pipeline +Terraform iterates `var.sc_products` with `for_each` to create the S3 object, SC product, +provisioning artifact, and launch constraint for each entry. A single shared portfolio +(`aws_servicecatalog_portfolio`) is created once and shared to the OUs listed in +`var.share_ous` via `aws_servicecatalog_portfolio_share` — no per-account work required. -The `deploy/` workspace is the **development and testing path**. Once a new product type -has been validated end-to-end, it is promoted to production by adding the corresponding -entry to `terraform-service-catalog-census/templates/products/{product_type}/` and opening -a PR in `terraform-service-catalog-census`. The census pipeline applies this to all shared -accounts via the standard portfolio-sharing mechanism. +**Adding a new product type:** +1. Create the template repo and CFN template (steps above) +2. Add one entry to `var.sc_products` in `deploy_products/terraform.tfvars` +3. Run `tf apply` in `deploy_products/` -The CFN template file (`service-catalog/{product_type}-product-template.yaml`) is the -single source of truth — both the `deploy/` workspace and the census pipeline read from it. +No census pipeline PR, no YAML config files, no Terragrunt, no AWS Console clicks. +The `deploy_products/` workspace replaces the `terraform-service-catalog-census` dependency +for this system entirely. Any account in the configured OUs gets access to new products +immediately after apply. 
--- @@ -338,12 +351,11 @@ onboard any new Terraform workload without Lambda or CodeBuild changes: - flat `.tf.j2` file(s) rendered by the Proposer - `tf-run.data` - `.sc-automation.yml.j2` (optional; Proposer writes a default if absent) - **No files in the Lambda repository need to be created or modified.** -- [ ] Add `service-catalog/{product_type}-product-template.yaml` CFN template to `sc-lambda-ghactions` -- [ ] Add an entry to `var.sc_products` in `deploy/terraform.tfvars` and run `tf apply` in `deploy/` — - Terraform registers the SC portfolio, product, artifact, launch role, constraint, and principals -- [ ] Validate end-to-end in the dev/lab account via `scripts/test_service_catalog.py` -- [ ] Promote to production: add entry to `terraform-service-catalog-census/templates/products/{product_type}/` and open a PR +- [ ] Add `service-catalog/{product_type}-product-template.yaml` to `sc-lambda-ghactions` +- [ ] Add one entry to `var.sc_products` in `deploy_products/terraform.tfvars` +- [ ] Run `tf apply` in `deploy_products/` — creates S3 artifact, SC product, provisioning + artifact, launch constraint; all OU-member accounts see the new product immediately +- [ ] Validate end-to-end via `scripts/test_service_catalog.py` --- @@ -382,6 +394,9 @@ CloudFormation stacks or SC provisioned products. | Secrets Manager secrets | 2 (GHE tokens) + 1 (webhook) | No change | | Lambda Function URL | 1 | No change | | ECR repositories | 1 | No change | +| SC portfolios | 1 (EKS) | 1 (shared across all product types) | +| SC products | 1 | +1 per new product type (S3 object + SC product resource) | +| CFN StackSets | 0 | 1 (launch role deployed to all OU member accounts via `deploy_products/`) | -There is **no additional AWS infrastructure cost** to add new product types. Each new -product type is purely a code and configuration change. +Each new product type adds a single SC product + S3 artifact. 
No new Lambda functions, +no new CodeBuild projects, no new secrets — and no dependency on external pipeline teams. diff --git a/docs/service-catalog-census-integration.md b/docs/service-catalog-census-integration.md index 575be1a..a0cd9bf 100644 --- a/docs/service-catalog-census-integration.md +++ b/docs/service-catalog-census-integration.md @@ -1,89 +1,78 @@ -# Service Catalog Census Integration +# Service Catalog Product Deployment -**Ported and generalized from:** `lambda-template-repo-generator/design-docs/SERVICE_CATALOG_CENSUS_INTEGRATION.md` -**Updated for:** sc-lambda-ghactions (CodeBuild-based initial rollout; GHA planned for later) -**Date:** 2026-05-19 +**Previously titled:** Service Catalog Census Integration +**Updated for:** sc-lambda-ghactions +**Date:** 2026-05-20 **Status:** DRAFT --- ## Executive Summary -This document covers how sc-lambda-ghactions products are registered in the -`terraform-service-catalog-census` repo, which manages all enterprise Service Catalog -portfolios and products via Terragrunt. Each new product type (EKS cluster, S3 bucket, -RDS instance, etc.) requires entries in the census repo to become available in the SC -console org-wide. +All Service Catalog infrastructure for this system is managed from a single +**`deploy_products/`** Terraform workspace inside `sc-lambda-ghactions`. There is no +dependency on `terraform-service-catalog-census` or any external pipeline. -The census integration is designed for **enterprise-wide deployment from the outset**. -Every resource is classified by deployment scope — central (Lambda, ECR), StackSet (launch -roles), or census-managed (portfolios, products, constraints) — and handled accordingly. 
+The workspace handles the complete deployment lifecycle: +- S3 upload of CFN product templates (versioned) +- SC portfolio + `aws_servicecatalog_portfolio_share` to org OUs +- SC products, provisioning artifacts, and launch constraints +- A CloudFormation StackSet that deploys the IAM launch role to every account in the shared OUs automatically + +Adding a new product type = one entry in `terraform.tfvars` + `tf apply`. No census repo +PRs, no Terragrunt, no YAML config files, no AWS Console clicks. --- ## System Layout -### sc-lambda-ghactions system (4 repos) +### sc-lambda-ghactions (all in one repo) ``` -sc-lambda-ghactions/ ← Lambda + CodeBuild buildspecs + SC product templates -├── lambda/app.py ← Lambda handler (fetches + runs handler.py from template repo at runtime) -├── service-catalog/{product_type}-product-template.yaml ← CFN product template -└── deploy/ ← Terraform: Lambda, ECR, IAM, Function URL +sc-lambda-ghactions/ +├── lambda/app.py ← Lambda handler (fetches handler.py from template repo at runtime) +├── service-catalog/{type}-template.yaml ← CFN product templates (one per product type) +├── deploy/ ← Engine: Lambda, ECR, CodeBuild, IAM, Function URL +└── deploy_products/ ← SC: portfolio, products, OU sharing, launch role StackSet SCT-Engineering/template-{product_type}/ ← one repo per product type; fully self-contained ├── handler.py ← PRODUCT_TYPE + Pydantic model + handle() ├── {workload}.tf.j2 ← Jinja2 HCL templates (flat) -├── tf-run.data ← tf-run steps +├── tf-run.data ← tf-run orchestration steps └── .sc-automation.yml.j2 ← optional webhook config template - -terraform-sc-fleet/ ← Fleet operations manifest (all managed workloads) -packer-pipeline/ ← Container build CLI ``` -### `terraform-service-catalog-census` (census repo) +### Two workspaces, two responsibilities -``` -terraform-service-catalog-census/ -├── main-modules/service-catalog/ ← Main Terraform module -├── modules/ -│ ├── sc-portfolio/ ← Portfolio + principal association -│ 
├── sc-product/ ← Product + S3 upload + versioning -│ └── cfn-roles-actions/ ← Launch roles via CFN StackSets -├── templates/ -│ ├── products/ -│ │ ├── eks-terragrunt-repo/ ← CFN product template (versioned YAMLs) -│ │ ├── s3-bucket-repo/ ← (planned) -│ │ └── {product-type}-repo/ ← pattern -│ └── role-templates/ ← IAM launch role CFN snippets -├── non-prod/csvd-dev/west/ -│ ├── configurations/ -│ │ ├── portfolios/*.yaml.tftpl ← Portfolio definitions -│ │ └── products/**/*.yaml.tftpl ← Product definitions -│ └── service-catalog/ -└── prod/operations-gov/ ← Prod (shares to org) -``` +| Workspace | Contains | Apply frequency | +|-----------|----------|-----------------| +| `deploy/` | Lambda, ECR, CodeBuild projects, IAM execution roles, Function URL | Rarely — only when the engine changes | +| `deploy_products/` | S3 templates, SC portfolio, products, artifacts, OU sharing, launch role StackSet | Whenever a new product type is added or a template version bumps | --- ## Resource Classification -Every resource falls into one of three deployment tiers: - -| Tier | What | Deployment mechanism | Scope | -|------|------|---------------------|-------| -| **Central** | Lambda, ECR, Secrets Manager, GHE token, Function URL | `sc-lambda-ghactions/deploy/` (`tf apply`) | csvd-dev only | -| **StackSet** | IAM launch role per product type | `cfn-roles-actions` StackSet via census repo | All OU-shared accounts | -| **Census-managed** | SC portfolio, product, provisioning artifact, constraints | YAML config in census repo → `terragrunt apply` | SC admin account + shared OUs | +| Resource | Workspace | Scope | +|----------|-----------|-------| +| Lambda function | `deploy/` | csvd-dev only (invoked cross-account via ServiceToken) | +| ECR repository | `deploy/` | csvd-dev only | +| CodeBuild projects (proposer + executor) | `deploy/` | csvd-dev only | +| Lambda cross-account invocation policy | `deploy/` | Org-wide via `aws:PrincipalOrgID` condition | +| S3 bucket + product template 
objects | `deploy_products/` | csvd-dev only | +| SC portfolio | `deploy_products/` | csvd-dev (shared to OUs) | +| `aws_servicecatalog_portfolio_share` | `deploy_products/` | All OU member accounts | +| SC products + provisioning artifacts | `deploy_products/` | csvd-dev (visible in shared accounts) | +| Launch role (IAM) | `deploy_products/` via CFN StackSet | All OU member accounts | +| Launch constraint | `deploy_products/` | Per product, references launch role ARN pattern | --- -## Step 1 — Central Infrastructure (`sc-lambda-ghactions/deploy/`) +## Step 1 — Engine Infrastructure (`deploy/`) -The Lambda is centralized in csvd-dev. CloudFormation in any org account invokes -it cross-account via the `ServiceToken` ARN. +The Lambda is centralized in csvd-dev. CloudFormation in any org account invokes it +cross-account via the `ServiceToken` ARN. A single resource policy covers the whole org: -**Lambda resource policy** — allows any account in the org: ```hcl resource "aws_lambda_permission" "cloudformation_org" { statement_id = "AllowCloudFormationOrgInvoke" @@ -98,218 +87,156 @@ resource "aws_lambda_permission" "cloudformation_org" { } ``` -No per-account Lambda deployment is needed. Provisioners never need the Lambda locally — -their CloudFormation stack calls it cross-account via the `ServiceToken`. +No per-account Lambda deployment. This resource lives in `deploy/` and is applied once. --- -## Step 2 — IAM Launch Roles (StackSet) - -One IAM launch role is required **per product type** in every account that will -provision the product via SC. These are deployed via the `cfn-roles-actions` StackSet, -which auto-deploys to all accounts in shared OUs. 
- -### Launch role template (per product type) - -Add a file to `templates/role-templates/`: - -```yaml -# templates/role-templates/sc-{product_type}-launch-role.yaml -Type: AWS::IAM::Role -Properties: - RoleName: !Sub "r-ent-servicecatalog-${ProductType}-sc-launch-role" - AssumeRolePolicyDocument: - Statement: - - Effect: Allow - Principal: { Service: servicecatalog.amazonaws.com } - Action: sts:AssumeRole - Policies: - - PolicyName: InvokeCentralLambda - PolicyDocument: - Statement: - - Effect: Allow - Action: lambda:InvokeFunction - Resource: !Sub "arn:${AWS::Partition}:lambda:${LambdaRegion}:${CentralAccountId}:function:sc-template-automation" - - Effect: Allow - Action: [cloudformation:*, s3:GetObject] - Resource: "*" -``` +## Step 2 — SC Products & OU Sharing (`deploy_products/`) -### Registering in `roles.yaml.tftpl` - -```yaml -# non-prod/csvd-dev/west/configurations/roles.yaml.tftpl -- template: sc-eks-cluster-launch-role.yaml # existing - parameters: - - parameter: CentralAccountId - value: "229685449397" - - parameter: LambdaRegion - value: us-gov-west-1 - - parameter: ProductType - value: eks_cluster - -- template: sc-s3-bucket-launch-role.yaml # new product type - parameters: - - parameter: CentralAccountId - value: "229685449397" - - parameter: LambdaRegion - value: us-gov-west-1 - - parameter: ProductType - value: s3_bucket -``` +The `deploy_products/` workspace manages all SC resources from a single `tf apply`. -**Only one `terragrunt apply`** is needed after adding a new role entry. The StackSet -propagates to all shared accounts automatically via `auto_deployment { enabled = true }`. +### `var.sc_products` — product registry ---- +```hcl +# deploy_products/terraform.tfvars +sc_products = { + eks_cluster = { + name = "EKS Cluster Repository Creator" + description = "Creates an EKS cluster account repo from template-eks-cluster." 
+ template = "${path.module}/../service-catalog/eks-cluster-product-template.yaml" + version = "2.0.0" + } + s3_bucket = { + name = "S3 Bucket Repository Creator" + description = "Creates an S3 bucket account repo from template-s3-bucket." + template = "${path.module}/../service-catalog/s3-bucket-product-template.yaml" + version = "1.0.0" + } +} -## Step 3 — Census Portfolio and Product Config - -### Portfolio YAML - -Portfolios are defined in `configurations/portfolios/`. The sc-lambda-ghactions products -belong in a single shared portfolio (or alongside existing census portfolios): - -```yaml -# configurations/portfolios/sc-automation.yaml.tftpl -sc_automation: - name: "Service Catalog Automation Portfolio" - description: >- - Self-service infrastructure provisioning via sc-lambda-ghactions. - Supports any Terraform workload type. - provider_name: CSVD - products: - - eks_cluster_repo - - s3_bucket_repo - user_roles: - - /census/*/sc-end-user-role - share_ous: - - name: census-workload-accounts +# AWS Organizations OU IDs — every account in these OUs sees the portfolio automatically +share_ous = [ + "ou-xxxx-xxxxxxxx", # platform-engineering + "ou-xxxx-yyyyyyyy", # app-teams +] ``` -### Product YAML (per product type) - -```yaml -# configurations/products/eks-cluster-repo/EKS_CLUSTER_REPO.yaml.tftpl -eks_cluster_repo: - name: "EKS Cluster Repository Creator" - description: >- - Creates a GitHub Enterprise repository with Terragrunt EKS cluster - configuration and opens a review PR. 
- type: CLOUD_FORMATION_TEMPLATE - distributor: CSVD - support_email: csvd.aws.service.catalog.team.list@census.gov - launch_role: r-ent-servicecatalog-eks-cluster-sc-launch-role - template_constraints: - Parameters: - # Lock the Lambda ARN — users cannot redirect to a different Lambda - ServiceToken: "arn:${Partition}:lambda:us-gov-west-1:229685449397:function:sc-template-automation" - versions: - - name: "1.0.0" - description: "Initial CodeBuild-based version" - file_path: products/eks-cluster-repo/1-0-0.yaml -``` +Terraform iterates `var.sc_products` with `for_each` to create: +- `aws_s3_object` — versioned CFN template in the artifacts bucket +- `aws_servicecatalog_product` — SC product backed by the S3 object +- `aws_servicecatalog_provisioning_artifact` — the working artifact version (the initial one created by `create-product` is deprecated automatically, as learned from `lambda-template-repo-generator`) +- `aws_servicecatalog_product_portfolio_association` — links product to the shared portfolio +- `aws_servicecatalog_constraint` — attaches the launch role -### Product template location +### OU portfolio sharing -The CFN product template lives at: -``` -templates/products/{product_type}-repo/{version}.yaml +```hcl +resource "aws_servicecatalog_portfolio_share" "ou" { + for_each = toset(var.share_ous) + + portfolio_id = aws_servicecatalog_portfolio.this.id + type = "ORGANIZATIONAL_UNIT" + principal_id = each.value + + share_principals = true # member accounts inherit principal associations +} ``` -This is a copy of (or symlink to) `sc-lambda-ghactions/service-catalog/{product_type}-product-template.yaml`. -When a new version of the product template is released, add a new versioned file here -and bump the `versions` list in the product YAML. +Every account in the listed OUs immediately sees all products in the portfolio — no +per-account work required. 
---- +### IAM launch role — StackSet -## Step 4 — Moving the Lambda to a Different Account +The IAM launch role must exist in every member account that will provision products. +`deploy_products/` manages a CloudFormation StackSet that deploys it org-wide: -If the central Lambda needs to move to a different AWS account, the following must be -updated. **All other components are account-agnostic.** +```hcl +resource "aws_cloudformation_stack_set" "launch_role" { + name = "sc-automation-launch-role" + permission_model = "SERVICE_MANAGED" + capabilities = ["CAPABILITY_NAMED_IAM"] + template_body = file("${path.module}/cfn/sc-launch-role.yaml") + + parameters = { + CentralAccountId = data.aws_caller_identity.current.account_id + LambdaRegion = data.aws_region.current.name + LambdaName = var.lambda_function_name + } -| Resource | Location | What changes | -|----------|----------|-------------| -| Lambda + all central infra | `sc-lambda-ghactions/deploy/` | Re-deploy in new account | -| Launch role `lambda:InvokeFunction` ARN | `roles.yaml.tftpl` → `CentralAccountId` parameter | Update to new account ID — one change propagates to all shared accounts via StackSet | -| Template constraint `ServiceToken` | Product YAML `template_constraints` | Update ARN value | -| GitHub token secrets | Secrets Manager in new account | Recreate manually | + auto_deployment { + enabled = true + retain_stacks_on_account_removal = false + } +} -**Migration order:** Update StackSet launch roles (step 3) → wait for propagation → update -template constraint (step 4). Reversing the order causes a `lambda:InvokeFunction` permission -denial window. 
+resource "aws_cloudformation_stack_set_instance" "ou" { + for_each = toset(var.share_ous) -### Why parameterizing `CentralAccountId` matters + stack_set_name = aws_cloudformation_stack_set.launch_role.name + deployment_targets { + organizational_unit_ids = [each.value] + } +} +``` -The account ID is only in `roles.yaml.tftpl` under the `CentralAccountId` parameter. The -role template YAML itself is static and account-agnostic. A single value change propagates -to all shared accounts via the StackSet — no role template file needs updating. +The CFN template for the launch role is a static file at +`deploy_products/cfn/sc-launch-role.yaml`. It is the same role as in +`lambda-template-repo-generator/deploy_product/main.tf` — CFN + CloudFormation +permissions + `lambda:InvokeFunction` on the central Lambda ARN — but parameterized +so it works across any account without hard-coding the Lambda account ID. ---- +### Launch constraint -## Adding a New Product Type to the Census Portfolio +The constraint ARN pattern references the role deployed by the StackSet: -Checklist for each new product type: +```hcl +resource "aws_servicecatalog_constraint" "launch" { + for_each = var.sc_products -- [ ] Add CFN product template at `templates/products/{product_type}-repo/1-0-0.yaml` -- [ ] Add product YAML at `configurations/products/{product_type}-repo/{PRODUCT}.yaml.tftpl` -- [ ] Add launch role template at `templates/role-templates/sc-{product_type}-launch-role.yaml` -- [ ] Add launch role entry in `roles.yaml.tftpl` -- [ ] Add product key to portfolio YAML `products:` list -- [ ] Run `terragrunt apply` in `non-prod/csvd-dev/west/service-catalog/` -- [ ] Validate: product appears in SC console; end-to-end test from a workload account + portfolio_id = aws_servicecatalog_portfolio.this.id + product_id = aws_servicecatalog_product.this[each.key].id + type = "LAUNCH" ---- + parameters = jsonencode({ + # The role ARN uses !Sub at CFN time — member accounts resolve their own account ID 
+ LocalRoleName = "sc-automation-launch-role" + }) +} +``` -## Validation Checklist +--- -### After central Lambda deploy: -- [ ] Lambda resource policy allows org-wide CloudFormation invocation -- [ ] Cross-account test: invoke Lambda from a different account via CFN Custom Resource +## Adding a New Product Type -### After StackSet launch role deploy: -- [ ] StackSet instances show `CURRENT` in CloudFormation console for target OUs -- [ ] Launch role exists in at least 2-3 workload accounts (spot check) -- [ ] Role trust policy allows `servicecatalog.amazonaws.com` +1. Create `SCT-Engineering/template-{product_type}` with `handler.py`, `.tf.j2` files, `tf-run.data` +2. Add `service-catalog/{product_type}-product-template.yaml` to `sc-lambda-ghactions` +3. Add one entry to `var.sc_products` in `deploy_products/terraform.tfvars` +4. Run `tf apply` in `deploy_products/` — product is live in all OU member accounts immediately -### After census product deploy: -- [ ] Portfolio visible in SC console in the admin account -- [ ] Portfolio shared to target OUs (verify in a workload account) -- [ ] Product associated with portfolio; launch constraint attached -- [ ] Template constraint locks `ServiceToken` to correct Lambda ARN -- [ ] End-to-end test: provision from a **workload account** (not csvd-dev) +No census repo PRs. No Terragrunt. No YAML config files. --- -## Appendix: Census Config Format Reference +## Moving the Lambda to a Different Account -### Portfolio YAML schema +If the central Lambda needs to move accounts, only `deploy/` changes. The StackSet +launch roles reference the Lambda by name + account, managed via the `CentralAccountId` +and `LambdaName` StackSet parameters — a single `tf apply` in `deploy_products/` after +re-deploying `deploy/` propagates the updated ARN to all member accounts automatically. -```yaml -: - name: string - description: string - provider_name: string - products: [, ...] 
- user_roles: [/path/pattern/*] - tags: {} - share_ous: [] # OU names; empty = inherit from terraform.tfvars -``` +--- -### Product YAML schema - -```yaml -: - name: string - description: string - type: CLOUD_FORMATION_TEMPLATE - launch_role: string # IAM role NAME (not ARN) — must exist in every target account - distributor: string - template_constraints: - Parameters: - ParamName: locked-value - versions: - - name: "1.0.0" - file_path: products/{product-dir}/{version}.yaml - actions: [] -``` +## Validation Checklist + +### After `deploy/` apply: +- [ ] Lambda resource policy allows org-wide CloudFormation invocation +- [ ] Cross-account test: invoke Lambda from a different account via CFN Custom Resource + +### After `deploy_products/` apply: +- [ ] StackSet instances show `CURRENT` for all target OUs +- [ ] Launch role exists in at least 2–3 spot-check workload accounts +- [ ] Portfolio visible in SC console in csvd-dev +- [ ] Portfolio shared to target OUs (verify from a workload account) +- [ ] Each product has an active provisioning artifact; initial broken artifact is deprecated +- [ ] Launch constraint references the correct role name +- [ ] End-to-end test: provision from a **workload account** (not csvd-dev) diff --git a/docs/template-management.md b/docs/template-management.md index d52408a..0507459 100644 --- a/docs/template-management.md +++ b/docs/template-management.md @@ -309,5 +309,6 @@ Checklist when onboarding a new product type: - `.sc-automation.yml.j2` (optional) **No files in the Lambda repository need to be created or modified.** - [ ] Add `service-catalog/{product_type}-product-template.yaml` to `sc-lambda-ghactions` -- [ ] Add entry to `var.sc_products` in `deploy/terraform.tfvars` and run `tf apply` in `deploy/` -- [ ] Promote to production via `terraform-service-catalog-census` PR +- [ ] Add one entry to `var.sc_products` in `deploy_products/terraform.tfvars` +- [ ] Run `tf apply` in `deploy_products/` — portfolio, product, artifact, 
launch roles, + and OU sharing are updated automatically; all OU-member accounts see the change immediately From 8606feb4e34ee2b7db5ea0e3ef10c865cbf34f6c Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Wed, 20 May 2026 14:43:18 -0400 Subject: [PATCH 22/27] docs: version pinning via template_repo_ref; SemVer tagging for template repos --- ...eralized-terraform-product-architecture.md | 55 +++++++++++++++++-- docs/template-management.md | 7 ++- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/docs/generalized-terraform-product-architecture.md b/docs/generalized-terraform-product-architecture.md index de17343..94610e6 100644 --- a/docs/generalized-terraform-product-architecture.md +++ b/docs/generalized-terraform-product-architecture.md @@ -225,19 +225,21 @@ lambda/ #### How it works (design intent for `app.py`) ```python -# 1. Read template_repo from CFN props (before TfRunRequest is constructed) -template_repo = normalized.get("template_repo") # e.g. "template-s3-bucket" +# 1. Read template_repo + optional ref from CFN props +template_repo = normalized.get("template_repo") # e.g. "template-s3-bucket" +template_repo_ref = normalized.get("template_repo_ref", "") # e.g. "v2.0.0"; empty = default branch if not template_repo: raise ValueError("template_repo is required") -# 2. Fetch handler.py from GHE via the raw contents API +# 2. Fetch handler.py from GHE via the contents API, at the pinned ref github_org = os.environ.get("GITHUB_ORG_NAME", "SCT-Engineering") github_api = os.environ.get("GITHUB_API", "https://github.e.it.census.gov/api/v3") -handler_url = f"{github_api}/repos/{github_org}/{template_repo}/contents/handler.py" +ref_param = f"?ref={template_repo_ref}" if template_repo_ref else "" +handler_url = f"{github_api}/repos/{github_org}/{template_repo}/contents/handler.py{ref_param}" # ...fetch with Authorization header, base64-decode the content... # 3. 
Load the module dynamically -import types, importlib +import types mod = types.ModuleType("_handler") exec(compile(handler_source, "handler.py", "exec"), mod.__dict__) @@ -245,6 +247,7 @@ exec(compile(handler_source, "handler.py", "exec"), mod.__dict__) if not (callable(getattr(mod, "handle", None)) and getattr(mod, "PRODUCT_TYPE", None)): raise ValueError(f"{template_repo}/handler.py must define PRODUCT_TYPE and handle()") normalized = mod.handle(normalized) +# template_repo_ref is preserved in normalized; CodeBuild receives it as TEMPLATE_REPO_REF tf_req = TfRunRequest(**normalized) ``` @@ -259,6 +262,32 @@ auditable (every template repo change is a PR in the SCT-Engineering org). **Adding a new product type requires only creating a new template repo with a `handler.py`. No Lambda code changes, no Lambda redeployment, no registry entries.** +#### Version pinning + +The `template_repo_ref` property pins the exact git ref (tag, branch, or SHA) that both +the Lambda and the Proposer CodeBuild use. This is how template repo changes are gated +from production use. + +| Value | Behaviour | +|-------|-----------| +| `v2.0.0` (SemVer tag) | Lambda fetches `handler.py` at that tag; Proposer clones and checks out that tag. **Recommended for production.** | +| `main` (branch) | Always latest; appropriate for development and testing. | +| `abc1234` (SHA) | Exact commit; maximally stable but requires manual update. | +| *(absent / empty)* | GHE API falls back to the repo's default branch; equivalent to `main` only when `main` is the default branch. | + +**The ref is set as a static string in the CFN product template — it is not a +user-facing form parameter.** Bumping to a new version is an operator action: + +1. Tag the template repo: `git tag v2.0.0 && git push origin v2.0.0` +2. Update `template_repo_ref` in `service-catalog/{product_type}-product-template.yaml` +3. Run `tf apply` in `deploy_products/` with a bumped `version` key — this creates a new + 
Existing provisioned products are unaffected until they are + updated or re-provisioned. + +Because `template_repo_ref` flows through the Lambda to the CodeBuild `TEMPLATE_REPO_REF` +env var, the Lambda and the Proposer always run the **exact same version** of `handler.py` +and the Jinja2 templates — there is no split-brain risk between the two. + ### 5. CloudFormation product template The CFN template for a product type lives in `service-catalog/{product_type}-product-template.yaml` @@ -274,6 +303,8 @@ inside the `sc-lambda-ghactions` repo. It follows the same pattern as the existi Properties: ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:sc-template-automation" product_type: s3_bucket + template_repo: template-s3-bucket + template_repo_ref: v2.0.0 # pinned; bump here + new SC artifact version to release bucket_name: !Ref BucketName account_name: !Ref AccountName aws_account_id: !Sub "${AWS::AccountId}" @@ -283,6 +314,10 @@ Properties: tier: !Ref Tier ``` +`template_repo` and `template_repo_ref` are static strings — they are not user-facing +parameters and do not appear on the SC product form. Changing them requires creating a +new SC provisioning artifact version (managed by `deploy_products/`). + ### 6. `deploy_products/` — dedicated Terraform workspace for SC product management SC portfolio and product registration lives in a **dedicated `deploy_products/` workspace**, @@ -324,6 +359,11 @@ share_ous = [ ] ``` +> **Version alignment:** the `version` key in `sc_products` is the **SC provisioning artifact +> label** displayed in the SC console (e.g. `"2.0.0"`). It is independent of, but should match, +> the `template_repo_ref` property baked into the CFN template file. Convention: +> bump `version` in tfvars and update `template_repo_ref` in the CFN YAML at the same time. 
+ Terraform iterates `var.sc_products` with `for_each` to create the S3 object, SC product, provisioning artifact, and launch constraint for each entry. A single shared portfolio (`aws_servicecatalog_portfolio`) is created once and shared to the OUs listed in @@ -351,8 +391,11 @@ onboard any new Terraform workload without Lambda or CodeBuild changes: - flat `.tf.j2` file(s) rendered by the Proposer - `tf-run.data` - `.sc-automation.yml.j2` (optional; Proposer writes a default if absent) -- [ ] Add `service-catalog/{product_type}-product-template.yaml` to `sc-lambda-ghactions` +- [ ] Tag the initial release: `git tag v1.0.0 && git push origin v1.0.0` +- [ ] Add `service-catalog/{product_type}-product-template.yaml` to `sc-lambda-ghactions`, + setting `template_repo_ref: v1.0.0` as a static property - [ ] Add one entry to `var.sc_products` in `deploy_products/terraform.tfvars` + with `version = "1.0.0"` matching the tag - [ ] Run `tf apply` in `deploy_products/` — creates S3 artifact, SC product, provisioning artifact, launch constraint; all OU-member accounts see the new product immediately - [ ] Validate end-to-end via `scripts/test_service_catalog.py` diff --git a/docs/template-management.md b/docs/template-management.md index 0507459..64584a3 100644 --- a/docs/template-management.md +++ b/docs/template-management.md @@ -254,7 +254,7 @@ variables: # Extra key/value pairs injected as CodeBuild | `product_type` | ✅ | Routes to the correct Pydantic model and template directory | | `executor_project` | ✅ | CodeBuild project started by the webhook on PR merge | | `dry_run` | ✅ | `true` → `tf plan` only; `false` → `tf apply` | -| `template_repo` | ✅ | GHE repo used as the Jinja2 template source | +| `template_repo` | ✅ | GHE repo used as the Jinja2 template source; the Lambda fetches it at the ref specified by `template_repo_ref` in the CFN template | | `template_source_path` | ❌ | Subdirectory within `template_repo`; omit for whole-repo templates | | `fleet_entry` | ❌ 
| Relative path of this workload's entry in `terraform-sc-fleet` | | `variables` | ❌ | Product-type-specific overrides; merged with SSM global defaults | @@ -308,7 +308,10 @@ Checklist when onboarding a new product type: - `tf-run.data` - `.sc-automation.yml.j2` (optional) **No files in the Lambda repository need to be created or modified.** -- [ ] Add `service-catalog/{product_type}-product-template.yaml` to `sc-lambda-ghactions` +- [ ] Tag the initial release: `git tag v1.0.0 && git push origin v1.0.0` +- [ ] Add `service-catalog/{product_type}-product-template.yaml` to `sc-lambda-ghactions`, + setting `template_repo_ref: v1.0.0` as a static property in the CFN `Properties` block - [ ] Add one entry to `var.sc_products` in `deploy_products/terraform.tfvars` + with `version = "1.0.0"` matching the tag - [ ] Run `tf apply` in `deploy_products/` — portfolio, product, artifact, launch roles, and OU sharing are updated automatically; all OU-member accounts see the change immediately From 336248dfaae2547da7b11097d7cce21a32523e11 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Wed, 20 May 2026 14:57:17 -0400 Subject: [PATCH 23/27] docs: fix stale GHA architecture references; align executor docs with buildspec reality --- .github/copilot-instructions.md | 42 +++++++++++++++++++++------------ docs/HOW-IT-WORKS.md | 6 ++++- docs/repo-vars-and-secrets.md | 5 ++-- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index d87774f..67936ff 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -8,21 +8,34 @@ system at Census. 
The architecture is: ``` SC Console (user fills product form) └─> CFN Stack (Custom::* resource) - └─> Lambda (centralized in csvd-dev, 229685449397, us-gov-west-1) + └─> Lambda tf-run-executor-trigger (csvd-dev, 229685449397, us-gov-west-1) ├─> Validates inputs (Pydantic v2 models) ├─> Fetches GHE token from Secrets Manager - ├─> POSTs repository_dispatch to target repo on GHE - └─> Polls GHA workflow run → returns repo URL + PR URL to CFN - -GitHub Enterprise (github.e.it.census.gov) - └─> GHA workflow (repository_dispatch event) - ├─> Clones the target account repo - ├─> Renders HCL/YAML files from templates - └─> Commits + opens PR (repo-init → main) + ├─> Starts CodeBuild: tf-run-proposer + └─> Polls CodeBuild → returns PR URL + repo URL to CFN + +CodeBuild: tf-run-proposer (csvd-dev) + └─> Clones target account repo + └─> Renders HCL/YAML files from template repo (Jinja2) + └─> Commits rendered files → opens PR (propose/sc-automation → main) + + ↕ Human reviews diff and merges PR ↕ + +GHE push webhook → Lambda tf-run-webhook-handler + └─> Reads .sc-automation.yml from default branch + └─> Starts CodeBuild: tf-run-executor (fire-and-forget) + +CodeBuild: tf-run-executor (csvd-dev) + └─> Clones account repo at main (post-merge state) + └─> Optionally assumes cross-account IAM role (sc-automation-codebuild-role) + └─> Runs tf-run apply in LAYER/REGION_DIR + └─> Commits post-apply changes (lock file, remote_state symlinks) to main [skip ci] + └─> Writes ✅/❌ commit status to GHE ``` -This replaces the current CodeBuild + terraform-eks-deployment path with a -GHA-native approach that keeps workflow logic inside the target repos. +This replaces the current CodeBuild + terraform-eks-deployment path. +Workflow logic lives in `buildspec-proposer.yml` and `buildspec-executor.yml` +in this repo; product-type-specific logic lives in `handler.py` in each template repo. --- @@ -154,7 +167,7 @@ scripts in `/apps/terraform/bin/`. 
Key behavior: - `aws_account_id` and `aws_region` are auto-resolved via `!Sub` in CFN; do NOT add them as user-facing SC form parameters - Lambda ServiceToken: `arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:{name}` -- Lambda timeout must be ≥ CodeBuild/GHA poll window (currently 900s) +- Lambda timeout must be ≥ CodeBuild poll window (currently 900s) --- @@ -165,8 +178,7 @@ scripts in `/apps/terraform/bin/`. Key behavior: - ❌ Do not write temp files to `/tmp` — use `~/tmp` - ❌ Do not use `terraform` directly — use `tf` alias (`tf plan`, `tf apply`) - ❌ Do not run AWS CLI/boto3 without `export AWS_DEFAULT_REGION=us-gov-west-1` -- ❌ Do not add `vpc_id` — field is `vpc_name` -- ❌ Do not use `HappyPathway/terraform-github-repo` public module -- ✅ DO use `CSVD/terraform-github-repo` (https://github.e.it.census.gov/CSVD/terraform-github-repo) - ✅ DO use `gh` CLI for PR management - ✅ DO use `GH_HOST=github.e.it.census.gov` for all GHE commands +- ✅ Cross-account role name is `sc-automation-codebuild-role` — must exist in every target + account and trust the CodeBuild IAM role from csvd-dev before the first executor run diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md index e81925c..a5ffaa4 100644 --- a/docs/HOW-IT-WORKS.md +++ b/docs/HOW-IT-WORKS.md @@ -172,12 +172,16 @@ a watched account repo. Never triggered by a user or SC product. 
| `LAYER` | Webhook Lambda reads from `.sc-automation.yml` | | `REGION_DIR` | Webhook Lambda reads from `.sc-automation.yml` | | `TARGET_ACCOUNT_ID` | Webhook Lambda reads from `.sc-automation.yml` | -| `VAULT_AWS_ROLE` | Webhook Lambda reads from `.sc-automation.yml` | | `DRY_RUN` | Webhook Lambda reads from `.sc-automation.yml` | | `TF_RUN_START_TAG` | Webhook Lambda reads from `.sc-automation.yml` | | `COMMIT_SHA` | Webhook Lambda reads from the GHE push payload | | `GITHUB_TOKEN` | Lambda reads from Secrets Manager `ghe-runner/github-token` | +> **ADR-002 (Proposed):** A future revision will add `VAULT_AWS_ROLE` so the executor +> obtains short-lived credentials from the Vault AWS Secrets Engine instead of +> assuming `sc-automation-codebuild-role` directly. Until ADR-002 is accepted and +> implemented, `buildspec-executor.yml` uses the static `aws sts assume-role` model. + **Build definition:** `buildspec-executor.yml` --- diff --git a/docs/repo-vars-and-secrets.md b/docs/repo-vars-and-secrets.md index 85a2153..9af8576 100644 --- a/docs/repo-vars-and-secrets.md +++ b/docs/repo-vars-and-secrets.md @@ -239,7 +239,8 @@ aws secretsmanager create-secret \ - **Secret rotation:** Implement automatic rotation for long-lived credentials - **Environment-scoped secrets:** Dev/test/prod variants of secrets per repo -- **Organization-level variables:** Push shared variables once to org level instead - of per-repo, reducing GHE API call volume +- **Organization-level SSM parameters:** Consolidate shared variables (e.g. `TERRAFORM_VERSION`) + into a single SSM path read once at Lambda invocation rather than duplicated across + every `by-type/` subtree, reducing SSM API call volume - **Validation rules:** Reject variable names that conflict with CodeBuild reserved names (e.g. 
`CODEBUILD_*`, `AWS_*` built-ins) From b14a0849b324c20971bebaee153a208c87a6a2da Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Fri, 22 May 2026 14:01:10 -0400 Subject: [PATCH 24/27] adding supporting documentation for AWS Account Bootstrapping and how it could either support or be changed to work better with this project --- docs/account-bootstrap-analysis.md | 826 +++++++++++++++++++++++++++++ 1 file changed, 826 insertions(+) create mode 100644 docs/account-bootstrap-analysis.md diff --git a/docs/account-bootstrap-analysis.md b/docs/account-bootstrap-analysis.md new file mode 100644 index 0000000..e2bffb4 --- /dev/null +++ b/docs/account-bootstrap-analysis.md @@ -0,0 +1,826 @@ +# Account Repo Analysis & Bootstrap Automation Proposal + +**Date:** 2026-05-22 +**Status:** Proposed +**Author:** AI analysis of `account-repos` workspace +**Audience:** Platform Engineering / SCT-Engineering + +--- + +## 1. Executive Summary + +This document records a systematic analysis of the ~100 AWS account repositories +cloned under `~/git/account-repos` and maps the common structural elements into +a proposed series of **sc-lambda-ghactions workspaces** (Service Catalog products +backed by template repos) that can automate the full account bootstrap lifecycle. + +The analysis found that every account repo — regardless of account type, partition +(GovCloud vs commercial EW), or team ownership — follows a strictly ordered, +repeatable sequence of Terraform workspaces. The content of each workspace is +highly parameterized but structurally identical across accounts. This makes the +entire bootstrap sequence a strong candidate for sc-lambda-ghactions automation. + +A key finding is that the **git-secret / GPG credential system** is the primary +architectural blocker for headless automation of the `common/` layer (IAM foundation). +ADR-002's Vault AWS Secrets Engine, extended to cover Vault KV for provider +credentials, directly unblocks this. Section 6 covers this in detail. 
+ +Where full automation is not possible, this document states so explicitly and +explains why. + +--- + +## 2. Account Repo Structure — Universal Elements + +Every account directory under `account-repos/` contains the following top-level items. +Apart from the entries annotated below as specific to certain account types, no account was missing any of these. + +### 2.1 Top-Level Layout + +``` +{account-id}-{account-alias}/ +├── applications/ # app-workspace scaffold (most accounts) +│ └── structure/ # mirrors common/ infrastructure/ vpc/ as templates +├── common/ # IAM: policies, roles, groups, users, SAML, LDAP +├── credentials.d/ # per-region AWS credential .tf files +├── edl-automation/ # EDL-specific automation (EDL accounts only) +├── includes.d/ # shared variable definitions (tags) +├── infrastructure/ # TF state backend, S3 logs, CloudTrail, Config +├── init/ # git repo setup, git-secret, GPG key +│ ├── git-secret/ # team-member GPG public keys (.gpg.asc) +│ ├── git-setup/ # IaC to create/configure the GitHub repo +│ └── gpg-setup/ # account-specific GPG key generation +├── provider_configs.d/ # provider secrets: GitHub, LDAP, Infoblox, DNS +├── variables.d/ # variables.common.tf, variables.tfstate.tf, per-region .tfvars +├── vpc/ # VPC resources per region +├── INF.SETUP.md # step-by-step human bootstrap guide +├── README.md +├── TOP # high-level apply-phase sequence (non-apps repos) +├── outputs.common.tf +├── region.tf # locals { region = var.region } +└── tf-run.data # orchestration phases (TAG/COMMENT directives) +``` + +### 2.2 Within `common/` + +``` +common/ +├── INF.account-info.tf # module "account_settings" — alias, password policy +├── INF.general-policies.tf # managed + custom IAM policies +├── INF.saml.tf # IAM SAML provider +├── INF.ldap-ou-create.tf # base LDAP OU for the account +├── INF.role.inf-cloud-admin.tf +├── INF.group.inf-cloud-admin.tf +├── INF.role.inf-network-admin.tf +├── INF.role.inf-flowlogs.tf +├── INF.group.inf-ip-restriction.tf +├── INF.remote-roles.tf # additional SAML roles +├── 
INF.admin-user.{username}.tf # one file per admin user (variable count) +├── inf-cloud-admin.users.tf +├── INF.service.cloudforms.tf +├── sso/ # per-SSO-permission-set subdirectories +├── remote_state.backend.tf # symlink (→ .s3 or .none depending on state) +├── remote_state.common.tf +├── outputs.common.tf +├── region.tf +└── versions.tf +``` + +### 2.3 Within `infrastructure/` + +``` +infrastructure/ +├── INF.tfstate.tf # S3 bucket + DynamoDB for TF state +├── east/ # per-region workspace +│ ├── INF.cloudtrail.tf +│ ├── INF.config.tf +│ ├── INF.s3-access-logs.tf +│ ├── INF.s3-flow-logs.tf +│ ├── INF.object-logs.tf +│ ├── INF.dynamic-route53.tf +│ ├── INF.ses-domain.tf +│ ├── INF.preload-kms.tf +│ ├── INF.splunk-description.tf +│ ├── locals.tf +│ ├── region.tf +│ ├── remote_state.backend.tf +│ └── versions.tf +└── west/ # same structure as east/ +``` + +### 2.4 Provider Configurations (`provider_configs.d/`) + +Present in every account without exception: + +| File | Provider | +|---|---| +| `provider.github.tf` | GitHub Enterprise | +| `provider.github.variables.tf` | GitHub provider variables | +| `provider.github.auto.tfvars.secret` | GitHub PAT (git-secret encrypted) | +| `provider.ldap.tf` | LDAP (legacy) | +| `provider.ldap.variables.tf` | LDAP variables | +| `provider.ldap.auto.tfvars.secret` | LDAP bind password (git-secret encrypted) | +| `provider.ldap_new.tf` | LDAP (new provider) | +| `provider.ldap_new.variables.tf` | LDAP new variables | +| `provider.ldap_new.auto.tfvars.secret` | LDAP new bind password | +| `provider.dns.tf` | DNS (Infoblox/Route53) | +| `provider.infoblox.tf` | Infoblox (EW accounts only) | +| `provider.infoblox.variables.tf` | Infoblox variables (EW only) | +| `provider.infoblox.auto.tfvars.secret` | Infoblox creds (EW only) | +| `tf-run.data` | Phase orchestration for this layer | + +### 2.5 Variable Files (`variables.d/`) + +| File | Purpose | +|---|---| +| `variables.common.tf` | Variable declarations for all common inputs | +| 
`variables.tfstate.tf` | Variable declarations for state backend | +| `{region}.variables.common.auto.tfvars` | Per-region values (account_id, alias, region) | + +--- + +## 3. Bootstrap Phase Sequence + +All account repos share the same bootstrap execution order, controlled by `TOP` +and `tf-run.data`. The sequence is: + +``` +Phase 0: MANUAL — AWS account creation, initial bootstrap IAM user +Phase 1: init/ — GPG key, git-secret, GitHub repo +Phase 2: provider_configs.d/ — provider secret initialization +Phase 3: infrastructure/ (partial) — TF state backend (S3 + DynamoDB) +Phase 4: infrastructure/{region}/ — S3 access log buckets per region +Phase 5: common/ — IAM foundation (see ordered sub-steps below) +Phase 6: infrastructure/ (finalize) — flow log buckets, object logging, etc. +Phase 7: infrastructure/{region}/ — CloudTrail, Config, SES, Route53 per region +Phase 8: vpc/ — VPC per region +Phase 9: applications/structure/ — (if applicable) app workspace scaffold +``` + +### Phase 5 Sub-Steps (ordered dependencies) + +``` +5.1 general — managed_policies, custom_policies, custom_policy_documents +5.2 account_settings — account alias + IAM password policy +5.3 saml — IAM SAML provider +5.4 ldap_ou — base LDAP OU (prerequisite for all LDAP objects) +5.5 role inf-cloud-admin + group inf-cloud-admin (apply twice: create file, then LDAP object) +5.6 role inf-network-admin + group inf-network-admin +5.7 role inf-flowlogs +5.8 group inf-ip-restriction +5.9 splunk_user +5.10 service_cloudforms +5.11 admin user accounts (one tf file per user; parallel-safe within the group) +5.12 other SAML roles (remote roles — apply twice each) +``` + +--- + +## 4. 
Account Type Variations + +### 4.1 Partition + +The authoritative discriminator is the `aws_environment` variable in each account's `variables.d/*.variables.common.auto.tfvars`: + +| `aws_environment` value | Meaning | `credentials.d/` contents | Primary region | +|---|---|---|---| +| `"gov"` | AWS GovCloud (US) | `us-gov-east-1.credentials.tf`, `us-gov-west-1.credentials.tf` (2 files) | `us-gov-east-1` | +| `"ew"` | East-West commercial network zone | 17 commercial-region files (one account, `ent-ew-sectools-prod`, has 30 as newer regions were added) | `us-east-1` | + +**The `-gov` vs `-ew` name suffix does not reliably map to partition alone.** Examples: +- `ent-gov-operations-prod` → `aws_environment = "gov"` (GovCloud despite `-prod` suffix) +- `csvd-dev-ew` → has commercial-region credentials but is the GovCloud-linked commercial account for `csvd-dev-gov`, not a standalone commercial workload account +- `do2-prod` (no `-ew` suffix) → `aws_environment = "ew"` + +The `-ew` suffix, when present alongside a corresponding `-gov` account, designates the **GovCloud linked commercial account** — the pairing required by AWS for every GovCloud account. These accounts carry the standard 17-region commercial credential set but serve a different operational role than standalone commercial workload accounts. + +**Consequence for bootstrap automation:** `aws_environment` is the correct input field to use, not a derived partition value from the alias name. It must be supplied explicitly in the SC form. + +**Infoblox provider:** present in `provider_configs.d/` for `aws_environment = "ew"` accounts only. 
+ +### 4.2 Program/Team + +| Pattern | Directories added | Notes | +|---|---|---| +| `edl-*` | `edl-automation/` | EDL-specific automation harness | +| `ent-gov-network-*` | `vpc-shared/` instead of `applications/` | Network accounts share VPC | +| `_apps-{stack}` | Separate repo per application stack; `SUBMODULE` file replaces `TOP` | Each stack is its own GitHub repo | + +### 4.3 `_apps-*` Repos + +Accounts with application stacks have companion repos following the naming +convention `{account-id}-{alias}_apps-{stack-name}`. These share the same +`common/`, `credentials.d/`, `infrastructure/`, `provider_configs.d/` scaffolding +as the base account repo but contain a `SUBMODULE` orchestration file rather than +`TOP`. They are registered as GitHub submodules in the base account repo. + +--- + +## 5. Key Inputs Required Per New Account + +These are the minimum parameterized values needed to generate a complete account +repo from scratch: + +| Input | Examples | Notes | +|---|---|---| +| `account_id` | `001476713248` | 12-digit AWS account number | +| `account_alias` | `edl-core-dev-gov` | Used in all resource naming | +| `aws_environment` | `gov` or `ew` | Sourced directly from the account's tfvars; controls region set + Infoblox presence. Do not derive from the alias name. | +| `primary_region` | `us-gov-east-1` or `us-east-1` | Drives first-region workspace | +| `secondary_region` | `us-gov-west-1` or `us-west-2` | Drives second-region workspace | +| `program` | `edl`, `ent`, `ma`, `lab`, etc. 
| Controls edl-automation inclusion | +| `environment` | `dev`, `nonprod`, `prod`, `common` | Tags and policy scoping | +| `admin_users` | `[badra001, dwara001, ...]` | Generates `INF.admin-user.*.tf` files | +| `team_gpg_keys` | Map of username → GPG public key | Populates `init/git-secret/` | +| `github_org` | `SCT-Engineering` or specific org | For `init/git-setup/` | +| `github_repo_name` | `{account_id}-{alias}` | Usually derived from above | +| `tfstate_bucket` | `inf-tfstate-{account_id}` | S3 bucket for remote state | +| `app_stacks` | `[]` or `[adsd-eks, tco-imds]` | Whether to create `_apps-*` repos | +| `include_edl_automation` | `true/false` | EDL accounts only | +| `include_vpc_shared` | `true/false` | Network accounts only | + +--- + +## 6. GPG Keys and git-secret — What They Actually Protect + +Understanding the GPG/git-secret system precisely is critical to assessing what +can be automated and what ADR-002 (Vault) can eliminate. + +### 6.1 Two Distinct GPG Key Systems + +There are **two separate GPG key concepts** in every account repo, serving +different purposes: + +#### Account-specific GPG keypair (`init/gpg-setup/`) + +`tf apply` in `init/gpg-setup/` generates a unique GPG keypair for the account +(e.g. `tf-001476713248-edl-core-dev-gov`). Its purpose, from `INF.gpg-setup.md`: + +> "This key is used for encrypting specific resource values, such as **IAM +> passwords** or **IAM access keys**." + +It is **not** used to protect provider credentials. It encrypts the IAM console +passwords and AWS access keys that Terraform generates for admin users (module +`admin_{username}`) so those sensitive values can be committed to the repo and +distributed out-of-band without appearing in plaintext. 
The key artifacts are: + +| File | Contents | How stored | +|---|---|---| +| `tf-{account}.gpg.b64` | Public key (base64) | Plaintext in git; symlinked at `TOP/init/tf-gpg-key.b64` | +| `tf-{account}.gpg.asc` | Public key (ascii-armored) | Plaintext in git | +| `tf-{account}.gpg.secret-key.secret` | **Private key** | Encrypted by git-secret | + +The private key is itself protected by git-secret — you need a team member's +personal GPG key to retrieve it. + +#### Team member GPG public keys (`init/git-secret/*.gpg.asc`) + +One file per engineer, sourced from `terraform/support/keys/gpg-public-keys`. +These are the **recipients** for git-secret's multi-key encryption. Anyone whose +key is in this directory can run `git secret reveal` to decrypt the protected +files in the repo. Adding or removing an engineer requires: importing their key, +running `git-secret tell $EMAIL`, running `git-secret hide` (re-encrypts all +files for all current recipients), and committing the result. + +### 6.2 What git-secret Actually Encrypts + +The `.secret` extension marks a git-secret encrypted file. The plaintext +counterpart (without `.secret`) is gitignored. 
Encrypted files found across all +account repos: + +| Encrypted file | Plaintext contains | Used by | +|---|---|---| +| `provider_configs.d/provider.github.auto.tfvars.secret` | `github_token`, `github_org`, `github_url` | All TF workspaces using GitHub provider | +| `provider_configs.d/provider.ldap.auto.tfvars.secret` | `ldap_user`, `ldap_password` | All TF workspaces creating LDAP objects | +| `provider_configs.d/provider.ldap_new.auto.tfvars.secret` | `ldap_user`, `ldap_password` (new provider) | Same | +| `provider_configs.d/provider.infoblox.auto.tfvars.secret` | Infoblox API credentials | EW accounts only | +| `init/gpg-setup/tf-{account}.gpg.secret-key.secret` | Account GPG private key | Local operators who need to decrypt IAM passwords | +| `vpc/{region}/.../access_key.yml.secret` | IAM access keys for service accounts | Where AWS access keys are stored in VPC workloads | + +### 6.3 Why This Blocks CodeBuild Automation Today + +`git secret reveal` requires a GPG private key present in the local keychain. +CodeBuild has no such key — it was never designed to be a git-secret recipient. +As a result: + +- **Any TF workspace that sources the GitHub or LDAP provider** (i.e., `common/`, + any SSO workspace, any workspace in `provider_configs.d/`) **cannot be run by + the executor in its current form.** The `.auto.tfvars.secret` files would not + be decrypted, the provider would have empty credentials, and the apply would fail. + +- The executor **can** run workspaces that use only the AWS provider (e.g., + `infrastructure/`, `vpc/`) because those rely on STS credentials injected via + environment variables, not on git-secret-managed files. + +This is the core automation gap. It is not just a manual step — it is an +architectural incompatibility between git-secret and headless automation. 
+ +### 6.4 What Vault Can Replace + +ADR-002 is framed around AWS credential issuance (replacing static +`sts:AssumeRole`), but the Vault KV Secrets Engine can trivially extend to +replace git-secret for all provider credentials as well. The mapping is direct: + +| Today (git-secret) | With Vault KV | +|---|---| +| `provider.github.auto.tfvars.secret` → `git secret reveal` | `vault kv get secret/accounts/{alias}/github` → write `.auto.tfvars` at build time | +| `provider.ldap.auto.tfvars.secret` → `git secret reveal` | `vault kv get secret/accounts/{alias}/ldap` → write `.auto.tfvars` at build time | +| `provider.infoblox.auto.tfvars.secret` → `git secret reveal` | `vault kv get secret/accounts/{alias}/infoblox` → write `.auto.tfvars` at build time | +| Account GPG private key → `git secret reveal` | `vault kv get secret/accounts/{alias}/gpg-private-key` → decrypt IAM passwords at build time | + +The executor buildspec would add a `vault kv get` call per needed provider before +running `tf-init`/`tf-run`, injecting the plaintext credentials as temporary +files that are never committed. This replaces the entire `git secret reveal` +ceremony and eliminates the need for any team member to maintain GPG keys in a +git repo. + +### 6.5 What Vault Cannot Eliminate + +Even with Vault managing all secrets, two manual steps survive: + +1. **Account-specific GPG keypair generation (M4):** The `init/gpg-setup/` + module still generates a keypair used to encrypt IAM passwords that Terraform + outputs. If the Terraform `admin-user` module is redesigned to deliver + passwords via Vault KV (i.e., `vault kv put secret/accounts/{alias}/users/ + {username}/password $(tf output password)`) rather than GPG-encrypted files + in the repo, this step becomes unnecessary. This is an account-module change, + not a sc-lambda-ghactions change. + +2. 
**Bootstrapping Vault itself with the first account credential:** The very + first time a new account is bootstrapped, Vault does not yet have that + account's LDAP password or GitHub PAT. An operator must do a one-time + `vault kv put` for each credential. This is a single 3-command operation per + credential per account — far simpler than the full git-secret ceremony — and + can be performed by a central platform team without any access to the target + account's GPG keychain. + +### 6.6 Vault Scope Expansion Summary + +If ADR-002 is implemented and extended to cover provider credentials via Vault KV: + +| Manual step | Current status | With Vault | +|---|---|---| +| M4 — GPG keypair generation | Required per account | Eliminated if admin-user module writes passwords to Vault KV | +| M5 — Team member GPG key collection | Required per account per new team member | Eliminated — no git-secret recipients needed | +| M6 — `*.auto.tfvars.secret` encryption | Required per credential per account | Replaced by one `vault kv put` per credential (one-time, central team) | +| M10 — LDAP objects in `common/` | Currently blocked for CodeBuild | Unblocked — executor reads LDAP credentials from Vault at build time | + +The practical effect: implementing Vault KV for provider credentials **unlocks +full automation of `common/`** — the largest and most complex bootstrap workspace +— which is currently the hardest manual phase. + +--- + +## 7. Proposed sc-lambda-ghactions Workspace Series + +The following describes each proposed Service Catalog product / template repo +needed to automate the bootstrap sequence. They map directly onto the phase +sequence in section 3. + +Each workspace corresponds to one sc-lambda-ghactions **Proposer invocation** +(one PR, one executor run). They are ordered by dependency. 
+ +--- + +### Workspace 0: `bootstrap-account-repo` + +**Template repo:** `template-bootstrap-account-repo` +**Layer:** `init` (special — not a standard TF layer; this creates the repo itself) +**Purpose:** Create the GitHub repo, set branch protections, configure teams, +write the top-level scaffold files (`TOP`, `tf-run.data`, `region.tf`, +`outputs.common.tf`, `README.md`, `INF.SETUP.md`). + +**Inputs:** +- `account_id`, `account_alias`, `aws_environment`, `program`, `environment` +- `primary_region`, `secondary_region` +- `github_org`, `github_teams` (list with permissions) +- `admin_users` (for `INF.SETUP.md` generation) + +**Rendered outputs:** +- `init/git-setup/INF.repo-setup.tf` — GitHub repo resource + team membership +- `TOP` — apply phase sequence file +- `tf-run.data` — orchestration file +- `README.md`, `INF.SETUP.md` — human documentation + +**What the executor does:** `tf apply` in `init/git-setup/` creates the GitHub +repo via the GitHub Terraform provider. + +> ⚠️ **Cannot be automated:** GPG key generation for `init/gpg-setup/` requires +> a human with GnuPG to generate the account-specific keypair, encrypt it, and +> commit the `.gpg.asc` and `.gpg.b64` artifacts. The private key must be +> distributed out-of-band to account operators. This step remains manual. + +> ⚠️ **Cannot be automated:** Each team member's GPG public key (`init/git-secret/ +> {username}.gpg.asc`) must be provided by the individual. The proposer can +> render the `git-secret` setup script, but the keys themselves must come from +> a known-good source (e.g., a team keyring registry). If such a registry exists +> in Secrets Manager or SSM, this can be automated; otherwise it remains manual. 
+ +--- + +### Workspace 1: `bootstrap-provider-configs` + +**Template repo:** `template-provider-configs` +**Layer:** `provider_configs.d` +**Region dir:** `global` (no region scoping for this layer) +**Purpose:** Render all provider configuration `.tf` files and stub out the +encrypted secret placeholders. Sets up the GitHub, LDAP, LDAP-new, DNS, and +(if `aws_environment = "ew"`) Infoblox providers. + +**Inputs:** `account_id`, `account_alias`, `aws_environment` + +**Rendered outputs:** +- `provider_configs.d/provider.github.tf` +- `provider_configs.d/provider.ldap.tf`, `provider.ldap_new.tf` +- `provider_configs.d/provider.dns.tf` +- `provider_configs.d/provider.infoblox.tf` (EW only) +- `provider_configs.d/tf-run.data` +- All `variables.tf` counterparts + +> ⚠️ **Cannot be automated without Vault KV (see section 6):** The `*.auto.tfvars.secret` +> files (GitHub PAT, LDAP bind password, Infoblox credentials) are git-secret encrypted +> files that CodeBuild cannot decrypt. With ADR-002 extended to Vault KV, the executor +> would instead call `vault kv get secret/accounts/{alias}/{provider}` at build time and +> write the credentials as temporary files before `tf-init` — replacing git-secret entirely. +> Until that is implemented, an operator must manually `vault kv put` (or `git-secret hide`) +> the credentials once per account. This is also what gates full automation of `common/` +> (Workspace 6), which uses both the LDAP and GitHub providers. + +--- + +### Workspace 2: `bootstrap-credentials` + +**Template repo:** `template-credentials` +**Layer:** `credentials.d` +**Region dir:** `global` +**Purpose:** Generate the per-region AWS credential provider `.tf` files. 
+ +**Inputs:** `account_id`, `account_alias`, `aws_environment` (`gov` or `ew`) +- `gov`: generates `us-gov-east-1.credentials.tf`, `us-gov-west-1.credentials.tf` +- `ew`: generates all 17 (or 30 for newer builds) commercial-region credential files + +**Rendered outputs:** `credentials.d/{region}.credentials.tf` for each region + +**What the executor does:** No TF apply needed; this is a file generation step. +The executor can be set to `DRY_RUN=true` for this workspace — the files just +need to be committed. + +--- + +### Workspace 3: `bootstrap-variables` + +**Template repo:** `template-variables` +**Layer:** `variables.d` +**Region dir:** `global` +**Purpose:** Generate `variables.common.tf`, `variables.tfstate.tf`, and per-region +`{region}.variables.common.auto.tfvars` files. + +**Inputs:** `account_id`, `account_alias`, `aws_environment`, `primary_region`, +`secondary_region`, `environment`, `program`, `tfstate_bucket` + +**Rendered outputs:** +- `variables.d/variables.common.tf` +- `variables.d/variables.tfstate.tf` +- `variables.d/{primary_region}.variables.common.auto.tfvars` +- `variables.d/{secondary_region}.variables.common.auto.tfvars` +- `includes.d/variables.account_tags.tf` +- `includes.d/variables.application_tags.tf` +- `includes.d/variables.infrastructure_tags.tf` + +**What the executor does:** `DRY_RUN=true` — file generation only. + +--- + +### Workspace 4: `bootstrap-infrastructure-tfstate` + +**Template repo:** `template-infrastructure-tfstate` +**Layer:** `infrastructure` +**Region dir:** `global` +**Purpose:** Bootstrap the Terraform state backend: S3 bucket + DynamoDB table. +This is the first workspace that touches real AWS infrastructure. + +**Prerequisite:** AWS bootstrap IAM user created manually (see section 8, +manual step M2). This executor run assumes the bootstrap user's credentials.
+ +**Inputs:** `account_id`, `account_alias`, `aws_environment`, `primary_region`, +`tfstate_bucket` + +**Rendered outputs:** +- `infrastructure/INF.tfstate.tf` +- `infrastructure/remote_state.backend.tf` (→ `.none` initially; executor re-links to `.s3`) +- `infrastructure/tf-run.data` +- `infrastructure/region.tf`, `versions.tf` + +**What the executor does:** +1. `tf apply -target=module.tfstate` — creates S3 + DynamoDB +2. Re-links `remote_state.backend.tf` → `.s3` (commits back to `main`) + +> ⚠️ **Cannot be automated:** Initial application of this workspace requires the +> `bootstrap` IAM user's credentials, which exist only in AWS Console and must +> be manually provided to CodeBuild (e.g., via SSM SecureString or injected as +> build-time overrides). One approach: after the account is created, an operator +> stores the bootstrap credentials in Secrets Manager under a known path, the +> executor reads them, applies, then the `common/` phase rotates to real users. +> This is architecturally possible but requires an agreed credential handoff +> convention not yet established. + +--- + +### Workspace 5: `bootstrap-infrastructure-regional-logs` + +**Template repo:** `template-infrastructure-regional-logs` +**Layer:** `infrastructure` +**Region dir:** `{primary_region}` then `{secondary_region}` (two separate runs) +**Purpose:** Create the `inf-logs-{account}-{region}` S3 access log bucket in +each region. Required before any ALB, S3 bucket, or object-log resources can be +configured. + +**Inputs:** `account_id`, `account_alias`, `aws_environment`, target region + +**Rendered outputs:** +- `infrastructure/{region}/INF.s3-access-logs.tf` +- `infrastructure/{region}/region.tf`, `remote_state.backend.tf`, `versions.tf` + +**What the executor does:** `tf apply -target=module.logs` per region. 
+ +--- + +### Workspace 6: `bootstrap-common` + +**Template repo:** `template-common` +**Layer:** `common` +**Region dir:** `global` (common layer has no per-region split) +**Purpose:** Render and apply all IAM foundation resources in dependency order. + +This is the largest and most complex bootstrap workspace. Because of the +ordered dependency chain within `common/` (see section 3, Phase 5 sub-steps), +the executor must respect `TF_RUN_START_TAG` to resume from a given step. + +**Inputs:** +- `account_id`, `account_alias`, `aws_environment`, `environment`, `program` +- `admin_users` list (one `INF.admin-user.{username}.tf` per entry) +- `saml_provider_metadata` (SAML XML metadata from identity provider) +- `ldap_base_dn`, `ldap_account_ou` + +**Rendered outputs:** +- `common/INF.account-info.tf` +- `common/INF.general-policies.tf` +- `common/INF.saml.tf` +- `common/INF.ldap-ou-create.tf` +- `common/INF.role.inf-cloud-admin.tf` +- `common/INF.group.inf-cloud-admin.tf` +- `common/INF.role.inf-network-admin.tf` +- `common/INF.role.inf-flowlogs.tf` +- `common/INF.group.inf-ip-restriction.tf` +- `common/INF.service.cloudforms.tf` +- `common/INF.admin-user.{username}.tf` for each user in `admin_users` +- `common/inf-cloud-admin.users.tf` +- `common/remote_state.backend.tf`, `remote_state.common.tf` +- `common/outputs.common.tf`, `region.tf`, `versions.tf` +- `common/tf-run.data` (ordered TAG sequence matching section 3 Phase 5) + +**What the executor does:** +Runs `tf-run apply` which walks the `TAG` sequence in `common/tf-run.data`. +The `TF_RUN_START_TAG` env var allows resuming after a partial failure. + +> ⚠️ **Partially automatable — SAML metadata:** The SAML provider metadata XML +> must be obtained from the identity provider (e.g., Okta or ADFS) and passed +> as an input. If the IdP is Okta and an API exists, this can be automated. If +> the metadata is managed manually, it must be provided by an operator. 
+ +> ⚠️ **Partially automatable — LDAP:** The `INF.ldap-ou-create.tf` module +> requires LDAP bind credentials in `provider_configs.d/` to be decryptable at +> runtime. Today those credentials are git-secret encrypted (Workspace 1), and +> CodeBuild cannot decrypt them (see section 6.3), so the LDAP steps require +> manual intervention. Once ADR-002 is extended to Vault KV (section 6.4), the +> executor reads them at build time and the LDAP steps are fully automated. + +> ⚠️ **Two-pass apply for SAML roles:** Each `INF.role.*.tf` that creates LDAP +> objects requires two sequential `tf apply` calls (first creates a local file, +> second creates the LDAP object). The executor's `tf-run.data` TAG sequence +> handles this natively — no special tooling needed — but the operator must +> ensure the `common/tf-run.data` TAG ordering encodes the two-pass pattern. + +--- + +### Workspace 7: `bootstrap-infrastructure-finalize` + +**Template repo:** `template-infrastructure-finalize` +**Layer:** `infrastructure` +**Region dir:** `{primary_region}` and `{secondary_region}` (two runs) +**Purpose:** Apply the remaining infrastructure resources after `common/` is +complete (which provides the SAML roles required for flow-log and object-log +bucket policies). + +**Inputs:** `account_id`, `account_alias`, `aws_environment`, region + +**Rendered outputs per region:** +- `infrastructure/{region}/INF.s3-flow-logs.tf` +- `infrastructure/{region}/INF.object-logs.tf` +- `infrastructure/{region}/INF.cloudtrail.tf` +- `infrastructure/{region}/INF.config.tf` +- `infrastructure/{region}/INF.dynamic-route53.tf` +- `infrastructure/{region}/INF.ses-domain.tf` +- `infrastructure/{region}/INF.preload-kms.tf` +- `infrastructure/{region}/INF.splunk-description.tf` +- `infrastructure/{region}/locals.tf` + +**What the executor does:** `tf-run apply` walking the TAG sequence in the +regional `tf-run.data`.
+ +--- + +### Workspace 8: `bootstrap-vpc` + +**Template repo:** `template-vpc` +**Layer:** `vpc` +**Region dir:** `{primary_region}` and `{secondary_region}` (two runs) +**Purpose:** Create the VPC and associated networking resources in each region. + +**Inputs:** +- `account_id`, `account_alias`, `aws_environment`, region +- `vpc_cidr`, `subnet_cidrs` (map of AZ → CIDR) +- `vpc_name` (usually derived from account alias) +- Network account ID for VPC sharing (if applicable) + +**Rendered outputs:** +- `vpc/{region}/INF.vpc.tf` +- `vpc/{region}/INF.subnets.tf` +- `vpc/{region}/INF.tgw-attachment.tf` (if transit gateway) +- `vpc/{region}/region.tf`, `remote_state.backend.tf`, `versions.tf` + +**What the executor does:** `tf-run apply` applying VPC resources. + +> ⚠️ **Cannot be fully automated:** VPC CIDR allocation must be coordinated with +> the network team's IPAM system. The CIDRs cannot be derived automatically +> without an IPAM API integration. An operator must supply them via SC form +> inputs or the allocation must be read from an external registry (e.g., Infoblox +> or an internal IPAM). + +--- + +### Workspace 9 (optional): `bootstrap-applications-structure` + +**Template repo:** `template-applications-structure` +**Layer:** `applications` +**Region dir:** `structure` +**Purpose:** Scaffold the `applications/structure/` directories that mirror +`common/`, `infrastructure/`, and `vpc/` as templates for app teams. Only needed +for accounts that will host application stacks. 
+ +**Inputs:** `account_id`, `account_alias`, `aws_environment`, `primary_region`, +`secondary_region`, `app_stacks` (list of stack names for `_apps-*` repos) + +**Rendered outputs:** +- `applications/structure/common/`, `infrastructure/`, `vpc/` scaffold files +- Symlinks matching the base account repo pattern + +--- + +### Workspace 10 (optional): `bootstrap-apps-repo` + +**Template repo:** `template-apps-repo` +**Layer:** `init` (creates a new GitHub repo) +**Purpose:** Create and scaffold a `{account-id}-{alias}_apps-{stack-name}` repo +for each application stack, registering it as a submodule of the base account repo. + +Repeat once per stack name in `app_stacks`. + +--- + +## 8. Manual Steps That Cannot Be Automated + +The following steps are explicitly outside the scope of sc-lambda-ghactions +automation in its current form. Each has the reason stated. + +| # | Step | Why It Cannot Be Automated | Potential Future Path | +|---|---|---|---| +| M1 | AWS account creation | Account Vending Machine / AWS Organizations automation is out of scope for this system | Integrate with AWS Control Tower or an internal AVM product | +| M2 | `bootstrap` IAM user creation | Requires AWS Console + AdminAccess before any IaC exists | Control Tower / account vending pre-creates a bootstrap role | +| M3 | Bootstrap IAM credentials handoff | Access key + secret for the bootstrap user must be securely handed to the first executor run | Store in Secrets Manager during AVM; executor reads from known path | +| M4 | GPG keypair generation (`init/gpg-setup/`) | Generates keypair used to encrypt Terraform-created IAM passwords in the repo | Eliminated if admin-user module is updated to write passwords to Vault KV instead of GPG-encrypting them in the repo (see section 6.5) | +| M5 | Team member GPG public key collection | `git secret tell` requires each engineer's public key as a git-secret recipient | **Fully eliminated by ADR-002 + Vault KV** — no git-secret recipients needed when 
secrets live in Vault (see section 6.4) | +| M6 | `*.auto.tfvars.secret` encryption | git-secret requires the account GPG key on the operator's keychain; CodeBuild cannot decrypt these files | **Substantially eliminated by ADR-002 extended to Vault KV** — replaced by one `vault kv put` per credential; this also unblocks `common/` automation (see section 6.3–6.4) | +| M7 | SAML provider metadata XML | Must be retrieved from the IdP (Okta, ADFS, etc.) | Automate if IdP has an API; otherwise operator pastes metadata into SC form | +| M8 | VPC CIDR allocation | CIDRs must come from an IPAM system | Automate via Infoblox API or internal IPAM product integration | +| M9 | `bootstrap` user rotation | After admin users are created, the bootstrap user's access key must be disabled and the TF import performed | Low complexity; could be a separate SC product (`import-bootstrap-user`) | +| M10 | Two-pass SAML role applies | LDAP objects require `tf apply` twice per role | Already handled by `tf-run.data` TAG sequence; not a manual step if executor is running cleanly | +| M11 | Initial `git checkout -b initial-setup` push | Per `common/INF.SETUP.md` — the first clean `git push` after `common/` complete | Could be part of executor post-apply commit; low risk to automate | + +--- + +## 9. Mapping to sc-lambda-ghactions Concepts + +### 9.1 `.sc-automation.yml` per workspace + +Each workspace PR committed by the Proposer includes a `.sc-automation.yml` +written to the account repo root. Because multiple workspaces touch the same +repo, the convention must be to write workspace-specific YAML -- or scope the +file to the workspace layer using the `.sc-automation.yml` path scoping +already built into the webhook handler.
+ +Proposed convention: +```yaml +# .sc-automation.yml written by template-bootstrap-common proposer +account_repo: 001476713248-edl-core-dev-gov +layer: common +region_dir: global +target_account_id: "001476713248" +dry_run: false +tf_run_start_tag: "" # set to a TAG label to resume from a partial failure +``` + +### 9.2 Cross-account IAM role + +Every PR-merge-triggered executor run for a target account requires +`sc-automation-codebuild-role` to exist in that account. During bootstrap, +this role does not yet exist at the time Workspace 4 runs. Two options: + +**Option A (recommended):** The AWS account vending process (Control Tower or +AVM) pre-creates `sc-automation-codebuild-role` as part of the account baseline +before any bootstrap workspace runs. This is the cleanest design. + +**Option B:** Workspace 4 (`bootstrap-infrastructure-tfstate`) runs in the target +account using the bootstrap user's static credentials injected via Secrets Manager +rather than `sts:AssumeRole`. After `common/` creates the real admin users and +the `sc-automation-codebuild-role` can itself be applied as a module in `common/`, +all subsequent workspaces use the standard assume-role path. + +### 9.3 Template repo versioning + +Each template repo (section 7) should be tagged (`v1.0.0`, `v1.1.0`, etc.) and +the CFN product template for that workspace should pin `template_repo_ref`. This +ensures that re-running a bootstrap workspace on an existing account (e.g., to +add a new admin user) uses the exact same templates that created the account. + +### 9.4 Ordered product invocation via the SC console + +The 11 workspace products (Workspaces 0–10) should be presented in an SC portfolio named +**Account Bootstrap** with a display order that mirrors the dependency sequence. +There is no current mechanism in sc-lambda-ghactions to enforce ordering between +products — the human operator is responsible for launching them in sequence.
+ +A future enhancement could add a dependency state machine (Step Functions or +DynamoDB tracking) to block Workspace N from launching until Workspace N-1 +has a successful executor commit status. This is out of scope for the initial +implementation. + +--- + +## 10. Template Repo Summary + +| Workspace | Template Repo | TF Layer | Primary SC Input Fields | +|---|---|---|---| +| 0 | `template-bootstrap-account-repo` | `init/git-setup` | account_id, alias, aws_environment, github_teams, admin_users | +| 1 | `template-provider-configs` | `provider_configs.d` | account_id, alias, aws_environment | +| 2 | `template-credentials` | `credentials.d` | account_id, alias, aws_environment | +| 3 | `template-variables` | `variables.d` | account_id, alias, aws_environment, regions, environment, program | +| 4 | `template-infrastructure-tfstate` | `infrastructure` | account_id, alias, aws_environment, primary_region, tfstate_bucket | +| 5 | `template-infrastructure-regional-logs` | `infrastructure/{region}` | account_id, alias, region | +| 6 | `template-common` | `common` | account_id, alias, aws_environment, admin_users, saml_metadata, ldap config | +| 7 | `template-infrastructure-finalize` | `infrastructure/{region}` | account_id, alias, region | +| 8 | `template-vpc` | `vpc/{region}` | account_id, alias, region, vpc_cidr, subnet_cidrs | +| 9 | `template-applications-structure` | `applications/structure` | account_id, alias, aws_environment, regions, app_stacks | +| 10 | `template-apps-repo` | `init/git-setup` (new repo) | account_id, alias, stack_name | + +--- + +## 11. Phased Implementation Recommendation + +Given the complexity of the full sequence, this implementation plan stages the +work into three phases aligned with the manual blockers: + +### Phase 1 — Structural scaffolding (no executing Terraform) +Workspaces 0, 1, 2, 3 — these produce only committed files and repo configuration. +They do not require the bootstrap IAM user or any running AWS infrastructure. 
+High confidence of automation; implement first. + +### Phase 2 — Infrastructure foundation (requires bootstrap credential handoff) +Workspaces 4 and 5 — these apply Terraform against the new account. +Blocked on establishing the bootstrap credential convention (M1–M3). +Implement after the credential handoff pattern is agreed. + +### Phase 3 — IAM, VPC, app structure +Workspaces 6, 7, 8, 9, 10 — these are the most account-specific and depend on +secrets (LDAP, SAML metadata) and IPAM allocation. +The git-secret dependency (M6) is the largest blocker; see section 6 for a full +analysis. Retiring git-secret in favor of Vault KV (extending ADR-002) is a +prerequisite for full unattended automation of `common/`. With Vault KV in place, +the only remaining manual inputs for `common/` are SAML metadata and VPC CIDRs. + +--- + +## 12. Comparison to Current State + +| Aspect | Today (manual `INF.SETUP.md`) | With proposed sc-lambda-ghactions products | +|---|---|---| +| Repo creation | Manual `git init` + GitHub API | Automated — Workspace 0 PR + executor | +| Provider file generation | Hand-edit per account | Automated — Workspace 1 | +| Credentials file generation | Hand-edit per region | Automated — Workspace 2 | +| TF state bootstrap | Manual CLI commands | Automated — Workspace 4 | +| IAM roles/groups/users | Ordered manual `tf apply` per module | Automated — Workspace 6 TAG sequence | +| VPC | Manual `tf apply` | Automated — Workspace 8 (requires CIDR input) | +| Secrets management | git-secret (manual encrypt cycle) | Manual until ADR-002 | +| Time to first usable account | Days | Hours (Phase 1+2 only); minutes if Phase 3 secrets are available | +| Auditability | `git log` in account repo | PR per workspace in GitHub; CodeBuild logs; GHE commit status | +| Repeatability | Operator knowledge / `INF.SETUP.md` | SC product form fields; idempotent Proposer | From 66838cfa508ec7829963fd3960a6dcdeab6476dc Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Thu, 28 May 2026 
12:22:31 -0400 Subject: [PATCH 25/27] docs: update CHECKPOINT with Jira sub-tasks and add decision documents index; create ADR-003 for Vault cluster topology; create ADR-004 for account baseline IAM role; create ADR-005 for Service Catalog portfolio sharing strategy --- design-docs/CHECKPOINT.md | 29 +++- docs/account-bootstrap-analysis.md | 68 ++++++-- docs/decisions/003-vault-cluster-topology.md | 134 ++++++++++++++ .../004-account-baseline-iam-role.md | 164 ++++++++++++++++++ docs/decisions/005-portfolio-org-sharing.md | 141 +++++++++++++++ 5 files changed, 516 insertions(+), 20 deletions(-) create mode 100644 docs/decisions/003-vault-cluster-topology.md create mode 100644 docs/decisions/004-account-baseline-iam-role.md create mode 100644 docs/decisions/005-portfolio-org-sharing.md diff --git a/design-docs/CHECKPOINT.md b/design-docs/CHECKPOINT.md index 702551d..bfddd14 100644 --- a/design-docs/CHECKPOINT.md +++ b/design-docs/CHECKPOINT.md @@ -2,7 +2,34 @@ ## 1. Last Updated -**2026-05-06** — Implementation complete: Phases 1–3 fully built and committed. +**2026-05-28** — Jira sub-tasks created under CSC-1341; three new ADRs added to `docs/decisions/`. + +--- + +## 1a. 
Jira Ticket Index + +Parent: **[CSC-1341](https://jira.it.census.gov/browse/CSC-1341)** — [sc-lambda-ghactions] Design & implement next-gen SC automation system + +| Key | Summary | Priority | Status | ADR | +|-----|---------|----------|--------|-----| +| [CSC-1342](https://jira.it.census.gov/browse/CSC-1342) | Build and push Lambda container image to ECR (via packer-pipeline) | High | To Do | — | +| [CSC-1343](https://jira.it.census.gov/browse/CSC-1343) | End-to-end test: SC provision → CodeBuild → tf-run → PR → CFN SUCCESS | High | To Do | — | +| [CSC-1344](https://jira.it.census.gov/browse/CSC-1344) | Provision account baseline IAM role (sc-automation-codebuild-role) | High | To Do | [ADR-004](../docs/decisions/004-account-baseline-iam-role.md) | +| [CSC-1345](https://jira.it.census.gov/browse/CSC-1345) | ADR-002: Implement Vault AWS Secrets Engine for cross-account credentials | High | To Do | [ADR-002](../docs/decisions/002-vault-aws-secrets-engine.md) | +| [CSC-1346](https://jira.it.census.gov/browse/CSC-1346) | Vault cluster topology decision | Medium | To Do | [ADR-003](../docs/decisions/003-vault-cluster-topology.md) | +| [CSC-1348](https://jira.it.census.gov/browse/CSC-1348) | OU sharing and StackSet for Service Catalog portfolio | Medium | To Do | [ADR-005](../docs/decisions/005-portfolio-org-sharing.md) | +| [CSC-1349](https://jira.it.census.gov/browse/CSC-1349) | Migration runbook: lambda-template-repo-generator → sc-lambda-ghactions | Medium | To Do | — | +| [CSC-1350](https://jira.it.census.gov/browse/CSC-1350) | Phase 4 observability: CloudWatch dashboard + SNS alerts on FAILED builds | Low | To Do | — | + +**Decision documents index:** + +| ADR | File | Status | Linked tickets | +|-----|------|--------|---------------| +| ADR-001 | [docs/decisions/001-webhook-auto-apply.md](../docs/decisions/001-webhook-auto-apply.md) | Accepted | — | +| ADR-002 | [docs/decisions/002-vault-aws-secrets-engine.md](../docs/decisions/002-vault-aws-secrets-engine.md) | 
Proposed | CSC-1345 | +| ADR-003 | [docs/decisions/003-vault-cluster-topology.md](../docs/decisions/003-vault-cluster-topology.md) | Proposed | CSC-1346 | +| ADR-004 | [docs/decisions/004-account-baseline-iam-role.md](../docs/decisions/004-account-baseline-iam-role.md) | Accepted | CSC-1344, CSC-1348 | +| ADR-005 | [docs/decisions/005-portfolio-org-sharing.md](../docs/decisions/005-portfolio-org-sharing.md) | Proposed | CSC-1348 | --- diff --git a/docs/account-bootstrap-analysis.md b/docs/account-bootstrap-analysis.md index e2bffb4..8a1a214 100644 --- a/docs/account-bootstrap-analysis.md +++ b/docs/account-bootstrap-analysis.md @@ -46,10 +46,10 @@ top-level items. No account was missing any of these. ├── edl-automation/ # EDL-specific automation (EDL accounts only) ├── includes.d/ # shared variable definitions (tags) ├── infrastructure/ # TF state backend, S3 logs, CloudTrail, Config -├── init/ # git repo setup, git-secret, GPG key -│ ├── git-secret/ # team-member GPG public keys (.gpg.asc) +├── init/ # git repo setup; git-secret/gpg-setup present in legacy repos only +│ ├── git-secret/ # ⚠️ legacy — team-member GPG public keys; eliminated with Vault │ ├── git-setup/ # IaC to create/configure the GitHub repo -│ └── gpg-setup/ # account-specific GPG key generation +│ └── gpg-setup/ # ⚠️ legacy — account-specific GPG key generation; eliminated with Vault ├── provider_configs.d/ # provider secrets: GitHub, LDAP, Infoblox, DNS ├── variables.d/ # variables.common.tf, variables.tfstate.tf, per-region .tfvars ├── vpc/ # VPC resources per region @@ -146,7 +146,7 @@ and `tf-run.data`. 
The sequence is: ``` Phase 0: MANUAL — AWS account creation, initial bootstrap IAM user -Phase 1: init/ — GPG key, git-secret, GitHub repo +Phase 1: init/ — GitHub repo creation (GPG/git-secret eliminated with Vault) Phase 2: provider_configs.d/ — provider secret initialization Phase 3: infrastructure/ (partial) — TF state backend (S3 + DynamoDB) Phase 4: infrastructure/{region}/ — S3 access log buckets per region @@ -231,7 +231,6 @@ repo from scratch: | `program` | `edl`, `ent`, `ma`, `lab`, etc. | Controls edl-automation inclusion | | `environment` | `dev`, `nonprod`, `prod`, `common` | Tags and policy scoping | | `admin_users` | `[badra001, dwara001, ...]` | Generates `INF.admin-user.*.tf` files | -| `team_gpg_keys` | Map of username → GPG public key | Populates `init/git-secret/` | | `github_org` | `SCT-Engineering` or specific org | For `init/git-setup/` | | `github_repo_name` | `{account_id}-{alias}` | Usually derived from above | | `tfstate_bucket` | `inf-tfstate-{account_id}` | S3 bucket for remote state | @@ -326,25 +325,56 @@ replace git-secret for all provider credentials as well. 
The mapping is direct: | `provider.github.auto.tfvars.secret` → `git secret reveal` | `vault kv get secret/accounts/{alias}/github` → write `.auto.tfvars` at build time | | `provider.ldap.auto.tfvars.secret` → `git secret reveal` | `vault kv get secret/accounts/{alias}/ldap` → write `.auto.tfvars` at build time | | `provider.infoblox.auto.tfvars.secret` → `git secret reveal` | `vault kv get secret/accounts/{alias}/infoblox` → write `.auto.tfvars` at build time | -| Account GPG private key → `git secret reveal` | `vault kv get secret/accounts/{alias}/gpg-private-key` → decrypt IAM passwords at build time | +| Account GPG private key (encrypts IAM passwords in repo) | **Eliminated** — admin-user module writes passwords directly to `vault kv put secret/accounts/{alias}/users/{username}`; no GPG, no `.secret` files | -The executor buildspec would add a `vault kv get` call per needed provider before +The executor buildspec adds a `vault kv get` call per needed provider before running `tf-init`/`tf-run`, injecting the plaintext credentials as temporary files that are never committed. This replaces the entire `git secret reveal` -ceremony and eliminates the need for any team member to maintain GPG keys in a -git repo. +ceremony. **The `.gitsecret/` directory, `init/gpg-setup/`, and `init/git-secret/` +are eliminated from all new account repos** — they are artifacts of the old system +and have no role in Vault-managed accounts. + +#### CodeBuild authentication to Vault + +CodeBuild authenticates to Vault using the **AWS auth method** — no credentials +are injected, stored, or rotated. CodeBuild proves its identity via +`sts:GetCallerIdentity`; Vault verifies the IAM role ARN directly with AWS. 
+ +The proposer's `tf apply` provisions the Vault auth role for each new account: + +```hcl +resource "vault_aws_auth_backend_role" "codebuild" { + backend = "aws" + role = "sc-automation-${var.account_id}" + auth_type = "iam" + bound_iam_principal_arns = [ + "arn:${var.partition}:iam::${var.account_id}:role/sc-automation-codebuild-role" + ] + token_policies = ["sc-automation-${var.account_id}"] + token_ttl = 900 +} +``` + +Then in the executor buildspec: + +```bash +vault login -method=aws -no-print role=sc-automation-${ACCOUNT_ID} +GITHUB_TOKEN=$(vault kv get -field=github_token secret/accounts/${ACCOUNT_ALIAS}/github) +``` + +No AppRole, no Secret IDs, nothing in Secrets Manager. The IAM role *is* the credential. ### 6.5 What Vault Cannot Eliminate -Even with Vault managing all secrets, two manual steps survive: +With Vault managing all secrets and CodeBuild authenticating via the AWS auth +method, one manual step survives (item 2); item 1 is kept only to record its removal: -1. **Account-specific GPG keypair generation (M4):** The `init/gpg-setup/` - module still generates a keypair used to encrypt IAM passwords that Terraform - outputs. If the Terraform `admin-user` module is redesigned to deliver - passwords via Vault KV (i.e., `vault kv put secret/accounts/{alias}/users/ - {username}/password $(tf output password)`) rather than GPG-encrypted files - in the repo, this step becomes unnecessary. This is an account-module change, - not a sc-lambda-ghactions change. +1. **Account-specific GPG keypair generation (M4): Eliminated.** The + `init/gpg-setup/` directory and the entire `.gitsecret/` tree are dropped from + new account repos. The `admin-user` module writes IAM passwords directly to Vault KV + (`vault kv put secret/accounts/{alias}/users/{username}`) rather than to + GPG-encrypted files. Operators retrieve passwords via `vault kv get` using + their own IAM credentials — no GPG toolchain required at any point. 2.
**Bootstrapping Vault itself with the first account credential:** The very first time a new account is bootstrapped, Vault does not yet have that @@ -360,8 +390,8 @@ If ADR-002 is implemented and extended to cover provider credentials via Vault K | Manual step | Current status | With Vault | |---|---|---| -| M4 — GPG keypair generation | Required per account | Eliminated if admin-user module writes passwords to Vault KV | -| M5 — Team member GPG key collection | Required per account per new team member | Eliminated — no git-secret recipients needed | +| M4 — GPG keypair generation | Required per account | **Eliminated** — `init/gpg-setup/` and `.gitsecret/` removed from new repos; IAM passwords go to Vault KV | +| M5 — Team member GPG key collection | Required per account per new team member | **Eliminated** — no git-secret recipients; operators access secrets via IAM + Vault policy | | M6 — `*.auto.tfvars.secret` encryption | Required per credential per account | Replaced by one `vault kv put` per credential (one-time, central team) | | M10 — LDAP objects in `common/` | Currently blocked for CodeBuild | Unblocked — executor reads LDAP credentials from Vault at build time | diff --git a/docs/decisions/003-vault-cluster-topology.md b/docs/decisions/003-vault-cluster-topology.md new file mode 100644 index 0000000..78fbd0c --- /dev/null +++ b/docs/decisions/003-vault-cluster-topology.md @@ -0,0 +1,134 @@ +# ADR-003: Vault Cluster Topology for SC Automation + +## In Plain Language + +Before we can implement ADR-002 (dynamic AWS credentials from Vault), we need to decide +*which* Vault cluster the SC automation system will talk to, how that cluster is organized, +and how CodeBuild builds will authenticate to it. + +This document records the topology decision: existing shared cluster vs. dedicated cluster, +namespace layout, and the auth method CodeBuild will use to prove its identity to Vault. 
+ +**Status:** Proposed +**Date:** 2026-05-28 +**Depends on:** ADR-002 (`002-vault-aws-secrets-engine.md`) +**Jira:** [CSC-1346](https://jira.it.census.gov/browse/CSC-1346) + +--- + +## Context + +ADR-002 specifies that the CodeBuild executor will authenticate to Vault and request +short-lived AWS credentials from the Vault AWS Secrets Engine. But it deliberately +defers the question of *which* Vault cluster to use. Three viable topologies exist: + +### Option A — Shared Census Vault cluster, dedicated namespace + +Use an existing Census-managed Vault cluster (e.g. the platform Vault in csvd-prod +or a shared non-prod instance). Create a dedicated namespace (`sc-automation/`) so +that all SC automation policies, roles, and secrets engine mounts are isolated from +other tenants. + +**Pros:** +- No new cluster to operate or HA-tune +- Shared cluster is already monitored, patched, and backed up +- Cost is shared across all tenants + +**Cons:** +- Dependency on another team's change-management cadence +- Namespace-level isolation is good but not complete cluster isolation +- Shared cluster outage affects all tenants simultaneously + +### Option B — Dedicated Vault cluster in csvd-dev + +Deploy a standalone Vault cluster (Integrated Storage / Raft, 3-node) in csvd-dev +`us-gov-west-1` specifically for SC automation. 
+ +**Pros:** +- Full operational control; can tune lease TTLs, auth policies, and HA config + without coordinating with other teams +- Complete isolation — a misconfiguration in SC automation cannot affect other workloads +- Can be versioned and upgraded on our own schedule + +**Cons:** +- New operational burden: cluster patching, unseal key rotation, backup scheduling +- Requires 3 EC2 instances (or ECS tasks) and associated IAM/networking +- Higher cost for a single-tenant cluster + +### Option C — Vault on Kubernetes (ECS/EKS sidecar pattern) + +Run Vault as a sidecar container alongside CodeBuild tasks (dev/agent pattern), using +`vault agent` injector to deliver credentials to the build environment. + +**Pros:** No persistent cluster to manage +**Cons:** CodeBuild does not support sidecars natively; requires workaround; substantially +more complex than Options A or B. **Not recommended.** + +--- + +## Auth Method Decision + +Regardless of cluster topology, CodeBuild will authenticate to Vault using the +**AWS IAM auth method** (`auth/aws`). The CodeBuild service role ARN +(`arn:${AWS::Partition}:iam::229685449397:role/sc-automation-codebuild-role`) is +bound to a Vault role. When the executor build starts, `vault login` presents the +current IAM identity (via `GetCallerIdentity`) — no static tokens or secrets are +needed inside the build environment. + +```hcl +# Vault IAM auth role (managed in sc-lambda-ghactions deploy/) +resource "vault_aws_auth_backend_role" "codebuild_executor" { + backend = "aws" + role = "sc-automation-codebuild" + auth_type = "iam" + bound_iam_principal_arns = ["arn:aws-us-gov:iam::229685449397:role/sc-automation-codebuild-role"] + token_ttl = 900 # 15 min — matches max CodeBuild build window + token_policies = ["sc-automation-executor"] +} +``` + +--- + +## Decision + +> **TO BE DECIDED** — this ADR is in Proposed state pending discussion with the +> platform / Vault operations team. 
+ +Questions to answer before closing this ADR: + +1. Is there an existing Census Vault cluster available for non-prod workloads that + the SC automation team can use? What is its SLA? +2. Does the Census Vault team support dedicated namespaces for product teams? +3. What is the blast-radius / approval process for cluster-level changes on a + shared cluster that affect us? +4. Are there cost / account placement constraints that favour one topology? + +**Recommended default (pending discussion): Option A** — shared Census cluster with +a dedicated `sc-automation/` namespace. This avoids new operational burden while +still providing tenant isolation. Revisit if the shared cluster proves too slow to +change or if an outage directly impacts SC automation SLA. + +--- + +## Consequences + +### If Option A (shared cluster, dedicated namespace) + +- Platform team must grant namespace admin rights to the SC automation team +- SC automation `deploy/` Terraform must include Vault provider config pointing at + the shared cluster +- Vault cluster URL and namespace become required Terraform variables + +### If Option B (dedicated cluster in csvd-dev) + +- New Terraform module required to stand up 3-node Raft cluster in csvd-dev +- Unseal key escrow procedure must be documented and tested +- Adds ~$X/month to csvd-dev bill (to be estimated) + +--- + +## Related + +- [ADR-002: Vault AWS Secrets Engine](./002-vault-aws-secrets-engine.md) — upstream decision +- [CSC-1345](https://jira.it.census.gov/browse/CSC-1345) — ADR-002 implementation ticket +- [CSC-1346](https://jira.it.census.gov/browse/CSC-1346) — this topology decision ticket diff --git a/docs/decisions/004-account-baseline-iam-role.md b/docs/decisions/004-account-baseline-iam-role.md new file mode 100644 index 0000000..bdb1fed --- /dev/null +++ b/docs/decisions/004-account-baseline-iam-role.md @@ -0,0 +1,164 @@ +# ADR-004: Account Baseline IAM Role for SC Automation Cross-Account Access + +## In Plain Language + +For our 
automation to run Terraform in a target AWS account, it needs AWS credentials +in that account. Right now the plan is a pre-created IAM role called +`sc-automation-codebuild-role` in every target account that trusts the CodeBuild +service role in csvd-dev. + +This document records what that role must look like, how it gets created at scale +across the org, and the lifecycle rules around updates and removal. + +**Status:** Accepted +**Date:** 2026-05-28 +**Jira:** [CSC-1344](https://jira.it.census.gov/browse/CSC-1344) +**Note:** If ADR-002/ADR-003 are fully implemented (Vault AWS Secrets Engine), the +`sts:AssumeRole` trust in this role is eventually replaced by a Vault-issued +credential. This role definition remains the correct minimum-privilege baseline +regardless of which credential mechanism is used. + +--- + +## Context + +The executor CodeBuild build runs in csvd-dev. To apply Terraform changes in a +target account (e.g. `123456789012-some-team-workload-dev-gov`), it must assume +a role in that account. + +### Current mechanism (static AssumeRole) + +``` +csvd-dev CodeBuild role (229685449397) + └─ sts:AssumeRole ─────────────────────────────────> + sc-automation-codebuild-role (in target account) + └─ trusts 229685449397 CodeBuild role +``` + +### Future mechanism (Vault dynamic credentials — ADR-002) + +``` +csvd-dev CodeBuild → vault login (IAM auth) → Vault AWS Secrets Engine + └─ Vault generates short-lived creds for sc-automation-codebuild-role +``` + +In both cases the **target-account role definition is the same** — only the +mechanism for obtaining credentials to it changes. + +--- + +## Role Specification + +### Role name + +`sc-automation-codebuild-role` + +Consistent name across all accounts enables a single `sts:AssumeRole` permission +in the csvd-dev CodeBuild role without per-account ARN enumeration. 
+ +### Trust policy (static AssumeRole model) + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "AllowSCAutomationCodeBuild", + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws-us-gov:iam::229685449397:role/sc-automation-codebuild-role" + }, + "Action": "sts:AssumeRole", + "Condition": { + "StringEquals": { + "sts:ExternalId": "${account_id}" + } + } + } + ] +} +``` + +The `ExternalId` condition requires the CodeBuild build to pass the target account ID, +preventing confused-deputy attacks across the org. + +### Permissions (minimum viable) + +The role needs enough access for the product workspaces it will run. For the initial +baseline, the following managed policies cover the standard layers: + +| Layer | Managed Policy | +|-------|---------------| +| `infrastructure/` | `arn:aws-us-gov:iam::aws:policy/AdministratorAccess` (scoped later) | +| `common/` | `arn:aws-us-gov:iam::aws:policy/IAMFullAccess` + `PowerUserAccess` | +| `vpc/` | `arn:aws-us-gov:iam::aws:policy/AmazonVPCFullAccess` | + +> **Future hardening:** Replace `AdministratorAccess` with a least-privilege +> customer-managed policy once product workspace IAM requirements are stable. +> Track in a follow-up ADR. + +### Permission boundary (recommended) + +Attach a permission boundary policy that prevents the role from escalating +privileges beyond the SC automation scope: + +```hcl +resource "aws_iam_role" "sc_automation" { + name = "sc-automation-codebuild-role" + assume_role_policy = data.aws_iam_policy_document.trust.json + permissions_boundary = "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:policy/sc-automation-boundary" +} +``` + +--- + +## Rollout Strategy + +Three options for provisioning this role at org scale: + +### Option A — CloudFormation StackSet (recommended) + +Use a `SERVICE_MANAGED` StackSet targeting the whole OU. Each new account +automatically gets the role on vending. Updates are propagated automatically. 
+ +```yaml +# stackset template fragment +Resources: + SCAutomationRole: + Type: AWS::IAM::Role + Properties: + RoleName: sc-automation-codebuild-role + AssumeRolePolicyDocument: ... + ManagedPolicyArns: + - !Sub "arn:${AWS::Partition}:iam::aws:policy/AdministratorAccess" +``` + +### Option B — Terraform module via account-vending pipeline + +Add the role to the account baseline Terraform module that runs at account +creation time (if one exists). Requires a retroactive apply for existing accounts. + +### Option C — Manual per-account creation (not recommended) + +Acceptable only for the initial csvd-dev E2E test. Does not scale. + +**Decision: Option A (StackSet)** for org-wide rollout. Option B as secondary +mechanism if a Terraform account-vending pipeline is in place. + +--- + +## Consequences + +- Every new account joining the target OU automatically gets `sc-automation-codebuild-role` +- The StackSet stack instance must be removed before an account can be decommissioned +- Changes to the role (policy updates, ExternalId rotation) are propagated via + StackSet stack instance updates — average propagation time ~15 min for large OUs + +--- + +## Related + +- [ADR-002: Vault AWS Secrets Engine](./002-vault-aws-secrets-engine.md) +- [ADR-003: Vault Cluster Topology](./003-vault-cluster-topology.md) +- [CSC-1344](https://jira.it.census.gov/browse/CSC-1344) — provisioning ticket +- [CSC-1348](https://jira.it.census.gov/browse/CSC-1348) — OU sharing / StackSet ticket diff --git a/docs/decisions/005-portfolio-org-sharing.md b/docs/decisions/005-portfolio-org-sharing.md new file mode 100644 index 0000000..19d2e60 --- /dev/null +++ b/docs/decisions/005-portfolio-org-sharing.md @@ -0,0 +1,141 @@ +# ADR-005: Service Catalog Portfolio Org-Wide Sharing Strategy + +## In Plain Language + +The sc-lambda-ghactions SC products live in a portfolio in csvd-dev. For teams in +other AWS accounts to provision those products, they need to see the portfolio. 
+ +This document records how the portfolio is shared org-wide — whether through AWS +Resource Access Manager (RAM), CloudFormation StackSets, or a combination — and +what access controls govern who can see and launch products. + +**Status:** Proposed +**Date:** 2026-05-28 +**Jira:** [CSC-1348](https://jira.it.census.gov/browse/CSC-1348) + +--- + +## Context + +AWS Service Catalog supports **portfolio sharing** to other accounts or entire +organizational units (OUs) via: + +1. **AWS RAM portfolio share** — shares the portfolio to an OU or the entire org. + Member accounts see the portfolio and must explicitly **accept** the share, unless + it is auto-accepted at the org level (RAM auto-accept enabled for the OU). + +2. **CloudFormation StackSet (SERVICE_MANAGED)** — uses a CFN StackSet to deploy + an `AWS::ServiceCatalog::AcceptedPortfolioShare` + `AWS::ServiceCatalog::PortfolioPrincipalAssociation` + resource pair into every member account, granting the end-user principal (e.g. an + IAM role) launch permissions automatically. + +3. **Hybrid** — RAM share provides the underlying portfolio visibility; + a StackSet handles the per-account `AcceptedPortfolioShare` + principal + association so users don't have to accept manually. + +### Current state + +`deploy/service_catalog.tf` in this repo creates the portfolio and products in +csvd-dev but does not yet configure cross-account sharing. The only currently +working provisioning path is from within csvd-dev itself. + +--- + +## Decision + +**Use the hybrid approach (RAM share + StackSet for acceptance and principal binding).** + +### Step 1 — RAM portfolio share to the OU + +Add to `deploy/service_catalog.tf`: + +```hcl +resource "aws_servicecatalog_portfolio_share" "org" { + portfolio_id = aws_servicecatalog_portfolio.main.id + type = "ORGANIZATIONAL_UNIT" + principal_id = var.target_ou_arn # e.g.
"arn:aws-us-gov:organizations::<mgmt-account-id>:ou/o-xxxxxxxxxx/ou-xxxx-yyyyyyyy" + share_tag_options = false + wait_for_acceptance = false # StackSet handles acceptance +} +``` + +### Step 2 — StackSet for acceptance + principal association + +Deploy a `SERVICE_MANAGED` StackSet into the target OU from the management account +(or csvd-dev if delegated admin is configured): + +```yaml +Resources: + AcceptShare: + Type: AWS::ServiceCatalog::AcceptedPortfolioShare + Properties: + PortfolioId: !Ref PortfolioId # passed as StackSet parameter + + LaunchPermission: + Type: AWS::ServiceCatalog::PortfolioPrincipalAssociation + DependsOn: AcceptShare + Properties: + PortfolioId: !Ref PortfolioId + PrincipalARN: !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/SC-ProductLauncher" + PrincipalType: IAM +``` + +This gives any account in the OU automatic access to the portfolio without a +platform engineer having to accept shares manually. + +### Launch constraint + +Each product in the portfolio has a `LAUNCH` constraint that specifies the IAM +role used to provision the product in the member account. This role is the Lambda +`ServiceToken` principal in csvd-dev — no per-account launch role is needed since +the Lambda runs centrally. + +--- + +## Access Control + +| Who | Access | +|-----|--------| +| Any account in the target OU | Can see and provision products | +| Principal: `SC-ProductLauncher` IAM role | Has launch permission in their account | +| csvd-dev only | Portfolio admin / product version management | + +### Tag-based visibility (future) + +If products should only be visible to specific teams within an account, AWS Service +Catalog Tag Options can filter the portfolio view. Not implemented in initial rollout. + +--- + +## StackSet Placement + +The StackSet must be deployed from an account with **delegated administrator** rights +for Service Catalog (or from the management account). If csvd-dev is the delegated +admin for Service Catalog in this org, the StackSet can be managed from +`deploy/service_catalog.tf`.
+ +Confirm delegated admin status: +```bash +aws organizations list-delegated-administrators \ + --service-principal servicecatalog.amazonaws.com \ + --region us-gov-west-1 +``` + +--- + +## Consequences + +- New accounts joining the target OU automatically receive portfolio access (via + StackSet auto-deployment) +- The `target_ou_arn` variable must be added to `deploy/variables.tf` +- A separate CFN StackSet template file will live in `service-catalog/portfolio-share-stackset.yaml` +- If the OU changes (accounts move), RAM share scoping does not need to change — + the StackSet handles new instances automatically + +--- + +## Related + +- [ADR-004: Account Baseline IAM Role](./004-account-baseline-iam-role.md) +- [CSC-1344](https://jira.it.census.gov/browse/CSC-1344) — baseline IAM role (must exist before members can launch products) +- [CSC-1348](https://jira.it.census.gov/browse/CSC-1348) — implementation ticket for this decision From 3834d9ee3cfea825122ae4cdee3c7c536b6d0fb4 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Tue, 2 Jun 2026 16:07:08 -0400 Subject: [PATCH 26/27] feat: add CROSS_ACCOUNT_ROLE for Vault-based cross-account credential flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - buildspec-executor.yml / buildspec.yml: default CROSS_ACCOUNT_ROLE=r-inf-terraform; replace hardcoded role name with ${CROSS_ACCOUNT_ROLE} in sts:AssumeRole block (interim scaffolding — will be replaced by vault read in CSC-1345) - deploy/codebuild.tf: add CROSS_ACCOUNT_ROLE env var to executor project - deploy/iam.tf: StsAssumeRoleCrossAccount allows r-inf-terraform, r-inf-terraform-eks, sc-automation-codebuild-role (backwards compat) - lambda/app.py: add TfRunRequest.cross_account_role field (default: r-inf-terraform); pass CROSS_ACCOUNT_ROLE in CodeBuild env overrides for apply action - docs/decisions/001-webhook-auto-apply.md: add cross_account_role to schema table - design-docs/CHECKPOINT.md: update with Vault pivot and 
CSC-1344 blocked status Jira: CSC-1344 (Blocked on CSC-1345) --- buildspec-executor.yml | 12 ++++++++---- buildspec.yml | 11 +++++++---- deploy/codebuild.tf | 4 ++++ deploy/iam.tf | 11 ++++++++--- design-docs/CHECKPOINT.md | 12 ++++++++++++ docs/decisions/001-webhook-auto-apply.md | 1 + lambda/app.py | 16 +++++++++++----- 7 files changed, 51 insertions(+), 16 deletions(-) diff --git a/buildspec-executor.yml b/buildspec-executor.yml index 49e1d50..8973fa3 100644 --- a/buildspec-executor.yml +++ b/buildspec-executor.yml @@ -15,8 +15,10 @@ version: 0.2 # GITHUB_TOKEN - GHE PAT (PLAINTEXT, value from Secrets Manager) # # Optional env-var overrides: -# TARGET_ACCOUNT_ID - AWS account ID to assume sc-automation-codebuild-role in +# TARGET_ACCOUNT_ID - AWS account ID to assume the cross-account role in # (default: empty = run with CodeBuild role, csvd-dev only) +# CROSS_ACCOUNT_ROLE - IAM role name to assume in TARGET_ACCOUNT_ID +# (default: r-inf-terraform) # TF_RUN_START_TAG - tf-run.data TAG label to start from (default: empty = from top) # DRY_RUN - "true" = tf-run plan only, no apply (default: "false") # --------------------------------------------------------------------------- @@ -32,6 +34,7 @@ env: NO_PROXY: "github.e.it.census.gov,169.254.169.254,169.254.170.2" # Per-build defaults (overridden via environmentVariablesOverride in Lambda) TARGET_ACCOUNT_ID: "" + CROSS_ACCOUNT_ROLE: "r-inf-terraform" TF_RUN_START_TAG: "" DRY_RUN: "false" @@ -99,12 +102,13 @@ phases: - echo "Applying from $(git rev-parse --short HEAD) on main" # --- Assume cross-account role (if TARGET_ACCOUNT_ID is set) --- - # The role sc-automation-codebuild-role must exist in the target account and - # trust the CodeBuild IAM role from the central account (csvd-dev). + # The role (default: r-inf-terraform) must exist in the target account and + # trust arn:...:iam::229685449397:role/tf-run-executor-codebuild. + # Override CROSS_ACCOUNT_ROLE per-build to use a different role name. 
- | if [ -n "${TARGET_ACCOUNT_ID}" ]; then PARTITION=$(aws sts get-caller-identity --query Arn --output text | cut -d: -f2) - ROLE_ARN="arn:${PARTITION}:iam::${TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role" + ROLE_ARN="arn:${PARTITION}:iam::${TARGET_ACCOUNT_ID}:role/${CROSS_ACCOUNT_ROLE}" echo "Assuming cross-account role: ${ROLE_ARN}" CREDS=$(aws sts assume-role \ --role-arn "${ROLE_ARN}" \ diff --git a/buildspec.yml b/buildspec.yml index f3029ba..9a64d7a 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -19,6 +19,8 @@ version: 0.2 # TARGET_ACCOUNT_ID - AWS account ID to assume role in before running tf-run # (default: empty = run with CodeBuild's own credentials, # i.e. csvd-dev. Set this when targeting a different account.) +# CROSS_ACCOUNT_ROLE - IAM role name to assume in TARGET_ACCOUNT_ID +# (default: r-inf-terraform) # --------------------------------------------------------------------------- env: @@ -41,6 +43,7 @@ env: TEMPLATE_VARS: "{}" EXTRA_FILES: "{}" TARGET_ACCOUNT_ID: "" + CROSS_ACCOUNT_ROLE: "r-inf-terraform" phases: install: @@ -167,13 +170,13 @@ phases: # --- Assume cross-account role (if TARGET_ACCOUNT_ID is set) --- # CodeBuild runs in csvd-dev by default. To run tf-run apply against resources - # in a different AWS account, set TARGET_ACCOUNT_ID. The role - # sc-automation-codebuild-role must exist in that account and trust the - # CodeBuild IAM role from csvd-dev. + # in a different AWS account, set TARGET_ACCOUNT_ID. The role (default: + # r-inf-terraform) must exist in that account and trust the CodeBuild IAM + # role from csvd-dev. Override CROSS_ACCOUNT_ROLE per-build if needed. 
- | if [ -n "${TARGET_ACCOUNT_ID}" ]; then PARTITION=$(aws sts get-caller-identity --query Arn --output text | cut -d: -f2) - ROLE_ARN="arn:${PARTITION}:iam::${TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role" + ROLE_ARN="arn:${PARTITION}:iam::${TARGET_ACCOUNT_ID}:role/${CROSS_ACCOUNT_ROLE}" echo "Assuming cross-account role: ${ROLE_ARN}" CREDS=$(aws sts assume-role \ --role-arn "${ROLE_ARN}" \ diff --git a/deploy/codebuild.tf b/deploy/codebuild.tf index 74d18d5..1f4fcf3 100644 --- a/deploy/codebuild.tf +++ b/deploy/codebuild.tf @@ -172,6 +172,10 @@ resource "aws_codebuild_project" "tf_run_executor" { name = "TARGET_ACCOUNT_ID" value = "" } + environment_variable { + name = "CROSS_ACCOUNT_ROLE" + value = "r-inf-terraform" + } environment_variable { name = "TF_RUN_START_TAG" value = "" diff --git a/deploy/iam.tf b/deploy/iam.tf index 7b0df34..9eea4ed 100644 --- a/deploy/iam.tf +++ b/deploy/iam.tf @@ -112,9 +112,11 @@ data "aws_iam_policy_document" "codebuild_exec" { ] } - # Secrets Manager: read the GHE PAT at runtime (GITHUB_TOKEN env var) - # Note: CodeBuild uses PARAMETER_STORE for the token; this covers the SM read - # used during Terraform apply of source credentials (aws_codebuild_source_credential). + # Secrets Manager: read the GHE PAT at runtime. + # Both CodeBuild projects define GITHUB_TOKEN as type=SECRETS_MANAGER pointing to this + # secret. CodeBuild fetches the current value fresh at each build start using this + # permission, so the token never appears in StartBuild CloudTrail logs or BatchGetBuilds + # responses. This also covers the SM read in aws_codebuild_source_credential. statement { sid = "SecretsManagerReadGheToken" effect = "Allow" @@ -163,11 +165,14 @@ data "aws_iam_policy_document" "codebuild_exec" { # STS: allow executor to assume a cross-account role in target accounts # Only the executor needs this; proposer only needs GHE access. + # Default role is r-inf-terraform; can be overridden per-build via CROSS_ACCOUNT_ROLE. 
statement { sid = "StsAssumeRoleCrossAccount" effect = "Allow" actions = ["sts:AssumeRole"] resources = [ + "arn:${data.aws_partition.current.partition}:iam::*:role/r-inf-terraform", + "arn:${data.aws_partition.current.partition}:iam::*:role/r-inf-terraform-eks", "arn:${data.aws_partition.current.partition}:iam::*:role/sc-automation-codebuild-role", ] } diff --git a/design-docs/CHECKPOINT.md b/design-docs/CHECKPOINT.md index bfddd14..1d61dd4 100644 --- a/design-docs/CHECKPOINT.md +++ b/design-docs/CHECKPOINT.md @@ -10,6 +10,18 @@ Parent: **[CSC-1341](https://jira.it.census.gov/browse/CSC-1341)** — [sc-lambda-ghactions] Design & implement next-gen SC automation system +**Completed work (In Review — GHE PR #1 open):** + +| Key | Summary | Priority | Status | ADR | +|-----|---------|----------|--------|-----| +| [CSC-1351](https://jira.it.census.gov/browse/CSC-1351) | Phase 1: CodeBuild runner + buildspec | High | In Review | — | +| [CSC-1352](https://jira.it.census.gov/browse/CSC-1352) | Phase 2: Lambda CFN Custom Resource handler | High | In Review | — | +| [CSC-1353](https://jira.it.census.gov/browse/CSC-1353) | Phase 3: Service Catalog product registration | High | In Review | — | +| [CSC-1354](https://jira.it.census.gov/browse/CSC-1354) | Architecture design, .sc-automation.yml schema, and deploy Terraform | High | In Review | — | +| [CSC-1355](https://jira.it.census.gov/browse/CSC-1355) | ADR-001: Webhook auto-apply on merge accepted | High | In Review | [ADR-001](../docs/decisions/001-webhook-auto-apply.md) | + +**Open / remaining work:** + | Key | Summary | Priority | Status | ADR | |-----|---------|----------|--------|-----| | [CSC-1342](https://jira.it.census.gov/browse/CSC-1342) | Build and push Lambda container image to ECR (via packer-pipeline) | High | To Do | — | diff --git a/docs/decisions/001-webhook-auto-apply.md b/docs/decisions/001-webhook-auto-apply.md index 3f86962..f445862 100644 --- a/docs/decisions/001-webhook-auto-apply.md +++ 
b/docs/decisions/001-webhook-auto-apply.md @@ -73,6 +73,7 @@ Fields per entry: | `layer` | yes | `common`, `infrastructure`, or `vpc` | | `region_dir` | yes | `east`, `west`, or `global` | | `target_account_id` | no | 12-digit AWS account ID; omit to run in csvd-dev | +| `cross_account_role` | no | IAM role name to assume in `target_account_id` (default: `r-inf-terraform`) | | `tf_run_start_tag` | no | tf-run TAG label to start from | | `dry_run` | no | `true` to plan only (default: `false`) | diff --git a/lambda/app.py b/lambda/app.py index decab1d..be15e59 100644 --- a/lambda/app.py +++ b/lambda/app.py @@ -55,7 +55,8 @@ class TfRunRequest(BaseModel): git_branch: str = Field(default="propose/sc-automation", description="Branch to commit and open PR from (propose only)") # --- Executor fields (action=apply only) --- - target_account_id: str = Field(default="", description="AWS account ID to assume sc-automation-codebuild-role in before running tf-run; empty = run with CodeBuild role (csvd-dev)") + target_account_id: str = Field(default="", description="AWS account ID to assume cross_account_role in before running tf-run; empty = run with CodeBuild role (csvd-dev)") + cross_account_role: str = Field(default="r-inf-terraform", description="IAM role name to assume in target_account_id (default: r-inf-terraform)") tf_run_start_tag: str = Field(default="", description="tf-run.data TAG label to start from; empty = from beginning (apply only)") dry_run: bool = Field(default=False, description="true = tf-run plan only, no apply (apply action only)") @@ -139,11 +140,17 @@ def send_cfn_response( def start_codebuild_build( tf_req: TfRunRequest, - github_token: str, request_id: str, ) -> str: """Start the proposer or executor CodeBuild project with per-build env-var overrides. + GITHUB_TOKEN is intentionally omitted here — both CodeBuild projects define it + as type=SECRETS_MANAGER at the project level. 
The CodeBuild service role has + secretsmanager:GetSecretValue for that secret, so CodeBuild fetches the current + value fresh at each build start without the token ever appearing in CloudTrail + (StartBuild) or BatchGetBuilds API responses. Passing it as PLAINTEXT here would + override that project-level definition and expose the token in both. + Returns the CodeBuild build ID. """ if tf_req.action == "propose": @@ -156,7 +163,6 @@ def start_codebuild_build( {"name": "TEMPLATE_REPO", "value": tf_req.template_repo, "type": "PLAINTEXT"}, {"name": "TEMPLATE_VARS", "value": json.dumps(tf_req.template_vars), "type": "PLAINTEXT"}, {"name": "EXTRA_FILES", "value": json.dumps(tf_req.extra_files), "type": "PLAINTEXT"}, - {"name": "GITHUB_TOKEN", "value": github_token, "type": "PLAINTEXT"}, ] else: # apply project_name = os.environ.get("EXECUTOR_PROJECT_NAME", "tf-run-executor") @@ -165,9 +171,9 @@ def start_codebuild_build( {"name": "LAYER", "value": tf_req.layer, "type": "PLAINTEXT"}, {"name": "REGION_DIR", "value": tf_req.region_dir, "type": "PLAINTEXT"}, {"name": "TARGET_ACCOUNT_ID", "value": tf_req.target_account_id, "type": "PLAINTEXT"}, + {"name": "CROSS_ACCOUNT_ROLE", "value": tf_req.cross_account_role, "type": "PLAINTEXT"}, {"name": "TF_RUN_START_TAG", "value": tf_req.tf_run_start_tag, "type": "PLAINTEXT"}, {"name": "DRY_RUN", "value": str(tf_req.dry_run).lower(), "type": "PLAINTEXT"}, - {"name": "GITHUB_TOKEN", "value": github_token, "type": "PLAINTEXT"}, ] region = os.environ.get("AWS_REGION", os.environ.get("AWS_DEFAULT_REGION", "us-gov-west-1")) @@ -320,7 +326,7 @@ def lambda_handler(event: dict, context) -> dict: logger.info(f"[{request_id}] Fetching GitHub token from secret: {github_token_secret}") github_token = get_secret(github_token_secret) - build_id = start_codebuild_build(tf_req, github_token, request_id) + build_id = start_codebuild_build(tf_req, request_id) # Poll — leave 60s buffer before Lambda timeout for cfn-response PUT lambda_timeout_s = 
context.get_remaining_time_in_millis() / 1000 From 4b3207274b976ca0ae863c9ceaf1e33c3cf60462 Mon Sep 17 00:00:00 2001 From: Dave Arnold Date: Tue, 2 Jun 2026 16:21:57 -0400 Subject: [PATCH 27/27] docs: add Vault AWS Secrets Engine sales presentation Internal deck covering problem statement, architecture, security benefits (NIST 800-53), government/compliance considerations (BSL 1.1, OpenBao, FIPS 140-2), phased roadmap, and call to action. Jira: CSC-1345 CSC-1346 --- docs/vault-aws-secrets-engine.md | 464 +++++++++++++++++++++++++++++++ 1 file changed, 464 insertions(+) create mode 100644 docs/vault-aws-secrets-engine.md diff --git a/docs/vault-aws-secrets-engine.md b/docs/vault-aws-secrets-engine.md new file mode 100644 index 0000000..af67caf --- /dev/null +++ b/docs/vault-aws-secrets-engine.md @@ -0,0 +1,464 @@ +# HashiCorp Vault for Cross-Account Automation at Census +**Audience:** CSVD Engineering / sc-lambda-ghactions Stakeholders +**Date:** June 2026 +**Author:** David Arnold (`arnol377`) +**Related Jira:** [CSC-1345](https://jira.it.census.gov/browse/CSC-1345) · [CSC-1346](https://jira.it.census.gov/browse/CSC-1346) + +--- + +## 1. The Problem + +The `sc-lambda-ghactions` system automates AWS Service Catalog provisioning by running +`tf-run apply` via CodeBuild. To do that across multiple AWS accounts, CodeBuild needs +temporary credentials for each target account. + +**Current state:** no cross-account credential mechanism exists. + +**The naive fix (and why we rejected it):** +Add a trust policy to `r-inf-terraform` in every account that allows the CodeBuild IAM +role from `csvd-dev` to assume it. This requires: + +- A change to the management account StackSet `allow_assume_role_tf` parameter +- Trust policy propagation to **every org account** — ~450+ and growing +- Each new account onboarded requires the trust to already be in place +- Long-lived STS sessions (up to 1 hour) with no per-use audit trail + +**The right fix:** Vault AWS Secrets Engine. 
+ +--- + +## 2. What Is HashiCorp Vault? + +Vault is a secrets management platform that controls access to tokens, passwords, +certificates, and cloud credentials. Its core value propositions are: + +| Capability | What It Means | +|---|---| +| **Dynamic Secrets** | Credentials generated on demand, expire automatically | +| **Centralized Policy** | One policy engine controls access across all secret types | +| **Audit Log** | Every read, write, and auth event logged with identity + metadata | +| **Identity-Based Access** | "Who are you?" not "What password do you know?" | +| **Encryption as a Service** | Encrypt/decrypt data without exposing keys | + +For our immediate use case, we care about two specific features: +- **AWS Secrets Engine** — generates dynamic IAM credentials per target account +- **IAM Auth Method** — lets CodeBuild authenticate using its own AWS IAM identity (no static creds) + +--- + +## 3. How It Solves the Cross-Account Problem + +``` +CodeBuild (csvd-dev, tf-run-executor-codebuild role) + │ + │ 1. "I am tf-run-executor-codebuild in 229685449397" + │ (signed by AWS STS — no password, no token) + ▼ +Vault Server (IAM Auth Method) + │ + │ 2. Validates identity via AWS STS GetCallerIdentity + │ 3. Checks policy: "executor role may request adsd-dev creds" + │ 4. Generates short-lived IAM key pair for target account + ▼ +CodeBuild receives: + AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY (TTL: 15 min) + │ + ▼ +tf-run apply runs in target account +Credentials expire automatically — nothing to rotate, nothing to leak +``` + +### What changes vs. 
the StackSet approach + +| Concern | StackSet Trust | Vault | +|---|---|---| +| Per-account setup | Trust policy in every account | Vault AWS backend role per account | +| New account onboarding | StackSet propagation (slow, blast radius) | Add one Vault role (seconds) | +| Credential lifetime | STS session: up to 1 hour | Configurable: 15 min recommended | +| Audit trail | CloudTrail (account-level) | Vault audit log (every access, centralized) | +| Revocation | Cannot revoke active STS session | Vault can revoke any lease instantly | +| Policy changes | StackSet → CloudFormation → IAM (slow) | `vault policy write` (instant) | + +--- + +## 4. Security Benefits + +### 4.1 Dynamic Credentials — Nothing to Rotate + +Static IAM access keys are a top attack vector (OWASP A02: Cryptographic Failures / +misconfigured credentials). With Vault: + +- No long-lived keys stored anywhere — not in Parameter Store, not in environment variables +- Every invocation gets a **unique, time-limited key pair** +- Expiry is enforced by Vault, not by developer discipline +- Compromise of one set of credentials is contained to a 15-minute window and one build job + +### 4.2 Every Access Is Audited + +Vault's audit device logs every auth event, secret read, and policy check with: + +```json +{ + "time": "2026-06-02T14:32:01Z", + "type": "response", + "auth": { + "client_token": "...", + "accessor": "...", + "display_name": "aws-tf-run-executor-codebuild", + "policies": ["default", "sc-automation-executor"], + "metadata": { + "account_id": "229685449397", + "iam_principal_arn": "arn:aws-us-gov:iam::229685449397:role/tf-run-executor-codebuild" + } + }, + "request": { + "path": "aws/creds/adsd-dev", + "operation": "read" + } +} +``` + +This gives you a **complete, tamper-evident record** of which automation job +requested credentials for which account, at what time — satisfying NIST 800-53 +AU-2, AU-3, AU-9 audit requirements. 
+ +### 4.3 Principle of Least Privilege — Enforced Centrally + +Vault policies are written in HCL and version-controlled. Access to any given +account's credentials requires an explicit policy grant: + +```hcl +# Only allow executor to request creds for accounts it's authorized for +path "aws/creds/adsd-*" { + capabilities = ["read"] +} + +path "aws/creds/csvd-*" { + capabilities = ["read"] +} + +# Deny everything else explicitly +path "aws/*" { + capabilities = ["deny"] +} +``` + +No IAM policy sprawl, no StackSet blast radius. One file, version-controlled, +reviewed like any other code change. + +### 4.4 Break-Glass Revocation + +If a CodeBuild build is compromised mid-run, a Vault admin can: + +```bash +vault lease revoke -prefix aws/creds/adsd-dev +``` + +All active credentials for that backend are instantly invalidated — faster than +rotating an IAM key pair manually. + +### 4.5 Alignment with NIST 800-53 Controls + +| NIST Control | Requirement | Vault Feature | +|---|---|---| +| **IA-5** | Authenticator Management — no long-lived passwords | Dynamic secrets, auto-expiry | +| **AC-3** | Access Enforcement | Policy engine per path | +| **AC-17** | Remote Access | IAM Auth — cryptographic identity | +| **AU-2/3/9** | Audit Events, Content, Protection | Audit devices, tamper-evident log | +| **SC-12** | Cryptographic Key Establishment | Transit Secrets Engine (if needed later) | +| **CM-6** | Configuration Settings | Policies in version control | + +--- + +## 5. Automation Benefits + +### 5.1 Zero-Touch Account Onboarding + +When a new AWS account is bootstrapped, the only Vault step is: + +```bash +vault write aws/roles/new-account-name \ + credential_type=iam_user \ + policy_arns="arn:aws-us-gov:iam::aws:policy/AdministratorAccess" + +vault policy write sc-executor-new-account - << EOF +path "aws/creds/new-account-name" { capabilities = ["read"] } +EOF +``` + +Two commands. No StackSet, no CFN stack, no trust policy update. 
The executor +immediately has the ability to provision into that account. + +### 5.2 CodeBuild Integration Is Simple + +In `buildspec-executor.yml`, the existing `sts:AssumeRole` block becomes: + +```yaml +pre_build: + commands: + - | + if [ -n "$TARGET_ACCOUNT_ID" ]; then + # Authenticate to Vault using this CodeBuild job's IAM identity + VAULT_TOKEN=$(vault write -field=token auth/aws/login \ + role="sc-automation-executor" \ + iam_http_request_method="POST" \ + iam_request_url="$(base64 <<< 'https://sts.us-gov-west-1.amazonaws.com/')" \ + iam_request_body="$(base64 <<< 'Action=GetCallerIdentity&Version=2011-06-15')" \ + iam_request_headers="$(vault-aws-auth-header)") + + # Request short-lived credentials for the target account + CREDS=$(vault read -format=json aws/creds/${CROSS_ACCOUNT_ROLE}) + export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.data.access_key') + export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.data.secret_key') + export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.data.security_token') + fi +``` + +No long-lived secrets are stored in environment variables (the exported credentials are short-lived and exist only for this build). No secrets in SSM Parameter Store. +The only trust relationship needed is between CodeBuild's IAM role and Vault's +IAM auth endpoint — which is a single Vault config entry, not a per-account change. + +### 5.3 The `cross_account_role` Field Is Already Wired + +The `sc-lambda-ghactions` system already passes `CROSS_ACCOUNT_ROLE` from the +`.sc-automation.yml` file through to CodeBuild. Vault just becomes the consumer +of that field name — no CFN template changes, no Lambda changes needed. + +--- + +## 6. Government / Compliance Considerations + +> **Important:** This section is critical for Census. Read before approving deployment. + +### 6.1 Vault OSS License: Business Source License (BSL 1.1) + +In August 2023, HashiCorp changed Vault (and Terraform) from MPL 2.0 to +**Business Source License (BSL) 1.1**.
+ +**Key BSL terms:** +- Free to use for **any internal purpose**, including government automation +- Restriction applies only to building a **competing product** (a commercial secrets + management service sold to others) +- After **4 years**, the code converts to MPL 2.0 automatically +- No per-seat or per-server fees for self-hosted OSS usage + +**For Census Bureau use:** ✅ BSL is acceptable. Census is not building a competing +commercial secrets management product. Using Vault to automate internal AWS +infrastructure is squarely within permitted BSL use. + +**However:** Legal should formally bless this before production deployment, as BSL +is a relatively new license and some agencies have blanket policies against non-OSI-approved +licenses (BSL is **not** OSI-approved). + +### 6.2 OpenBao — The True OSS Alternative + +If BSL creates a legal / policy blocker, **OpenBao** is a drop-in replacement: + +| | Vault OSS (BSL) | OpenBao | +|---|---|---| +| License | BSL 1.1 (not OSI-approved) | **MPL 2.0** (OSI-approved) | +| Fork basis | — | Vault 1.14.x | +| API compatibility | — | 100% compatible | +| Governance | IBM/HashiCorp | Linux Foundation | +| FIPS build | Enterprise only | Community FIPS build available | +| Support | HashiCorp Enterprise contract | Community + vendors | + +OpenBao is the recommended path if legal flags BSL. The implementation is identical — +same API, same SDK, same `vault` CLI commands. + +### 6.3 FIPS 140-2 / 140-3 Requirement + +Federal systems processing sensitive data must use **FIPS 140-2 validated cryptographic +modules** (NIST SP 800-131A, OMB M-19-17). 
+ +| Build | FIPS Status | +|---|---| +| Vault OSS | ❌ No FIPS-validated modules | +| Vault Enterprise + FIPS build | ✅ FIPS 140-2 validated (NSS/BoringCrypto) | +| OpenBao (FIPS build) | ⚠️ FIPS mode via Go FIPS fork (not a FIPS 140-2 validated module) | +| OpenBao + BoringCrypto | 🔄 Community working on validated build | + +**Recommendation:** +- **Non-production / dev:** Vault OSS or OpenBao standard build is fine +- **Production ATO:** Vault Enterprise with FIPS build, OR work with your ISSO to + determine if OpenBao's BoringCrypto build satisfies the ATO boundary + +### 6.4 FedRAMP + +**FedRAMP is for cloud service providers (CSPs), not agencies.** +Census Bureau does not need Vault itself to be FedRAMP authorized. You need: + +1. Vault to run on **FedRAMP-authorized infrastructure** → ✅ AWS GovCloud (us-gov-west-1) + is FedRAMP High authorized +2. Vault to be included in the **system boundary** of an agency ATO +3. The ISSO and AO to authorize Vault as a software component under FISMA + +**HCP Vault Dedicated** (HashiCorp's cloud-hosted Vault) does hold FedRAMP Moderate +authorization — but that is a separate product and would require routing traffic to +HashiCorp's infrastructure, which may not be acceptable for GovCloud workloads. + +**Recommended path:** Self-hosted Vault OSS/Enterprise on AWS GovCloud, included in +the existing Census ATO boundary. This is the same pattern used by other FISMA-High +federal agencies running Vault on GovCloud.
+ +### 6.5 Compliance Summary + +| Requirement | Vault OSS | Vault Enterprise | OpenBao | Notes | +|---|---|---|---|---| +| BSL license | ✅ internal use OK | ✅ | ✅ MPL 2.0 | Legal sign-off needed for BSL | +| FIPS 140-2 | ❌ | ✅ FIPS build | 🔄 in progress | Required for production ATO | +| FedRAMP (self-hosted) | ✅ via agency ATO | ✅ via agency ATO | ✅ | Not a Vault property — agency ATO | +| AWS GovCloud compatible | ✅ | ✅ | ✅ | Runs on any compute | +| NIST 800-53 audit controls | ✅ audit log | ✅ | ✅ | All builds have audit devices | +| No long-lived credentials | ✅ | ✅ | ✅ | Core Vault capability | + +--- + +## 7. Deployment Architecture + +``` +AWS GovCloud (us-gov-west-1) +└── VPC: csvd-dev-gov + + ┌─────────────────────────────────────┐ + │ ECS Fargate / EC2 (TBD: CSC-1346) │ + │ │ + │ Vault Server (HA cluster) │ + │ ├── Backend: S3 (encrypted) │ + │ ├── HA: DynamoDB lock │ + │ ├── Unseal: AWS KMS auto-unseal │ + │ ├── Audit: CloudWatch Logs │ + │ └── TLS: ACM Private CA │ + └──────────────────┬──────────────────┘ + │ + ┌──────────▼──────────┐ + │ Vault Backends │ + ├─────────────────────┤ + │ aws/ ← dynamic │ + │ roles/adsd-dev │ → IAM credentials for 015325649777 + │ roles/csvd-dev │ → IAM credentials for 229685449397 + │ roles/ditd-prod │ → IAM credentials for ... + │ ... │ + ├─────────────────────┤ + │ auth/aws/ │ + │ roles/sc-executor│ → trusts tf-run-executor-codebuild + └─────────────────────┘ + │ + ┌──────────▼──────────────────────────────┐ + │ CodeBuild: tf-run-executor │ + │ (229685449397, us-gov-west-1) │ + │ │ + │ 1. vault login (IAM auth) │ + │ 2. vault read aws/creds/${ROLE} │ + │ 3. 
tf-run apply with dynamic creds │ + └─────────────────────────────────────────┘ +``` + +**Cluster topology is an open question (CSC-1346).** Options: +- **Single-node ECS Fargate** — simplest, lowest cost, acceptable for dev/non-prod +- **3-node ECS Fargate HA** — recommended for production +- **EC2 Auto Scaling Group** — most resilient, more ops overhead +- **HCP Vault Dedicated (GovCloud)** — managed, FedRAMP Moderate, but cost and + network routing to HashiCorp infra needs evaluation + +--- + +## 8. What We Are NOT Proposing + +To keep scope realistic: + +- ❌ Replacing AWS Secrets Manager for application secrets +- ❌ PKI / certificate management (yet) +- ❌ Database credentials (yet) +- ❌ Transit encryption as a service (yet) +- ❌ Org-wide Vault rollout — CSVD first, expand after buy-in + +The initial scope is **one thing:** dynamic AWS credentials for the `sc-lambda-ghactions` +executor. Everything else is future potential. + +--- + +## 9. Risks and Mitigations + +| Risk | Likelihood | Impact | Mitigation | +|---|---|---|---| +| Vault cluster goes down, blocking deployments | Medium | High | HA cluster + runbook; circuit-breaker in buildspec | +| ISSO does not authorize Vault for ATO boundary | Medium | High | Engage ISSO early; start with dev account only | +| BSL license rejected by legal | Low | Medium | Switch to OpenBao (same API, MPL 2.0) | +| Team unfamiliar with Vault ops | Medium | Medium | Start small; document runbooks; CSC-1346 topology decision | +| KMS auto-unseal key deletion | Very Low | Critical | KMS key deletion protection enabled; backup unseal keys in SSM | + +--- + +## 10. 
Recommended Path Forward + +### Phase 1 — Proof of Concept (2 weeks) +- [ ] **CSC-1346** — Decide cluster topology (recommendation: single-node Fargate for PoC) +- [ ] Deploy Vault OSS in `csvd-dev` dev environment +- [ ] Configure AWS IAM auth + one AWS backend role for `csvd-dev` account +- [ ] Wire `buildspec-executor.yml` to use `vault read` instead of `sts:AssumeRole` +- [ ] Demo to CSVD stakeholders + +### Phase 2 — ISSO Engagement + ATO Review (parallel) +- [ ] Work with Census ISSO to add Vault as a component in the ATO boundary +- [ ] Assess FIPS 140-2 requirement — Vault Enterprise vs OpenBao FIPS build +- [ ] Legal review of BSL 1.1 for internal government use + +### Phase 3 — Production Hardening (post-ATO) +- [ ] HA cluster (3-node Fargate) +- [ ] KMS auto-unseal in production +- [ ] CloudWatch audit log forwarding +- [ ] Vault policies for all target accounts +- [ ] **CSC-1344** unblocked → E2E test (**CSC-1343**) + +### Phase 4 — Expansion (post buy-in from Manny) +- [ ] Onboard other teams (adsd, ditd, ent) — one Vault role per account +- [ ] Standardize in account bootstrapping runbook + +--- + +## 11. Call to Action + +| Who | Ask | +|---|---| +| **Manny** | Executive buy-in to invest in Vault as org-wide credential platform | +| **CSVD team** (`badra001`, `dwara001`, `pubba001`, `kalep001`, `alade001`) | Review this proposal; join CSC-1345/CSC-1346 discussion | +| **Census ISSO** | Early engagement on ATO boundary inclusion | +| **Census Legal** | BSL 1.1 license review (or OpenBao as fallback) | +| **`arnol377`** | CSC-1346 topology decision → PoC deployment | + +--- + +## Appendix A — Quick Reference + +```bash +# How CodeBuild authenticates (IAM auth) +vault write auth/aws/login \ + role="sc-automation-executor" \ + iam_http_request_method=POST \ + iam_request_url=... \ + iam_request_body=... \ + iam_request_headers=... 
+ +# How executor gets creds for a target account +vault read aws/creds/adsd-dev +# Returns: access_key, secret_key, security_token (TTL: 15m) + +# Admin: add a new account +vault write aws/roles/new-account \ + credential_type=assumed_role \ + role_arns="arn:aws-us-gov:iam::ACCOUNT_ID:role/r-inf-terraform" + +# Admin: revoke all active leases for an account (NOTE: assumed_role/STS credentials already issued cannot be invalidated in AWS by this; they lapse at their TTL) +vault lease revoke -prefix aws/creds/adsd-dev +``` + +## Appendix B — Links + +- HashiCorp Vault docs: https://developer.hashicorp.com/vault/docs +- Vault AWS Secrets Engine: https://developer.hashicorp.com/vault/docs/secrets/aws +- Vault AWS Auth Method: https://developer.hashicorp.com/vault/docs/auth/aws +- OpenBao project: https://openbao.org +- BSL 1.1 full text: https://www.hashicorp.com/bsl +- Jira CSC-1345: https://jira.it.census.gov/browse/CSC-1345 +- Jira CSC-1346: https://jira.it.census.gov/browse/CSC-1346