diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index d87774f..67936ff 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -8,21 +8,34 @@ system at Census. The architecture is: ``` SC Console (user fills product form) └─> CFN Stack (Custom::* resource) - └─> Lambda (centralized in csvd-dev, 229685449397, us-gov-west-1) + └─> Lambda tf-run-executor-trigger (csvd-dev, 229685449397, us-gov-west-1) ├─> Validates inputs (Pydantic v2 models) ├─> Fetches GHE token from Secrets Manager - ├─> POSTs repository_dispatch to target repo on GHE - └─> Polls GHA workflow run → returns repo URL + PR URL to CFN - -GitHub Enterprise (github.e.it.census.gov) - └─> GHA workflow (repository_dispatch event) - ├─> Clones the target account repo - ├─> Renders HCL/YAML files from templates - └─> Commits + opens PR (repo-init → main) + ├─> Starts CodeBuild: tf-run-proposer + └─> Polls CodeBuild → returns PR URL + repo URL to CFN + +CodeBuild: tf-run-proposer (csvd-dev) + └─> Clones target account repo + └─> Renders HCL/YAML files from template repo (Jinja2) + └─> Commits rendered files → opens PR (propose/sc-automation → main) + + ↕ Human reviews diff and merges PR ↕ + +GHE push webhook → Lambda tf-run-webhook-handler + └─> Reads .sc-automation.yml from default branch + └─> Starts CodeBuild: tf-run-executor (fire-and-forget) + +CodeBuild: tf-run-executor (csvd-dev) + └─> Clones account repo at main (post-merge state) + └─> Optionally assumes cross-account IAM role (sc-automation-codebuild-role) + └─> Runs tf-run apply in LAYER/REGION_DIR + └─> Commits post-apply changes (lock file, remote_state symlinks) to main [skip ci] + └─> Writes ✅/❌ commit status to GHE ``` -This replaces the current CodeBuild + terraform-eks-deployment path with a -GHA-native approach that keeps workflow logic inside the target repos. +This replaces the current CodeBuild + terraform-eks-deployment path. 
+Workflow logic lives in `buildspec-proposer.yml` and `buildspec-executor.yml` +in this repo; product-type-specific logic lives in `handler.py` in each template repo. --- @@ -154,7 +167,7 @@ scripts in `/apps/terraform/bin/`. Key behavior: - `aws_account_id` and `aws_region` are auto-resolved via `!Sub` in CFN; do NOT add them as user-facing SC form parameters - Lambda ServiceToken: `arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:{name}` -- Lambda timeout must be ≥ CodeBuild/GHA poll window (currently 900s) +- Lambda timeout must be ≥ CodeBuild poll window (currently 900s) --- @@ -165,8 +178,7 @@ scripts in `/apps/terraform/bin/`. Key behavior: - ❌ Do not write temp files to `/tmp` — use `~/tmp` - ❌ Do not use `terraform` directly — use `tf` alias (`tf plan`, `tf apply`) - ❌ Do not run AWS CLI/boto3 without `export AWS_DEFAULT_REGION=us-gov-west-1` -- ❌ Do not add `vpc_id` — field is `vpc_name` -- ❌ Do not use `HappyPathway/terraform-github-repo` public module -- ✅ DO use `CSVD/terraform-github-repo` (https://github.e.it.census.gov/CSVD/terraform-github-repo) - ✅ DO use `gh` CLI for PR management - ✅ DO use `GH_HOST=github.e.it.census.gov` for all GHE commands +- ✅ Cross-account role name is `sc-automation-codebuild-role` — must exist in every target + account and trust the CodeBuild IAM role from csvd-dev before the first executor run diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..87bef24 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +# Packer pipeline zip files +tf-run-executor-builder.zip + +# Terraform state and local overrides +*.tfstate +*.tfstate.backup +*.tfvars +.terraform/ +.terraform.lock.hcl +.terraform_commits +terraform_data_dirs/ +varfiles/ diff --git a/buildspec-executor.yml b/buildspec-executor.yml new file mode 100644 index 0000000..8973fa3 --- /dev/null +++ b/buildspec-executor.yml @@ -0,0 +1,185 @@ +version: 0.2 + +# --------------------------------------------------------------------------- +# 
tf-run-executor buildspec +# +# Purpose: clone account repo main branch, optionally assume a cross-account +# IAM role, and run tf-run apply in the target layer/region directory. +# This is triggered AFTER a proposer PR has been reviewed and merged. +# It does not render templates or open a PR. +# +# Required env-var overrides per build (supplied by Lambda): +# ACCOUNT_REPO - account repo name, e.g. 229685449397-csvd-dev-platform-dev-gov +# LAYER - terraform layer: common | infrastructure | vpc +# REGION_DIR - region directory: east | west | global +# GITHUB_TOKEN - GHE PAT (PLAINTEXT, value from Secrets Manager) +# +# Optional env-var overrides: +# TARGET_ACCOUNT_ID - AWS account ID to assume the cross-account role in +# (default: empty = run with CodeBuild role, csvd-dev only) +# CROSS_ACCOUNT_ROLE - IAM role name to assume in TARGET_ACCOUNT_ID +# (default: r-inf-terraform) +# TF_RUN_START_TAG - tf-run.data TAG label to start from (default: empty = from top) +# DRY_RUN - "true" = tf-run plan only, no apply (default: "false") +# --------------------------------------------------------------------------- + +env: + variables: + GITHUB_ORG: "SCT-Engineering" + TF_BINARY_S3_PREFIX: "s3://csvd-packer-pipeline-assets/terraform" + GH_CLI_S3_PREFIX: "s3://csvd-packer-pipeline-assets/tools" + CENSUS_CA_S3: "s3://csvd-packer-pipeline-assets/certs/census-ca.pem" + TERRAFORM_SUPPORT_REPO: "terraform/support" + HTTPS_PROXY: "http://proxy.tco.census.gov:3128" + NO_PROXY: "github.e.it.census.gov,169.254.169.254,169.254.170.2" + # Per-build defaults (overridden via environmentVariablesOverride in Lambda) + TARGET_ACCOUNT_ID: "" + CROSS_ACCOUNT_ROLE: "r-inf-terraform" + TF_RUN_START_TAG: "" + DRY_RUN: "false" + +phases: + install: + commands: + # --- Version governance: clone terraform/support to read org-canonical versions --- + - git clone --depth 1 "https://${GITHUB_TOKEN}@github.e.it.census.gov/${TERRAFORM_SUPPORT_REPO}.git" /tmp/tf-support + - export TF_VERSION=$(cat 
/tmp/tf-support/terraform/VERSION) + - export GH_VERSION=$(cat /tmp/tf-support/github-cli-releases/VERSION) + - echo "Using Terraform ${TF_VERSION}, gh CLI ${GH_VERSION}" + + # --- Terraform binary (registry.terraform.io is blocked on Census network; use S3) --- + - aws s3 cp "${TF_BINARY_S3_PREFIX}/terraform_${TF_VERSION}_linux_amd64.zip" /tmp/terraform.zip + - unzip -o /tmp/terraform.zip -d /usr/local/bin/ && chmod +x /usr/local/bin/terraform + - ln -sf /usr/local/bin/terraform /usr/local/bin/tf + + # --- Census CA certificate (required for TLS to github.e.it.census.gov) --- + - aws s3 cp "$CENSUS_CA_S3" /etc/pki/ca-trust/source/anchors/census-ca.pem + - update-ca-trust extract + + # --- tf-run toolchain (sourced from terraform/support, already cloned above) --- + # Canonical versions live in terraform/support local-app/ — no copies kept in this repo. + - cp /tmp/tf-support/local-app/tf-run/tf-run.sh /usr/local/bin/tf-run + - cp /tmp/tf-support/local-app/tf-control/tf-control.sh /usr/local/bin/tf-control.sh + - cp /tmp/tf-support/local-app/tf-directory-setup/tf-directory-setup.py /usr/local/bin/tf-directory-setup.py + - chmod +x /usr/local/bin/tf-run /usr/local/bin/tf-control.sh /usr/local/bin/tf-directory-setup.py + # Create tf-{action} symlinks expected by tf-run and account repo steps + - > + for action in init plan apply destroy refresh output validate import state fmt taint console; do + ln -sf /usr/local/bin/tf-control.sh /usr/local/bin/tf-${action}; + done + # Account repo .tf-control files set TFCOMMAND=terraform_latest (the Census workstation alias). + # In CodeBuild the binary is just 'terraform'; create the alias so tf-control.sh resolves it. 
+ - ln -sf /usr/local/bin/terraform /usr/local/bin/terraform_latest + + # --- Plugin cache directory (referenced by .tf-control.tfrc in every account repo) --- + # .tf-control.tfrc sets plugin_cache_dir = "/data/terraform/terraform.d/plugin-cache" + # and filesystem_mirror path = "/data/terraform/terraform.d/providers". + # Create both so Terraform does not error on init; the mirror is empty so Terraform + # falls through to the 'direct' block in the tfrc (via Census proxy to registry.terraform.io). + - mkdir -p /data/terraform/terraform.d/plugin-cache /data/terraform/terraform.d/providers + + # --- Python deps for tf-directory-setup.py --- + - pip3 install --quiet python-dateutil pyyaml + + # --- gh CLI (from S3, for any post-apply verification steps) --- + - aws s3 cp "${GH_CLI_S3_PREFIX}/gh_${GH_VERSION}_linux_amd64.tar.gz" /tmp/gh.tar.gz + - mkdir -p /tmp/gh-cli && tar -xzf /tmp/gh.tar.gz -C /tmp/gh-cli --strip-components=1 + - cp /tmp/gh-cli/bin/gh /usr/local/bin/gh && chmod +x /usr/local/bin/gh + + build: + commands: + # --- Configure git to rewrite SSH URLs to HTTPS --- + # Module sources in account repos use ssh://git@github.e.it.census.gov/... or git@... + # This rewrite transparently redirects them to HTTPS + PAT at the git layer. 
+ - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "ssh://git@github.e.it.census.gov/" + - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "git@github.e.it.census.gov:" + + # --- Clone account repo from main (the reviewed + merged state) --- + - git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" repo + - cd repo + # Verify we are on main (not a work branch) + - git checkout main + - echo "Applying from $(git rev-parse --short HEAD) on main" + + # --- Assume cross-account role (if TARGET_ACCOUNT_ID is set) --- + # The role (default: r-inf-terraform) must exist in the target account and + # trust arn:...:iam::229685449397:role/tf-run-executor-codebuild. + # Override CROSS_ACCOUNT_ROLE per-build to use a different role name. + - | + if [ -n "${TARGET_ACCOUNT_ID}" ]; then + PARTITION=$(aws sts get-caller-identity --query Arn --output text | cut -d: -f2) + ROLE_ARN="arn:${PARTITION}:iam::${TARGET_ACCOUNT_ID}:role/${CROSS_ACCOUNT_ROLE}" + echo "Assuming cross-account role: ${ROLE_ARN}" + CREDS=$(aws sts assume-role \ + --role-arn "${ROLE_ARN}" \ + --role-session-name "sc-automation-${ACCOUNT_REPO}" \ + --query Credentials \ + --output json) + export AWS_ACCESS_KEY_ID=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['AccessKeyId'])") + export AWS_SECRET_ACCESS_KEY=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['SecretAccessKey'])") + export AWS_SESSION_TOKEN=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['SessionToken'])") + echo "Assumed role in account ${TARGET_ACCOUNT_ID}" + else + echo "No TARGET_ACCOUNT_ID set — running with CodeBuild role (csvd-dev only)" + fi + + # --- Run Terraform in target layer/region directory --- + # tf-run auto-proceeds on non-TTY stdin (read -t timeout defaults to "y") + # + # NOTE on file-generating tf-run.data directives: + # REMOTE-STATE — 
generates workspace remote_state.yml from parent + # COMMAND tf-directory-setup.py — generates remote_state.backend.tf + variant files + # The Proposer already ran both of these and committed the results in the PR. + # When tf-run hits these steps here they are idempotent: they overwrite files + # that already exist with identical content. No new files are created at apply time. + # + # NOTE on logs/: tf-control.sh writes every plan/apply to logs/{action}.{timestamp}.log. + # This directory is ephemeral (never committed). Ensure logs/ is in .gitignore. + - cd "${LAYER}/${REGION_DIR}" + - | + if [ "${DRY_RUN}" = "true" ]; then + echo "DRY_RUN=true — running tf-run plan only" + TFARGS="-no-color" tf-run plan + elif [ -n "${TF_RUN_START_TAG}" ]; then + TFARGS="-auto-approve" tf-run apply "tag:${TF_RUN_START_TAG}" + else + TFARGS="-auto-approve" tf-run apply + fi + + # --- Commit post-apply file changes back to main --- + # After a successful apply tf-run.data typically runs: + # COMMAND tf-directory-setup.py --link s3 + # which re-links remote_state.{dir}.tf from .tf.none → .tf.s3. + # terraform init also generates/updates .terraform.lock.hcl. + # Both of these changes must be committed back to main so: + # (a) the repo reflects actual state for future Proposer re-renders + # (b) subsequent tf-init on main does not re-download all providers + # [skip ci] prevents the push from re-triggering the webhook executor. No commit or push happens when DRY_RUN=true. Each path is staged by its own git add because one unmatched or ignored pathspec makes git add stage nothing. + - cd "${CODEBUILD_SRC_DIR}/repo" + - | + git add -A -- "${LAYER}/${REGION_DIR}/remote_state."* 2>/dev/null || true + git add -A -- "${LAYER}/${REGION_DIR}/.terraform.lock.hcl" 2>/dev/null || true + if [ "${DRY_RUN}" != "true" ] && ! 
git diff --cached --quiet; then + git -c user.email="sc-automation@census.gov" \ + -c user.name="SC Automation" \ + commit -m "chore: executor post-apply update ${LAYER}/${REGION_DIR} [skip ci]" + git push \ + "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" \ + HEAD:main + echo "Committed and pushed post-apply changes to main" + else + echo "No post-apply file changes to commit" + fi + + post_build: + commands: + - echo "BUILD_RESULT=${CODEBUILD_BUILD_SUCCEEDING}" + - echo "ACCOUNT_REPO=${ACCOUNT_REPO}" + - echo "LAYER=${LAYER} REGION_DIR=${REGION_DIR}" + +cache: + paths: + # Cache the provider plugin cache across builds for faster tf-init. + # Providers downloaded via Census proxy are stored here; subsequent builds + # skip re-downloading providers that haven't changed. + - /data/terraform/terraform.d/plugin-cache/**/* diff --git a/buildspec-proposer.yml b/buildspec-proposer.yml new file mode 100644 index 0000000..28e457d --- /dev/null +++ b/buildspec-proposer.yml @@ -0,0 +1,244 @@ +version: 0.2 + +# --------------------------------------------------------------------------- +# tf-run-proposer buildspec +# +# Purpose: clone account repo, render template files, write extra files, +# commit + push to a work branch, open a PR for human review. +# Does NOT run Terraform — that is the executor's job after merge. +# +# Required env-var overrides per build (supplied by Lambda): +# ACCOUNT_REPO - account repo name, e.g. 
229685449397-csvd-dev-platform-dev-gov +# LAYER - terraform layer: common | infrastructure | vpc +# REGION_DIR - region directory: east | west | global +# GITHUB_TOKEN - GHE PAT (PLAINTEXT, value from Secrets Manager) +# +# Optional env-var overrides: +# GIT_BRANCH - branch to commit/PR from (default: propose/sc-automation) +# TEMPLATE_REPO - GHE repo containing workload template files (flat layout) +# TEMPLATE_SOURCE_PATH - subdirectory within TEMPLATE_REPO to use as root (empty = whole repo) +# TEMPLATE_VARS - JSON map of Jinja2 variables for template rendering +# EXTRA_FILES - JSON map {"relative/path": "content"} written after template rendering +# --------------------------------------------------------------------------- + +env: + variables: + GITHUB_ORG: "SCT-Engineering" + CENSUS_CA_S3: "s3://csvd-packer-pipeline-assets/certs/census-ca.pem" + # Org-canonical version governance repo (needed for gh CLI version) + TERRAFORM_SUPPORT_REPO: "terraform/support" + GH_CLI_S3_PREFIX: "s3://csvd-packer-pipeline-assets/tools" + HTTPS_PROXY: "http://proxy.tco.census.gov:3128" + NO_PROXY: "github.e.it.census.gov,169.254.169.254,169.254.170.2" + # Per-build defaults (overridden via environmentVariablesOverride in Lambda) + GIT_BRANCH: "propose/sc-automation" + TEMPLATE_REPO: "" + TEMPLATE_SOURCE_PATH: "" # subdirectory within TEMPLATE_REPO to use as root (empty = whole repo) + TEMPLATE_VARS: "{}" + EXTRA_FILES: "{}" + +phases: + install: + commands: + # --- Version governance: clone terraform/support to read org-canonical versions --- + - git clone --depth 1 "https://${GITHUB_TOKEN}@github.e.it.census.gov/${TERRAFORM_SUPPORT_REPO}.git" /tmp/tf-support + - export GH_VERSION=$(cat /tmp/tf-support/github-cli-releases/VERSION) + - echo "Using gh CLI ${GH_VERSION}" + + # --- Census CA certificate (required for TLS to github.e.it.census.gov) --- + - aws s3 cp "$CENSUS_CA_S3" /etc/pki/ca-trust/source/anchors/census-ca.pem + - update-ca-trust extract + + # --- 
tf-directory-setup.py (generates remote_state.backend.tf + variant files) --- + # Must be available in Proposer because ALL file generation happens here, not in Executor. + - cp /tmp/tf-support/local-app/tf-directory-setup/tf-directory-setup.py /usr/local/bin/tf-directory-setup.py + - chmod +x /usr/local/bin/tf-directory-setup.py + + # --- Python deps for template rendering + tf-directory-setup.py --- + - pip3 install --quiet jinja2 python-dateutil pyyaml + + # --- gh CLI (from S3, version pinned in terraform/support) --- + - aws s3 cp "${GH_CLI_S3_PREFIX}/gh_${GH_VERSION}_linux_amd64.tar.gz" /tmp/gh.tar.gz + - mkdir -p /tmp/gh-cli && tar -xzf /tmp/gh.tar.gz -C /tmp/gh-cli --strip-components=1 + - cp /tmp/gh-cli/bin/gh /usr/local/bin/gh && chmod +x /usr/local/bin/gh + + build: + commands: + # --- Configure git to rewrite SSH URLs to HTTPS --- + # Account repos reference Terraform modules via ssh://git@github.e.it.census.gov/... + # This rewrite makes those paths visible to git without requiring an SSH key. + - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "ssh://git@github.e.it.census.gov/" + - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "git@github.e.it.census.gov:" + + # --- Clone account repo and check out (or create) the work branch --- + - git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" repo + - cd repo + - git checkout -B "${GIT_BRANCH}" + + # --- Render template repo (if specified) --- + # Clone TEMPLATE_REPO; render .j2 files with TEMPLATE_VARS via Jinja2 StrictUndefined; + # copy non-template files as-is. + # Template files are FLAT (no layer/workspace nesting inside the template repo). + # They are written into ${LAYER}/${REGION_DIR}/ in the account repo, which is + # already known from the env vars supplied by the Lambda. 
+ - | + if [ -n "${TEMPLATE_REPO}" ]; then + git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${TEMPLATE_REPO}.git" /tmp/template-repo + python3 - <<'PYEOF' + import json, os, pathlib, shutil + from jinja2 import Environment, FileSystemLoader, StrictUndefined + + template_vars = json.loads(os.environ.get('TEMPLATE_VARS', '{}')) + layer = os.environ['LAYER'] + region_dir = os.environ['REGION_DIR'] + + src_root = pathlib.Path('/tmp/template-repo') + # Flat template files land at LAYER/REGION_DIR/ in the account repo. + # source_path lets a single template repo hold multiple product variants + # as subdirectories; only that subdirectory is used as the source root. + source_path = os.environ.get('TEMPLATE_SOURCE_PATH', '').strip('/') + if source_path: + src_root = src_root / source_path + + dst_root = pathlib.Path('.') / layer / region_dir + dst_root.mkdir(parents=True, exist_ok=True) + + rendered = 0 + copied = 0 + for src in src_root.rglob('*'): + if src.is_dir() or any(part.startswith('.git') for part in src.parts): + continue + rel = src.relative_to(src_root) + # Files starting with '.' at the template root are written to the account + # repo root (e.g. .sc-automation.yml), not into LAYER/REGION_DIR/. 
+ if len(rel.parts) == 1 and rel.name.startswith('.'): + dst_base = pathlib.Path('.') + else: + dst_base = dst_root + if src.suffix == '.j2': + dst = dst_base / rel.with_suffix('') + dst.parent.mkdir(parents=True, exist_ok=True) + env = Environment( + loader=FileSystemLoader(str(src.parent)), + undefined=StrictUndefined, + keep_trailing_newline=True, + ) + content = env.get_template(src.name).render(**template_vars) + dst.write_text(content) + rendered += 1 + else: + dst = dst_base / rel + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + copied += 1 + print(f'Template repo: rendered {rendered} .j2 file(s), copied {copied} file(s) -> {layer}/{region_dir}/') + PYEOF + else + echo 'No TEMPLATE_REPO specified — skipping template rendering' + fi + + # --- Write extra config files (JSON map path -> content); override template output --- + - | + python3 -c " + import json, os, pathlib + files = json.loads(os.environ.get('EXTRA_FILES', '{}')) + for path, content in files.items(): + p = pathlib.Path(path) + p.parent.mkdir(parents=True, exist_ok=True) + p.write_text(content) + print(f'Wrote {len(files)} extra file(s)') + " + + # --- Bootstrap workspace state files (REMOTE-STATE + tf-directory-setup.py) --- + # tf-run.sh's REMOTE-STATE directive generates workspace remote_state.yml at apply time. + # tf-run.data COMMAND steps run tf-directory-setup.py to generate remote_state.backend.tf + # and the three variant files (.tf.s3 / .tf.local / .tf.none) + activate the symlink. + # + # ALL of this must happen in the Proposer so every generated file appears in the PR diff. + # The Executor must not silently create files; it inherits what the PR committed. 
+ - | + python3 - <<'PYEOF' + import os, re, subprocess, sys, pathlib + + repo_root = pathlib.Path('.') + + for tfrun_data in sorted(repo_root.rglob('tf-run.data')): + ws_dir = tfrun_data.parent + # Skip .git internals + if any(p.startswith('.git') for p in ws_dir.parts): + continue + + content = tfrun_data.read_text() + lines = [l.strip() for l in content.splitlines() if l.strip() and not l.startswith('#')] + + # ── Step 1: REMOTE-STATE ────────────────────────────────────────────────── + # Mirrors tf-run.sh: read ../remote_state.yml, append /{workspace_name} to + # the directory field, write workspace-level remote_state.yml. + if any(l.startswith('REMOTE-STATE') for l in lines): + parent_rs = ws_dir.parent / 'remote_state.yml' + if not parent_rs.exists(): + print(f'WARNING: {ws_dir}: REMOTE-STATE in tf-run.data but no ' + f'parent remote_state.yml found — skipping', flush=True) + continue + parent_text = parent_rs.read_text() + subdir = ws_dir.name + # Replicate: sed -E s#(^directory.*)\"\'#\1/{subdir}\" + ws_rs_text = re.sub( + r'^(directory\s*:\s*")([^"]+)(")', + lambda m: m.group(1) + m.group(2).rstrip('/') + '/' + subdir + m.group(3), + parent_text, count=1, flags=re.MULTILINE + ) + ws_rs = ws_dir / 'remote_state.yml' + ws_rs.write_text(ws_rs_text) + print(f'REMOTE-STATE: wrote {ws_rs} (directory += /{subdir})', flush=True) + + # ── Step 2: tf-directory-setup.py ──────────────────────────────────────── + # Run whenever the workspace has a remote_state.yml (just written or from + # the template). Generates remote_state.backend.tf + 3 variant files. + # --link none: initial state; the Executor will re-link to s3 after first apply. 
+ rs_file = ws_dir / 'remote_state.yml' + if rs_file.exists(): + result = subprocess.run( + [sys.executable, '/usr/local/bin/tf-directory-setup.py', '--link', 'none'], + cwd=str(ws_dir), capture_output=True, text=True + ) + print(result.stdout, end='', flush=True) + if result.returncode != 0: + print(f'ERROR: tf-directory-setup.py failed in {ws_dir}:\n{result.stderr}', + file=sys.stderr, flush=True) + sys.exit(result.returncode) + + print('Bootstrap complete.', flush=True) + PYEOF + + # --- Commit and push (the work branch is rebuilt from main on every run, so an earlier proposal on the remote is replaced; the lease is the branch state seen at clone time) --- + - git add -A + - | + git -c user.email="sc-automation@census.gov" \ + -c user.name="SC Automation" \ + commit -m "SC propose: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ + --allow-empty + - git push --force-with-lease origin "${GIT_BRANCH}" + + # --- Open PR (idempotent: skip if PR already exists for this branch) --- + - | + GH_HOST=github.e.it.census.gov \ + GH_TOKEN="${GITHUB_TOKEN}" \ + gh pr create \ + --title "SC propose: ${LAYER}/${REGION_DIR} [${ACCOUNT_REPO}]" \ + --body "Automated proposal from Service Catalog. Review and merge, then launch the **Apply** product to run \`tf-run apply\`." \ + --base main \ + --head "${GIT_BRANCH}" \ + || echo "PR already exists or create failed — continuing" + + post_build: + commands: + - echo "BUILD_RESULT=${CODEBUILD_BUILD_SUCCEEDING}" + - | + PR_URL=$(GH_HOST=github.e.it.census.gov \ + GH_TOKEN="${GITHUB_TOKEN}" \ + gh pr view \ + --repo "${GITHUB_ORG}/${ACCOUNT_REPO}" \ + "${GIT_BRANCH}" \ + --json url -q .url 2>/dev/null || echo "") + echo "PR_URL=${PR_URL}" diff --git a/buildspec.yml b/buildspec.yml index 23e9778..9a64d7a 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -4,48 +4,74 @@ version: 0.2 # tf-run-executor buildspec # # Required env-var overrides per build (supplied by Lambda or manual CLI): -# ACCOUNT_REPO - account repo name, e.g. 
229685449397-csvd-dev-platform-dev-gov -# LAYER - terraform layer: common | infrastructure | vpc -# REGION_DIR - region directory: east | west -# GITHUB_TOKEN - GHE PAT (type PLAINTEXT, value from Secrets Manager) +# ACCOUNT_REPO - account repo name, e.g. 229685449397-csvd-dev-platform-dev-gov +# LAYER - terraform layer: common | infrastructure | vpc +# REGION_DIR - region directory: east | west +# GITHUB_TOKEN - GHE PAT (type PLAINTEXT, value from Secrets Manager) # # Optional env-var overrides: -# GIT_BRANCH - branch to commit/PR from (default: repo-init) -# TF_RUN_START_TAG - tf-run.data TAG label to start from (default: empty = from top) -# EXTRA_FILES - JSON map {"relative/path": "content"} written before tf-run -# DRY_RUN - "true" = tf plan only, no apply (default: "false") +# GIT_BRANCH - branch to commit/PR from (default: repo-init) +# TF_RUN_START_TAG - tf-run.data TAG label to start from (default: empty = from top) +# TEMPLATE_REPO - GHE repo containing Jinja2/.tf template files (default: empty) +# TEMPLATE_VARS - JSON map of Jinja2 variables for template rendering (default: {}) +# EXTRA_FILES - JSON map {"relative/path": "content"} written after template rendering +# DRY_RUN - "true" = tf plan only, no apply (default: "false") +# TARGET_ACCOUNT_ID - AWS account ID to assume role in before running tf-run +# (default: empty = run with CodeBuild's own credentials, +# i.e. csvd-dev. Set this when targeting a different account.) +# CROSS_ACCOUNT_ROLE - IAM role name to assume in TARGET_ACCOUNT_ID +# (default: r-inf-terraform) # --------------------------------------------------------------------------- env: variables: GITHUB_ORG: "SCT-Engineering" - TF_BINARY_S3: "s3://csvd-packer-pipeline-assets/terraform/terraform_1.9.1_linux_amd64.zip" + # S3 prefixes — filenames are resolved at build time from terraform/support VERSION files. + # The S3 bucket must contain the version pinned in terraform/support (keep in sync). 
+ TF_BINARY_S3_PREFIX: "s3://csvd-packer-pipeline-assets/terraform" + GH_CLI_S3_PREFIX: "s3://csvd-packer-pipeline-assets/tools" CENSUS_CA_S3: "s3://csvd-packer-pipeline-assets/certs/census-ca.pem" - GH_CLI_S3: "s3://csvd-packer-pipeline-assets/tools/gh_2.49.0_linux_amd64.tar.gz" + # Org-canonical version governance: clone this repo to read VERSION files + TERRAFORM_SUPPORT_REPO: "terraform/support" HTTPS_PROXY: "http://proxy.tco.census.gov:3128" - NO_PROXY: "github.e.it.census.gov,169.254.169.254" + NO_PROXY: "github.e.it.census.gov,169.254.169.254,169.254.170.2" # Per-build defaults (overridden via environmentVariablesOverride in Lambda) GIT_BRANCH: "repo-init" DRY_RUN: "false" TF_RUN_START_TAG: "" + TEMPLATE_REPO: "" + TEMPLATE_VARS: "{}" EXTRA_FILES: "{}" + TARGET_ACCOUNT_ID: "" + CROSS_ACCOUNT_ROLE: "r-inf-terraform" phases: install: commands: - # --- Terraform binary (registry.terraform.io is blocked; pull from S3) --- - - aws s3 cp "$TF_BINARY_S3" /tmp/terraform.zip + # --- Version governance: clone terraform/support to read org-canonical versions --- + # This repo (github.e.it.census.gov/terraform/support) is the single source of truth + # for which Terraform and gh CLI versions the org has blessed. We read VERSION files + # from it rather than hardcoding versions here. + - git clone --depth 1 "https://${GITHUB_TOKEN}@github.e.it.census.gov/${TERRAFORM_SUPPORT_REPO}.git" /tmp/tf-support + - export TF_VERSION=$(cat /tmp/tf-support/terraform/VERSION) + - export GH_VERSION=$(cat /tmp/tf-support/github-cli-releases/VERSION) + - echo "Using Terraform ${TF_VERSION}, gh CLI ${GH_VERSION}" + + # --- Terraform binary (registry.terraform.io is blocked on Census network; use S3) --- + # S3 bucket must contain the version pinned in terraform/support/terraform/VERSION. 
+ - aws s3 cp "${TF_BINARY_S3_PREFIX}/terraform_${TF_VERSION}_linux_amd64.zip" /tmp/terraform.zip - unzip -o /tmp/terraform.zip -d /usr/local/bin/ && chmod +x /usr/local/bin/terraform - ln -sf /usr/local/bin/terraform /usr/local/bin/tf - # --- Census CA certificate (GHE TLS) --- + # --- Census CA certificate (required for TLS to github.e.it.census.gov) --- - aws s3 cp "$CENSUS_CA_S3" /etc/pki/ca-trust/source/anchors/census-ca.pem - update-ca-trust extract - # --- tf-run toolchain (sourced from this repo's scripts/) --- - - cp "$CODEBUILD_SRC_DIR/scripts/tf-run" /usr/local/bin/tf-run - - cp "$CODEBUILD_SRC_DIR/scripts/tf-control.sh" /usr/local/bin/tf-control.sh - - cp "$CODEBUILD_SRC_DIR/scripts/tf-directory-setup.py" /usr/local/bin/tf-directory-setup.py + # --- tf-run toolchain (sourced from terraform/support, already cloned above) --- + # Canonical versions live in terraform/support local-app/ — no copies kept in this repo. + - cp /tmp/tf-support/local-app/tf-run/tf-run.sh /usr/local/bin/tf-run + - cp /tmp/tf-support/local-app/tf-control/tf-control.sh /usr/local/bin/tf-control.sh + - cp /tmp/tf-support/local-app/tf-directory-setup/tf-directory-setup.py /usr/local/bin/tf-directory-setup.py - chmod +x /usr/local/bin/tf-run /usr/local/bin/tf-control.sh /usr/local/bin/tf-directory-setup.py # Create tf-{action} symlinks expected by tf-run and account repo steps - > @@ -53,23 +79,75 @@ phases: ln -sf /usr/local/bin/tf-control.sh /usr/local/bin/tf-${action}; done - # --- Python deps for tf-directory-setup.py --- + # --- Python deps for tf-directory-setup.py and template rendering --- - pip3 install --quiet jinja2 python-dateutil pyyaml - # --- gh CLI --- - - aws s3 cp "$GH_CLI_S3" /tmp/gh.tar.gz + # --- gh CLI (S3 bucket must contain the version pinned in terraform/support) --- + - aws s3 cp "${GH_CLI_S3_PREFIX}/gh_${GH_VERSION}_linux_amd64.tar.gz" /tmp/gh.tar.gz - mkdir -p /tmp/gh-cli - tar -xzf /tmp/gh.tar.gz -C /tmp/gh-cli --strip-components=1 - cp /tmp/gh-cli/bin/gh 
/usr/local/bin/gh && chmod +x /usr/local/bin/gh build: commands: - # --- Clone account repo over HTTPS (SSH is blocked by Census proxy) --- + # --- Configure git to rewrite SSH URLs to HTTPS --- + # Account repos reference Terraform modules via ssh://git@github.e.it.census.gov/... + # This rewrite makes those module fetches work transparently via HTTPS + PAT, + # avoiding the need for a per-repo deploy key. + - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "ssh://git@github.e.it.census.gov/" + - git config --global url."https://${GITHUB_TOKEN}@github.e.it.census.gov/".insteadOf "git@github.e.it.census.gov:" + + # --- Clone account repo --- - git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${ACCOUNT_REPO}.git" repo - cd repo - git checkout -B "${GIT_BRANCH}" + # --- Render template repo (if specified) into account repo --- + # Clone TEMPLATE_REPO, render .j2 files with TEMPLATE_VARS via Jinja2, + # copy non-template files as-is. Results land in the account repo tree + # at the same relative paths. EXTRA_FILES applied afterwards can override. 
+ - | + if [ -n "${TEMPLATE_REPO}" ]; then + git clone "https://${GITHUB_TOKEN}@github.e.it.census.gov/${GITHUB_ORG}/${TEMPLATE_REPO}.git" /tmp/template-repo + python3 - <<'PYEOF' + import json, os, pathlib, shutil + from jinja2 import Environment, FileSystemLoader, StrictUndefined + + template_vars = json.loads(os.environ.get('TEMPLATE_VARS', '{}')) + src_root = pathlib.Path('/tmp/template-repo') + dst_root = pathlib.Path('.') # already inside cloned account repo + + rendered = 0 + copied = 0 + for src in src_root.rglob('*'): + if src.is_dir() or any(part.startswith('.git') for part in src.parts): + continue + rel = src.relative_to(src_root) + if src.suffix == '.j2': + # Render Jinja2 template; strip .j2 extension in destination + dst = dst_root / rel.with_suffix('') + dst.parent.mkdir(parents=True, exist_ok=True) + env = Environment( + loader=FileSystemLoader(str(src.parent)), + undefined=StrictUndefined, + keep_trailing_newline=True, + ) + content = env.get_template(src.name).render(**template_vars) + dst.write_text(content) + rendered += 1 + else: + dst = dst_root / rel + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + copied += 1 + print(f'Template repo: rendered {rendered} .j2 file(s), copied {copied} file(s)') + PYEOF + else + echo 'No TEMPLATE_REPO specified — skipping template rendering' + fi + # --- Write extra config files passed in from Lambda (JSON map path -> content) --- + # Applied after template rendering; keys here override template output. - | python3 -c " import json, os, pathlib @@ -90,6 +168,29 @@ phases: --allow-empty - git push origin "${GIT_BRANCH}" + # --- Assume cross-account role (if TARGET_ACCOUNT_ID is set) --- + # CodeBuild runs in csvd-dev by default. To run tf-run apply against resources + # in a different AWS account, set TARGET_ACCOUNT_ID. The role (default: + # r-inf-terraform) must exist in that account and trust the CodeBuild IAM + # role from csvd-dev. Override CROSS_ACCOUNT_ROLE per-build if needed. 
+ - | + if [ -n "${TARGET_ACCOUNT_ID}" ]; then + PARTITION=$(aws sts get-caller-identity --query Arn --output text | cut -d: -f2) + ROLE_ARN="arn:${PARTITION}:iam::${TARGET_ACCOUNT_ID}:role/${CROSS_ACCOUNT_ROLE}" + echo "Assuming cross-account role: ${ROLE_ARN}" + CREDS=$(aws sts assume-role \ + --role-arn "${ROLE_ARN}" \ + --role-session-name "$(printf 'sc-automation-%s' "${ACCOUNT_REPO}" | cut -c1-64)" \ + --query Credentials \ + --output json) || { echo "ERROR: failed to assume ${ROLE_ARN}; refusing to continue with the CodeBuild role" >&2; exit 1; } # fail fast: never fall through to tf-run with csvd-dev credentials; session name is capped at the 64-char STS limit + export AWS_ACCESS_KEY_ID=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['AccessKeyId'])") + export AWS_SECRET_ACCESS_KEY=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['SecretAccessKey'])") + export AWS_SESSION_TOKEN=$(echo "$CREDS" | python3 -c "import json,sys; print(json.load(sys.stdin)['SessionToken'])") + echo "Successfully assumed role in account ${TARGET_ACCOUNT_ID}" + else + echo "No TARGET_ACCOUNT_ID set — running with CodeBuild role (csvd-dev)" + fi + # --- Run Terraform in target layer/region directory --- # tf-run auto-proceeds on non-TTY stdin (read -t timeout defaults to "y") - cd "${LAYER}/${REGION_DIR}" diff --git a/buildspec.yml.j2 b/buildspec.yml.j2 deleted file mode 100644 index fd4ea2c..0000000 --- a/buildspec.yml.j2 +++ /dev/null @@ -1,93 +0,0 @@ -version: 0.2 - -env: - variables: - PACKER_TEMPLATE_FILE: "{{ packer_template_file }}" - AWS_REGION: "{{ aws_region }}" - ECR_REPOSITORY: "{{ ecr_repository }}" - AWS_ACCOUNT_ID: "{{ aws_account_id }}" - {% if environment_variables %} - {% for key, value in environment_variables.items() %} - {{ key }}: "{{ value }}" - {% endfor %} - {% endif %} - -phases: - install: - commands: - - echo "Installing Packer and dependencies for Service Catalog Lambda build..." - {% if tools %} - {% for tool in tools %} - - echo "Installing {{ tool.name }} version {{ tool.version }}..." 
- - aws s3 cp s3://{{ assets_bucket }}/{{ tool.zip_path }} /tmp/{{ tool.zip_path }} - - unzip -o /tmp/{{ tool.zip_path }} -d /tmp/{{ tool.name }} - - chmod +x /tmp/{{ tool.name }}/{{ tool.binary_name }} - - mv /tmp/{{ tool.name }}/{{ tool.binary_name }} {{ tool.install_path }}/ - - {{ tool.binary_name }} version - {% endfor %} - {% endif %} - - echo "Packer installation complete" - - pre_build: - commands: - - echo "Initializing Packer plugins for Lambda container build..." - - packer init ${PACKER_TEMPLATE_FILE} - - echo "Packer plugins initialized successfully" - - build: - commands: - - echo "Building Service Catalog Lambda container image..." - - # Get ECR login credentials - - echo "Logging into ECR..." - - aws ecr get-login-password --region ${AWS_REGION} | docker login --username AWS --password-stdin ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com - - # Build repository URI for ECR - - | - if [ -n "$ECR_REPOSITORY" ]; then - REPOSITORY_URI="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY}" - else - echo "ERROR: ECR_REPOSITORY is required for container builds" - exit 1 - fi - - # Set image tag - - | - if [ -n "$IMAGE_VERSION_TAG" ]; then - TAG="$IMAGE_VERSION_TAG" - elif [ -n "$IMAGE_TAG" ]; then - TAG="$IMAGE_TAG" - else - TAG="latest" - fi - - # Get ECR credentials for Packer - - ECR_USERNAME="AWS" - - ECR_PASSWORD=$(aws ecr get-login-password --region ${AWS_REGION}) - - ECR_LOGIN_SERVER="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com" - - # Use the cloned base image from ECR instead of public ECR - - BASE_IMAGE="{{ base_image }}" - - - echo "Building with repository_uri=$REPOSITORY_URI tag=$TAG base_image=$BASE_IMAGE" - - # Run Packer build with required variables for Lambda container - - | - packer build \ - -var "repository_uri=$REPOSITORY_URI" \ - -var "tag=$TAG" \ - -var "base_image=$BASE_IMAGE" \ - -var "ecr_login_username=$ECR_USERNAME" \ - -var "ecr_login_password=$ECR_PASSWORD" \ - -var 
"ecr_login_server=$ECR_LOGIN_SERVER" \ - ${PACKER_TEMPLATE_FILE} - - post_build: - commands: - - echo "Service Catalog Lambda container image build completed successfully" - - echo "Image pushed to $REPOSITORY_URI:$TAG" - - echo "Lambda function is ready for deployment with EventBridge and Service Catalog" - -artifacts: - files: - - '**/*' diff --git a/deploy/codebuild.tf b/deploy/codebuild.tf index d65d38a..1f4fcf3 100644 --- a/deploy/codebuild.tf +++ b/deploy/codebuild.tf @@ -15,10 +15,105 @@ data "aws_secretsmanager_secret_version" "ghe_token" { secret_id = "ghe-runner/github-token" } +resource "aws_codebuild_project" "tf_run_proposer" { + name = "tf-run-proposer" + description = "Clone account repo, render templates, commit, push, open PR — no Terraform execution" + build_timeout = 15 # minutes — fast, no tf-run + service_role = aws_iam_role.codebuild_exec.arn + + artifacts { + type = "NO_ARTIFACTS" + } + + environment { + compute_type = "BUILD_GENERAL1_SMALL" + image = "aws/codebuild/amazonlinux2023-x86_64-standard:4.0" + type = "LINUX_CONTAINER" + privileged_mode = false + + environment_variable { + name = "GITHUB_ORG" + value = var.github_org + } + environment_variable { + name = "CENSUS_CA_S3" + value = var.census_ca_s3 + } + environment_variable { + name = "GH_CLI_S3_PREFIX" + value = var.gh_cli_s3_prefix + } + environment_variable { + name = "HTTPS_PROXY" + value = var.https_proxy + } + environment_variable { + name = "NO_PROXY" + value = "github.e.it.census.gov,169.254.169.254,169.254.170.2" + } + # Placeholder values — always overridden by Lambda per-build + environment_variable { + name = "ACCOUNT_REPO" + value = "OVERRIDE_PER_BUILD" + } + environment_variable { + name = "LAYER" + value = "OVERRIDE_PER_BUILD" + } + environment_variable { + name = "REGION_DIR" + value = "OVERRIDE_PER_BUILD" + } + environment_variable { + name = "GITHUB_TOKEN" + type = "SECRETS_MANAGER" + value = var.github_token_secret_name + } + environment_variable { + name = 
"GIT_BRANCH" + value = "propose/sc-automation" + } + environment_variable { + name = "TEMPLATE_REPO" + value = "" + } + environment_variable { + name = "TEMPLATE_VARS" + value = "{}" + } + environment_variable { + name = "EXTRA_FILES" + value = "{}" + } + } + + source { + type = "GITHUB_ENTERPRISE" + location = var.source_repo_url + buildspec = "buildspec-proposer.yml" + git_clone_depth = 1 + } + + logs_config { + cloudwatch_logs { + group_name = "/aws/codebuild/tf-run-proposer" + stream_name = "" + status = "ENABLED" + } + } + + tags = { + Project = "sc-automation" + ManagedBy = "terraform" + } + + depends_on = [aws_codebuild_source_credential.ghe] +} + resource "aws_codebuild_project" "tf_run_executor" { name = "tf-run-executor" - description = "Clones account repo, writes config files, runs tf-run, opens PR" - build_timeout = 60 # minutes + description = "Clone account repo main branch, assume cross-account role, run tf-run apply" + build_timeout = 60 # minutes — tf-run apply can be slow service_role = aws_iam_role.codebuild_exec.arn artifacts { @@ -27,26 +122,25 @@ resource "aws_codebuild_project" "tf_run_executor" { environment { compute_type = "BUILD_GENERAL1_SMALL" - image = "aws/codebuild/amazonlinux2-x86_64-standard:3.0" + image = "aws/codebuild/amazonlinux2023-x86_64-standard:4.0" type = "LINUX_CONTAINER" privileged_mode = false - # --- Static defaults (overridden per-build via environmentVariablesOverride) --- environment_variable { name = "GITHUB_ORG" value = var.github_org } environment_variable { - name = "TF_BINARY_S3" - value = var.tf_binary_s3 + name = "TF_BINARY_S3_PREFIX" + value = var.tf_binary_s3_prefix } environment_variable { name = "CENSUS_CA_S3" value = var.census_ca_s3 } environment_variable { - name = "GH_CLI_S3" - value = var.gh_cli_s3 + name = "GH_CLI_S3_PREFIX" + value = var.gh_cli_s3_prefix } environment_variable { name = "HTTPS_PROXY" @@ -54,7 +148,7 @@ resource "aws_codebuild_project" "tf_run_executor" { } environment_variable { 
name = "NO_PROXY" - value = "github.e.it.census.gov,169.254.169.254" + value = "github.e.it.census.gov,169.254.169.254,169.254.170.2" } # Placeholder values — always overridden by Lambda per-build environment_variable { @@ -75,27 +169,27 @@ resource "aws_codebuild_project" "tf_run_executor" { value = var.github_token_secret_name } environment_variable { - name = "GIT_BRANCH" - value = "repo-init" + name = "TARGET_ACCOUNT_ID" + value = "" } environment_variable { - name = "DRY_RUN" - value = "false" + name = "CROSS_ACCOUNT_ROLE" + value = "r-inf-terraform" } environment_variable { name = "TF_RUN_START_TAG" value = "" } environment_variable { - name = "EXTRA_FILES" - value = "{}" + name = "DRY_RUN" + value = "false" } } source { type = "GITHUB_ENTERPRISE" location = var.source_repo_url - buildspec = "buildspec.yml" + buildspec = "buildspec-executor.yml" git_clone_depth = 1 } diff --git a/deploy/iam.tf b/deploy/iam.tf index 0398048..9eea4ed 100644 --- a/deploy/iam.tf +++ b/deploy/iam.tf @@ -112,9 +112,11 @@ data "aws_iam_policy_document" "codebuild_exec" { ] } - # Secrets Manager: read the GHE PAT at runtime (GITHUB_TOKEN env var) - # Note: CodeBuild uses PARAMETER_STORE for the token; this covers the SM read - # used during Terraform apply of source credentials (aws_codebuild_source_credential). + # Secrets Manager: read the GHE PAT at runtime. + # Both CodeBuild projects define GITHUB_TOKEN as type=SECRETS_MANAGER pointing to this + # secret. CodeBuild fetches the current value fresh at each build start using this + # permission, so the token never appears in StartBuild CloudTrail logs or BatchGetBuilds + # responses. This also covers the SM read in aws_codebuild_source_credential. 
statement { sid = "SecretsManagerReadGheToken" effect = "Allow" @@ -127,7 +129,7 @@ data "aws_iam_policy_document" "codebuild_exec" { ] } - # CloudWatch Logs: write build output + # CloudWatch Logs: write build output for both proposer and executor projects statement { sid = "CloudWatchLogsWrite" effect = "Allow" @@ -137,6 +139,8 @@ data "aws_iam_policy_document" "codebuild_exec" { "logs:PutLogEvents", ] resources = [ + "arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:log-group:/aws/codebuild/tf-run-proposer", + "arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:log-group:/aws/codebuild/tf-run-proposer:*", "arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:log-group:/aws/codebuild/tf-run-executor", "arn:${data.aws_partition.current.partition}:logs:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:log-group:/aws/codebuild/tf-run-executor:*", ] @@ -154,9 +158,24 @@ data "aws_iam_policy_document" "codebuild_exec" { "codebuild:BatchPutCodeCoverages", ] resources = [ + "arn:${data.aws_partition.current.partition}:codebuild:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:report-group/tf-run-proposer-*", "arn:${data.aws_partition.current.partition}:codebuild:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:report-group/tf-run-executor-*", ] } + + # STS: allow executor to assume a cross-account role in target accounts + # Only the executor needs this; proposer only needs GHE access. + # Default role is r-inf-terraform; can be overridden per-build via CROSS_ACCOUNT_ROLE. 
+ statement { + sid = "StsAssumeRoleCrossAccount" + effect = "Allow" + actions = ["sts:AssumeRole"] + resources = [ + "arn:${data.aws_partition.current.partition}:iam::*:role/r-inf-terraform", + "arn:${data.aws_partition.current.partition}:iam::*:role/r-inf-terraform-eks", + "arn:${data.aws_partition.current.partition}:iam::*:role/sc-automation-codebuild-role", + ] + } } resource "aws_iam_role_policy" "codebuild_exec" { diff --git a/deploy/lambda.tf b/deploy/lambda.tf index 602a46d..3747520 100644 --- a/deploy/lambda.tf +++ b/deploy/lambda.tf @@ -43,7 +43,8 @@ resource "aws_lambda_function" "tf_run_trigger" { environment { variables = { - CODEBUILD_PROJECT_NAME = aws_codebuild_project.tf_run_executor.name + PROPOSER_PROJECT_NAME = aws_codebuild_project.tf_run_proposer.name + EXECUTOR_PROJECT_NAME = aws_codebuild_project.tf_run_executor.name GITHUB_TOKEN_SECRET_NAME = var.github_token_secret_name GITHUB_API = var.github_api GITHUB_ORG_NAME = var.github_org diff --git a/deploy/service_catalog.tf b/deploy/service_catalog.tf index 0f55721..ba746fd 100644 --- a/deploy/service_catalog.tf +++ b/deploy/service_catalog.tf @@ -1,16 +1,31 @@ locals { - product_s3_key = "tf-run-executor/v${var.product_version}/product-template.yaml" - template_url = "https://${var.artifacts_bucket_name}.s3.${data.aws_region.current.name}.amazonaws.com/${local.product_s3_key}" + proposer_s3_key = "tf-run-proposer/v${var.product_version}/proposer-template.yaml" + executor_s3_key = "tf-run-executor/v${var.product_version}/executor-template.yaml" + proposer_url = "https://${var.artifacts_bucket_name}.s3.${data.aws_region.current.name}.amazonaws.com/${local.proposer_s3_key}" + executor_url = "https://${var.artifacts_bucket_name}.s3.${data.aws_region.current.name}.amazonaws.com/${local.executor_s3_key}" } # --------------------------------------------------------------------------- -# Upload product template to the centrally-managed SC artifacts bucket +# Upload product templates to the 
centrally-managed SC artifacts bucket # --------------------------------------------------------------------------- -resource "aws_s3_object" "product_template" { +resource "aws_s3_object" "proposer_template" { bucket = var.artifacts_bucket_name - key = local.product_s3_key - source = "${path.module}/../service-catalog/product-template.yaml" - etag = filemd5("${path.module}/../service-catalog/product-template.yaml") + key = local.proposer_s3_key + source = "${path.module}/../service-catalog/proposer-template.yaml" + etag = filemd5("${path.module}/../service-catalog/proposer-template.yaml") + + tags = { + "servicecatalog:provisioning" = "true" + Project = "sc-automation" + ManagedBy = "terraform" + } +} + +resource "aws_s3_object" "executor_template" { + bucket = var.artifacts_bucket_name + key = local.executor_s3_key + source = "${path.module}/../service-catalog/executor-template.yaml" + etag = filemd5("${path.module}/../service-catalog/executor-template.yaml") tags = { "servicecatalog:provisioning" = "true" @@ -34,18 +49,40 @@ resource "aws_servicecatalog_portfolio" "this" { } # --------------------------------------------------------------------------- -# Product +# Products # --------------------------------------------------------------------------- -resource "aws_servicecatalog_product" "tf_run" { +resource "aws_servicecatalog_product" "tf_run_proposer" { + name = "${var.portfolio_name_prefix}-tf-run-proposer" + owner = "CSVD Platform Engineering" + description = "Render templates, write config files, and open a PR for human review. Run this before the Apply product." 
+ type = "CLOUD_FORMATION_TEMPLATE" + + provisioning_artifact_parameters { + name = "v${var.product_version}" + description = "Version ${var.product_version}" + template_url = local.proposer_url + type = "CLOUD_FORMATION_TEMPLATE" + disable_template_validation = false + } + + tags = { + Project = "sc-automation" + ManagedBy = "terraform" + } + + depends_on = [aws_s3_object.proposer_template] +} + +resource "aws_servicecatalog_product" "tf_run_executor" { name = "${var.portfolio_name_prefix}-tf-run-executor" owner = "CSVD Platform Engineering" - description = "Trigger tf-run in an account repo layer via CodeBuild. Writes extra config files, applies Terraform, and opens a PR." + description = "Run tf-run apply in an account repo layer. Use after the Proposer PR has been reviewed and merged." type = "CLOUD_FORMATION_TEMPLATE" provisioning_artifact_parameters { name = "v${var.product_version}" description = "Version ${var.product_version}" - template_url = local.template_url + template_url = local.executor_url type = "CLOUD_FORMATION_TEMPLATE" disable_template_validation = false } @@ -55,15 +92,20 @@ resource "aws_servicecatalog_product" "tf_run" { ManagedBy = "terraform" } - depends_on = [aws_s3_object.product_template] + depends_on = [aws_s3_object.executor_template] } # --------------------------------------------------------------------------- -# Associate product with portfolio +# Associate both products with the portfolio # --------------------------------------------------------------------------- -resource "aws_servicecatalog_product_portfolio_association" "this" { +resource "aws_servicecatalog_product_portfolio_association" "proposer" { portfolio_id = aws_servicecatalog_portfolio.this.id - product_id = aws_servicecatalog_product.tf_run.id + product_id = aws_servicecatalog_product.tf_run_proposer.id +} + +resource "aws_servicecatalog_product_portfolio_association" "executor" { + portfolio_id = aws_servicecatalog_portfolio.this.id + product_id = 
aws_servicecatalog_product.tf_run_executor.id } # --------------------------------------------------------------------------- @@ -78,11 +120,11 @@ resource "aws_servicecatalog_principal_portfolio_association" "this" { } # --------------------------------------------------------------------------- -# Launch constraint role — assumed by CFN when launching the product +# Launch constraint role — shared by both products (same Lambda target) # --------------------------------------------------------------------------- resource "aws_iam_role" "sc_launch" { name = "${var.portfolio_name_prefix}-sc-launch-role" - description = "Role assumed by Service Catalog when launching tf-run-executor product" + description = "Role assumed by Service Catalog when launching proposer or executor product" assume_role_policy = jsonencode({ Version = "2012-10-17" @@ -149,14 +191,27 @@ resource "aws_iam_role_policy" "sc_launch" { }) } -resource "aws_servicecatalog_constraint" "launch" { +resource "aws_servicecatalog_constraint" "proposer_launch" { portfolio_id = aws_servicecatalog_portfolio.this.id - product_id = aws_servicecatalog_product.tf_run.id + product_id = aws_servicecatalog_product.tf_run_proposer.id type = "LAUNCH" parameters = jsonencode({ RoleArn = aws_iam_role.sc_launch.arn }) - description = "Launch constraint — uses a dedicated role to invoke the Lambda" + description = "Launch constraint — uses a dedicated role to invoke the Lambda (proposer)" } + +resource "aws_servicecatalog_constraint" "executor_launch" { + portfolio_id = aws_servicecatalog_portfolio.this.id + product_id = aws_servicecatalog_product.tf_run_executor.id + type = "LAUNCH" + + parameters = jsonencode({ + RoleArn = aws_iam_role.sc_launch.arn + }) + + description = "Launch constraint — uses a dedicated role to invoke the Lambda (executor)" +} + diff --git a/deploy/variables.tf b/deploy/variables.tf index ea0c197..4558239 100644 --- a/deploy/variables.tf +++ b/deploy/variables.tf @@ -10,10 +10,10 @@ variable 
"source_repo_url" { # e.g. "https://github.e.it.census.gov/SCT-Engineering/sc-lambda-ghactions" } -variable "tf_binary_s3" { - description = "S3 URI for the Terraform Linux AMD64 zip (registry.terraform.io is blocked)" +variable "tf_binary_s3_prefix" { + description = "S3 URI prefix for Terraform Linux AMD64 zips (registry.terraform.io is blocked). Filename is resolved at build time from the terraform/support VERSION file." type = string - default = "s3://csvd-packer-pipeline-assets/terraform/terraform_1.9.1_linux_amd64.zip" + default = "s3://csvd-packer-pipeline-assets/terraform" } variable "census_ca_s3" { @@ -22,10 +22,10 @@ variable "census_ca_s3" { default = "s3://csvd-packer-pipeline-assets/certs/census-ca.pem" } -variable "gh_cli_s3" { - description = "S3 URI for the gh CLI Linux AMD64 tarball" +variable "gh_cli_s3_prefix" { + description = "S3 URI prefix for gh CLI Linux AMD64 tarballs. Filename is resolved at build time from the terraform/support github-cli-releases/VERSION file." type = string - default = "s3://csvd-packer-pipeline-assets/tools/gh_2.49.0_linux_amd64.tar.gz" + default = "s3://csvd-packer-pipeline-assets/tools" } variable "https_proxy" { diff --git a/design-docs/CHECKPOINT.md b/design-docs/CHECKPOINT.md index 702551d..1d61dd4 100644 --- a/design-docs/CHECKPOINT.md +++ b/design-docs/CHECKPOINT.md @@ -2,7 +2,46 @@ ## 1. Last Updated -**2026-05-06** — Implementation complete: Phases 1–3 fully built and committed. +**2026-05-28** — Jira sub-tasks created under CSC-1341; three new ADRs added to `docs/decisions/`. + +--- + +## 1a. 
Jira Ticket Index + +Parent: **[CSC-1341](https://jira.it.census.gov/browse/CSC-1341)** — [sc-lambda-ghactions] Design & implement next-gen SC automation system + +**Completed work (In Review — GHE PR #1 open):** + +| Key | Summary | Priority | Status | ADR | +|-----|---------|----------|--------|-----| +| [CSC-1351](https://jira.it.census.gov/browse/CSC-1351) | Phase 1: CodeBuild runner + buildspec | High | In Review | — | +| [CSC-1352](https://jira.it.census.gov/browse/CSC-1352) | Phase 2: Lambda CFN Custom Resource handler | High | In Review | — | +| [CSC-1353](https://jira.it.census.gov/browse/CSC-1353) | Phase 3: Service Catalog product registration | High | In Review | — | +| [CSC-1354](https://jira.it.census.gov/browse/CSC-1354) | Architecture design, .sc-automation.yml schema, and deploy Terraform | High | In Review | — | +| [CSC-1355](https://jira.it.census.gov/browse/CSC-1355) | ADR-001: Webhook auto-apply on merge accepted | High | In Review | [ADR-001](../docs/decisions/001-webhook-auto-apply.md) | + +**Open / remaining work:** + +| Key | Summary | Priority | Status | ADR | +|-----|---------|----------|--------|-----| +| [CSC-1342](https://jira.it.census.gov/browse/CSC-1342) | Build and push Lambda container image to ECR (via packer-pipeline) | High | To Do | — | +| [CSC-1343](https://jira.it.census.gov/browse/CSC-1343) | End-to-end test: SC provision → CodeBuild → tf-run → PR → CFN SUCCESS | High | To Do | — | +| [CSC-1344](https://jira.it.census.gov/browse/CSC-1344) | Provision account baseline IAM role (sc-automation-codebuild-role) | High | To Do | [ADR-004](../docs/decisions/004-account-baseline-iam-role.md) | +| [CSC-1345](https://jira.it.census.gov/browse/CSC-1345) | ADR-002: Implement Vault AWS Secrets Engine for cross-account credentials | High | To Do | [ADR-002](../docs/decisions/002-vault-aws-secrets-engine.md) | +| [CSC-1346](https://jira.it.census.gov/browse/CSC-1346) | Vault cluster topology decision | Medium | To Do | 
[ADR-003](../docs/decisions/003-vault-cluster-topology.md) | +| [CSC-1348](https://jira.it.census.gov/browse/CSC-1348) | OU sharing and StackSet for Service Catalog portfolio | Medium | To Do | [ADR-005](../docs/decisions/005-portfolio-org-sharing.md) | +| [CSC-1349](https://jira.it.census.gov/browse/CSC-1349) | Migration runbook: lambda-template-repo-generator → sc-lambda-ghactions | Medium | To Do | — | +| [CSC-1350](https://jira.it.census.gov/browse/CSC-1350) | Phase 4 observability: CloudWatch dashboard + SNS alerts on FAILED builds | Low | To Do | — | + +**Decision documents index:** + +| ADR | File | Status | Linked tickets | +|-----|------|--------|---------------| +| ADR-001 | [docs/decisions/001-webhook-auto-apply.md](../docs/decisions/001-webhook-auto-apply.md) | Accepted | — | +| ADR-002 | [docs/decisions/002-vault-aws-secrets-engine.md](../docs/decisions/002-vault-aws-secrets-engine.md) | Proposed | CSC-1345 | +| ADR-003 | [docs/decisions/003-vault-cluster-topology.md](../docs/decisions/003-vault-cluster-topology.md) | Proposed | CSC-1346 | +| ADR-004 | [docs/decisions/004-account-baseline-iam-role.md](../docs/decisions/004-account-baseline-iam-role.md) | Accepted | CSC-1344, CSC-1348 | +| ADR-005 | [docs/decisions/005-portfolio-org-sharing.md](../docs/decisions/005-portfolio-org-sharing.md) | Proposed | CSC-1348 | --- diff --git a/docs/HOW-IT-WORKS.md b/docs/HOW-IT-WORKS.md new file mode 100644 index 0000000..a5ffaa4 --- /dev/null +++ b/docs/HOW-IT-WORKS.md @@ -0,0 +1,463 @@ +# How sc-lambda-ghactions Works + +This document explains the end-to-end flow of the SC Lambda + CodeBuild +automation system — from a user filling out a Service Catalog form through +to a Terraform plan or apply running inside an AWS account repository. 
+ +--- + +## Design Overview: Proposer Product + Webhook Auto-Apply + +The system uses a **single user-facing Service Catalog product** with a human +review gate before Terraform runs any infrastructure changes: + +| Component | CodeBuild Project | What It Does | +|-----------|------------------|--------------| +| SC Product: `tf-run-proposer` | `tf-run-proposer` | Clone repo → render templates → commit → open PR | +| Webhook (automatic) | `tf-run-executor` | Clone `main` → assume role → run `tf-run apply` | + +**Why not two SC products?** + +An earlier design exposed the executor as a second Service Catalog product, +requiring a human to return to the SC console after merging the PR, re-enter the +same parameters, and click Launch. This is pure operational overhead — the review +already happened at PR merge time, and the parameters needed to run the apply are +already recorded in `.sc-automation.yml` in the repo. + +The current design restores the PR as a genuine gate with no extra manual steps: + +1. A team provisions the **Proposer** product → changes are committed to a branch + and a PR is opened. No infrastructure is touched. +2. A human reviews the diff, approves, and merges the PR. +3. The GHE push-to-main webhook fires automatically → Lambda reads + `.sc-automation.yml` → starts `tf-run-executor` CodeBuild. No SC product, + no CFN stack, no user action required. + +See [ADR-001](decisions/001-webhook-auto-apply.md) for the full decision record. 
+ +--- + +## Component Overview + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ PROPOSE FLOW │ +│ │ +│ User fills SC form → CFN Custom Resource │ +│ └─> Lambda (tf-run-executor-trigger) │ +│ • Validates inputs (action=propose) │ +│ • Starts tf-run-proposer CodeBuild build │ +│ • Polls CodeBuild, captures PR URL │ +│ • Returns PR URL + repo URL to CFN │ +│ └─> CodeBuild: tf-run-proposer │ +│ • Installs: Census CA cert, gh CLI, Jinja2 │ +│ • Clones account repo │ +│ • Checks out / creates branch propose/sc-automation │ +│ • Renders .j2 templates from TEMPLATE_REPO │ +│ • Writes EXTRA_FILES │ +│ • git commit + git push │ +│ • gh pr create (idempotent) │ +│ • POST_BUILD emits PR_URL= │ +└─────────────────────────────────────────────────────────────────────┘ + + ↕ Human reviews PR, approves, merges ↕ + +┌─────────────────────────────────────────────────────────────────────┐ +│ AUTO-APPLY (webhook — no user action required) │ +│ │ +│ GHE push to main → Lambda Function URL (HMAC verified) │ +│ └─> Lambda (tf-run-webhook-handler) │ +│ • Reads .sc-automation.yml from default branch │ +│ • Starts tf-run-executor CodeBuild (fire-and-forget) │ +│ • Posts pending commit status to GHE │ +│ └─> CodeBuild: tf-run-executor │ +│ • Installs: Terraform binary (from S3), tf-run │ +│ toolchain, Census CA cert, gh CLI, Python deps │ +│ • Clones account repo at main (post-merge) │ +│ • Optionally assumes cross-account IAM role │ +│ • cd {LAYER}/{REGION_DIR} │ +│ • tf-run apply (respects TF_RUN_START_TAG) │ +│ • POST_BUILD writes commit status ✅/❌ to GHE │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Infrastructure Overview + +| Resource | Name | Account / Location | +|---|---|---| +| Lambda | `tf-run-executor-trigger` | csvd-dev (`229685449397`), `us-gov-west-1` | +| CodeBuild (proposer) | `tf-run-proposer` | csvd-dev | +| CodeBuild (executor) | `tf-run-executor` | csvd-dev | +| SC Portfolio | 
`{prefix}-tf-run` | csvd-dev | +| SC Product (propose) | `{prefix}-tf-run-proposer` | csvd-dev | +| CFN Template (propose) | `service-catalog/proposer-template.yaml` | S3 artifacts bucket | +| Lambda Function URL | `tf-run-webhook-handler` HTTPS endpoint | csvd-dev | +| GHE Webhook | Org-level push webhook → Lambda Function URL | GHE (manual one-time setup) | +| Launch Role | `{prefix}-sc-launch-role` | csvd-dev | +| GHE PAT | `ghe-runner/github-token` in Secrets Manager | csvd-dev | +| Cross-account role | `sc-automation-codebuild-role` | **Target** account | + +--- + +## CodeBuild Projects Reference + +There are exactly **two** CodeBuild projects. They are both in csvd-dev and are +never invoked directly by end users. + +--- + +### `tf-run-proposer` + +**Triggered by:** The Proposer SC product (user fills SC form → CFN Custom +Resource → Lambda starts this build and polls it). + +**What it does:** + +1. Clones the target account repo from GHE +2. Checks out (or creates) a proposal branch (default: `propose/sc-automation`) +3. Renders Jinja2 (`.j2`) template files from `TEMPLATE_REPO` using `TEMPLATE_VARS` +4. Writes any `EXTRA_FILES` directly into the repo tree +5. `git commit && git push --force-with-lease` +6. Opens a pull request (`proposal branch` → `main`) via `gh pr create`; skips + if a PR already exists for that branch (idempotent) +7. Emits `PR_URL=` in POST_BUILD so the Lambda can return it to CloudFormation + +**Does NOT run Terraform.** No infrastructure is touched during this build. +The only changes are committed files in a GHE branch. 
+ +**Key env vars (injected per-build):** + +| Variable | Source | +|---|---| +| `ACCOUNT_REPO` | SC form → CFN → Lambda | +| `LAYER` | SC form → CFN → Lambda | +| `REGION_DIR` | SC form → CFN → Lambda | +| `GIT_BRANCH` | SC form → CFN → Lambda | +| `TEMPLATE_REPO` | SC form → CFN → Lambda | +| `TEMPLATE_VARS` | SC form → CFN → Lambda | +| `EXTRA_FILES` | SC form → CFN → Lambda | +| `GITHUB_TOKEN` | Lambda reads from Secrets Manager `ghe-runner/github-token` | + +**Build definition:** `buildspec-proposer.yml` + +--- + +### `tf-run-executor` + +**Triggered by:** The webhook Lambda — automatically on every push to `main` in +a watched account repo. Never triggered by a user or SC product. + +**What it does:** + +1. Reads target parameters from the per-build env vars (set by the webhook Lambda + from `.sc-automation.yml`) +2. Clones the account repo at `main` (the post-merge state) +3. If `TARGET_ACCOUNT_ID` is set: calls `aws sts assume-role` to obtain + temporary credentials in the target account +4. `cd ${LAYER}/${REGION_DIR}` +5. Runs `tf-run apply` (or `tf-run plan` if `DRY_RUN=true`), optionally starting + from a specific `TAG` step +6. 
In POST_BUILD: calls the GHE commit status API to write ✅ `success` or + ❌ `failure` on the merge commit — visible directly on the PR timeline + +**This is the only build that runs Terraform and changes real infrastructure.** + +**Key env vars (injected per-build):** + +| Variable | Source | +|---|---| +| `ACCOUNT_REPO` | Webhook Lambda reads from `.sc-automation.yml` | +| `LAYER` | Webhook Lambda reads from `.sc-automation.yml` | +| `REGION_DIR` | Webhook Lambda reads from `.sc-automation.yml` | +| `TARGET_ACCOUNT_ID` | Webhook Lambda reads from `.sc-automation.yml` | +| `DRY_RUN` | Webhook Lambda reads from `.sc-automation.yml` | +| `TF_RUN_START_TAG` | Webhook Lambda reads from `.sc-automation.yml` | +| `COMMIT_SHA` | Webhook Lambda reads from the GHE push payload | +| `GITHUB_TOKEN` | Lambda reads from Secrets Manager `ghe-runner/github-token` | + +> **ADR-002 (Proposed):** A future revision will add `VAULT_AWS_ROLE` so the executor +> obtains short-lived credentials from the Vault AWS Secrets Engine instead of +> assuming `sc-automation-codebuild-role` directly. Until ADR-002 is accepted and +> implemented, `buildspec-executor.yml` uses the static `aws sts assume-role` model. + +**Build definition:** `buildspec-executor.yml` + +--- + +### Relationship between the two projects + +``` +User (SC form) + └─> tf-run-proposer ← renders files, opens PR, touches nothing in AWS + ↓ + Human reviews diff and merges PR + ↓ + GHE push webhook ─> tf-run-executor ← runs Terraform, changes infrastructure +``` + +They share the same account repo, the same GHE PAT, and the same CodeBuild service role +(Terraform `aws_iam_role.codebuild_exec`), but have separate buildspecs and trigger paths. The proposer build never has +Terraform installed; the executor build never opens GitHub PRs. + +--- + +## Step-by-Step: Propose Flow + +### 1. User fills the SC form + +The user opens the **tf-run-proposer** product in the Service Catalog console and +provides: + +- **AccountRepo** — the account repo name (e.g. 
`229685449397-csvd-dev-platform-dev-gov`) +- **Layer** — `common`, `infrastructure`, or `vpc` +- **RegionDir** — `east`, `west`, or `global` +- **GitBranch** — branch to commit to (default: `propose/sc-automation`) +- **TemplateRepo** _(optional)_ — GHE repo containing `.j2` Jinja2 template files +- **TemplateVars** _(optional)_ — JSON dict of values passed to Jinja2 +- **ExtraFiles** _(optional)_ — JSON dict of `{ "path": "content" }` written directly + +### 2. CloudFormation invokes the Lambda + +CFN creates a `Custom::TerraformPropose` resource with `action: propose`. + +### 3. Lambda validates and starts CodeBuild + +`TfRunRequest` is validated by Pydantic. Lambda starts `tf-run-proposer` with these +per-build environment variable overrides: + +``` +ACCOUNT_REPO, LAYER, REGION_DIR, GIT_BRANCH, +TEMPLATE_REPO, TEMPLATE_VARS, EXTRA_FILES, GITHUB_TOKEN +``` + +### 4. CodeBuild - INSTALL phase + +- Clones `github.e.it.census.gov/terraform/support` for version governance +- Downloads and installs `gh` CLI from S3 (version governed by `VERSION_GH`) +- Downloads and installs Census CA cert from S3 → `update-ca-trust` +- `pip3 install jinja2` +- **Does NOT install Terraform** — no infrastructure changes happen in this build + +### 5. CodeBuild - BUILD phase + +1. Rewrite git remote URLs (`ssh://` → `https://`) using the GHE PAT +2. `git clone` the account repo; `git checkout -B ${GIT_BRANCH}` +3. If `TEMPLATE_REPO` is set: + - Clone the template repo (at `TEMPLATE_SOURCE_PATH` subdirectory if set) + - Template files are **flat** — no `layer/workspace/` nesting inside the repo + - Render `.j2` files with Jinja2 (`StrictUndefined`); copy non-template files as-is + - All files land in `${LAYER}/${REGION_DIR}/` in the account repo + - Exception: dotfiles (`.sc-automation.yml`, etc.) go to the account repo root +4. If `EXTRA_FILES` is non-empty: + - Parse the JSON dict; write each `path → content` entry directly (overrides templates) +5. 
**Bootstrap workspace state files** (all file generation must be in the Proposer PR): + - For every `tf-run.data` containing a `REMOTE-STATE` directive: + - Read `../remote_state.yml` (layer-level) and append `/{workspace_name}` to the `directory` field + - Write the result as `remote_state.yml` in the workspace directory + - This mirrors exactly what `tf-run.sh`'s `REMOTE-STATE` handler does at apply time + - For every workspace directory that now has a `remote_state.yml`: + - Run `tf-directory-setup.py --link none` to generate: + - `remote_state.backend.tf` — the S3 backend block + - `remote_state.{dir}.tf.s3` — production variant + - `remote_state.{dir}.tf.local` — local-state variant + - `remote_state.{dir}.tf.none` — empty stub (activated by `--link none`) + - Symlink `remote_state.{dir}.tf → remote_state.{dir}.tf.none` (bootstrap state) + - `--link none` is the correct bootstrap choice: state does not exist yet; the Executor will re-link to `.s3` after the first successful apply + > **Why here and not in the Executor?** `tf-run.sh` generates these files at apply time via + > `REMOTE-STATE` directive and `COMMAND tf-directory-setup.py` steps. If the Executor generates + > them, they are invisible to reviewers. By running this in the Proposer, every generated file + > appears in the PR diff and is subject to human review before any infrastructure changes. +6. `git add -A && git commit -m "feat: sc-automation propose" --allow-empty` +7. `git push origin ${GIT_BRANCH} --force-with-lease` +8. `gh pr create --base main --head ${GIT_BRANCH} --title "..." --body "..."` (idempotent — skips if PR already exists) + +### 6. CodeBuild - POST_BUILD phase + +Emits `PR_URL=` to stdout for Lambda to capture. + +### 7. Lambda polls and returns + +Lambda polls CodeBuild every 20 s. 
On `SUCCEEDED`: +- Fetches PR URL via `gh pr view` output +- Sends CFN `SUCCESS` with: + - `PullRequestUrl` / `pull_request_url` + - `RepositoryUrl` / `repository_url` + - `BranchName` / `branch_name` + - `CodeBuildBuildId` + +The CFN stack completes and the output panel shows the PR URL. + +--- + +## Auto-Apply on Merge (Webhook) + +### 1. Prerequisites + +- The Proposer has run and its PR has been **reviewed and merged** to `main` +- `.sc-automation.yml` was committed by the Proposer alongside the rendered files +- The target account has the `sc-automation-codebuild-role` IAM role with a trust + policy allowing assume-role from the CodeBuild execution role in csvd-dev +- The GHE org webhook is configured once: push events → Lambda Function URL + +### 2. GHE fires the push webhook + +On merge to `main`, GHE sends a `push` event to the Lambda Function URL with +an HMAC-SHA256 signature (`X-Hub-Signature-256` header). The Lambda verifies +the signature against the `ghe-runner/webhook-secret` Secrets Manager secret. + +### 3. Lambda reads `.sc-automation.yml` and starts CodeBuild + +The Lambda (webhook handler mode): +1. Fetches `.sc-automation.yml` from the default branch of the pushed repo +2. Extracts `account_repo`, `layer`, `region_dir`, `target_account_id`, + `dry_run`, and optional `tf_run_start_tag` +3. Calls `codebuild:StartBuild` on `tf-run-executor` with override env vars: + ``` + ACCOUNT_REPO, LAYER, REGION_DIR, + TARGET_ACCOUNT_ID, TF_RUN_START_TAG, DRY_RUN, COMMIT_SHA, GITHUB_TOKEN + ``` +4. Posts a `pending` commit status to the merge commit on GHE +5. Returns HTTP 200 immediately — the webhook call is fire-and-forget + +### 4.
CodeBuild - INSTALL phase + +- Clones `github.e.it.census.gov/terraform/support` for version governance +- Downloads Terraform binary from S3 (version governed by `VERSION_TF`) +- Installs tf-run toolchain scripts from the support repo +- Creates `terraform_latest` symlink → `terraform` (account repos set `TFCOMMAND=terraform_latest` in `.tf-control`) +- Creates `/data/terraform/terraform.d/plugin-cache/` and `/data/terraform/terraform.d/providers/` (required by `.tf-control.tfrc` `plugin_cache_dir` and `filesystem_mirror` directives) +- Downloads and installs Census CA cert +- Downloads and installs `gh` CLI +- `pip3 install python-dateutil pyyaml` + +### 5. CodeBuild - BUILD phase + +1. Rewrite git remotes; `git clone` account repo; `git checkout main` +2. If `TARGET_ACCOUNT_ID` is set: `aws sts assume-role` → + `arn:${AWS::Partition}:iam::{TARGET_ACCOUNT_ID}:role/sc-automation-codebuild-role` + and export the temporary credentials +3. `cd ${LAYER}/${REGION_DIR}` +4. If `DRY_RUN=true`: `tf-run plan`; else: `tf-run apply` + (with optional `--start-tag ${TF_RUN_START_TAG}`) +5. **Commit post-apply changes back to `main`** — two categories of files change after a successful apply: + - **Symlink re-link**: `tf-run.data` typically contains `COMMAND tf-directory-setup.py --link s3` which changes the `remote_state.{dir}.tf` symlink from `.tf.none` → `.tf.s3`. This must be pushed back so future Proposer re-renders see the correct active variant. + - **Lock file update**: `tf-init` generates or updates `.terraform.lock.hcl` if provider constraints change. This must be pushed back so subsequent runs do not re-resolve providers. + - These are committed directly to `main` with `[skip ci]` in the message to prevent the webhook from re-triggering the Executor. No PR is needed: these are operational metadata, not infrastructure config changes. + - If `git diff --cached` is empty (DRY_RUN or no changes), the commit step is skipped cleanly. + +### 6. 
CodeBuild - POST_BUILD phase + +Writes a `success` or `failure` commit status to GHE on the merge commit, +linking to the CodeBuild log. Platform engineers see ✅/❌ on the commit +without checking CloudWatch directly. + +### Manual One-Off Runs + +For re-apply, dry-run, or partial runs (start from a TAG), trigger the executor +build directly: + +```bash +export AWS_DEFAULT_REGION=us-gov-west-1 +aws codebuild start-build \ + --project-name tf-run-executor \ + --environment-variables-override \ + name=ACCOUNT_REPO,value=229685449397-csvd-dev-platform-dev-gov,type=PLAINTEXT \ + name=LAYER,value=infrastructure,type=PLAINTEXT \ + name=REGION_DIR,value=west,type=PLAINTEXT \ + name=DRY_RUN,value=true,type=PLAINTEXT +``` + +No Service Catalog product is needed. + +--- + +## Key Constraints + +### Census Network Proxy + +CodeBuild runs on standard Amazon Linux 2023 inside the Census VPC. Registry +traffic (Terraform providers, Python packages) must go through the Census HTTP +proxy: + +``` +HTTPS_PROXY=http://proxy.tco.census.gov:3128 +NO_PROXY=github.e.it.census.gov,169.254.169.254,169.254.170.2 +``` + +`github.e.it.census.gov` is in `NO_PROXY` because it is accessed directly +(the proxy does not handle GHE traffic). + +### TLS — Census CA Certificate + +The internal GHE host uses a Census-signed TLS certificate. The Census CA cert +must be installed into the OS trust store (`update-ca-trust`) before any `git`, +`gh`, or `pip` commands that touch GHE or Census-mirrored registries. + +### Terraform Binary from S3 + +`registry.terraform.io/hashicorp` is blocked on the Census network. The Terraform +binary is pre-staged in S3 (`csvd-packer-pipeline-assets`) and downloaded during +the INSTALL phase. The version is governed by `VERSION_TF` in the +`github.e.it.census.gov/terraform/support` repo. + +### GitHub Provider: CSVD Module Only + +The executor uses the `CSVD/terraform-github-repo` internal Terraform module. 
+The public `HappyPathway/terraform-github-repo` is pinned to `github ~> 6.0` +which conflicts with the `>= 6.11.0` constraint used here. Do not switch modules. + +### Cross-Account Role + +For the executor to apply Terraform in an account other than csvd-dev, the target +account must have: + +```hcl +resource "aws_iam_role" "sc_automation_codebuild" { + name = "sc-automation-codebuild-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { AWS = "arn:aws-us-gov:iam::229685449397:role/tf-run-executor-codebuild" } + Action = "sts:AssumeRole" + }] + }) +} +``` + +The executor role in csvd-dev has `sts:AssumeRole` on +`arn:*:iam::*:role/sc-automation-codebuild-role`. + +### Proposer is Idempotent + +The proposer uses `git push --force-with-lease` and `gh pr create` with a check +for an existing open PR. Re-provisioning the Proposer product will update the +branch and PR rather than creating a duplicate. + +--- + +## Parameter Naming Convention + +The CFN product templates pass all parameters in `snake_case` directly to the +Lambda. The Lambda Pydantic model uses `snake_case` field names. Passing +`snake_case` from CFN avoids the PascalCase→snake_case normalizer, which +mishandles acronyms (`AWSAccountId` → `a_w_s_account_id`). 
+ +--- + +## Files Reference + +| File | Purpose | +|---|---| +| `buildspec-proposer.yml` | CodeBuild build definition for the proposer project | +| `buildspec-executor.yml` | CodeBuild build definition for the executor project | +| `lambda/app.py` | Lambda entry point: validates inputs, routes to proposer or executor | +| `deploy/codebuild.tf` | Terraform: `aws_codebuild_project.tf_run_proposer` + `tf_run_executor` | +| `deploy/lambda.tf` | Terraform: Lambda function with `PROPOSER_PROJECT_NAME` + `EXECUTOR_PROJECT_NAME` | +| `deploy/iam.tf` | Terraform: IAM roles for Lambda, CodeBuild (with `sts:AssumeRole`), SC launch | +| `deploy/service_catalog.tf` | Terraform: Portfolio, single Proposer product, launch constraint | +| `deploy/webhook.tf` | Terraform: Lambda Function URL, HMAC secret, GHE webhook IAM | +| `service-catalog/proposer-template.yaml` | CFN template for the Propose product | diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..9052243 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,145 @@ +# sc-lambda-ghactions Documentation + +This directory contains the design, operating model, and rollout guidance for +`sc-lambda-ghactions` — the centralized Lambda + CodeBuild system that provisions +and manages Terraform-backed account repo changes through AWS Service Catalog. + +## What This System Does + +At a high level, the platform supports this workflow: + +1. A user launches a Service Catalog product +2. CloudFormation invokes a centralized Lambda in `csvd-dev` +3. The Lambda validates inputs and starts a CodeBuild build +4. CodeBuild clones a template repo, renders Terraform/HCL/YAML content, and opens a PR +5. After merge, the executor path can run Terraform against the target workload +6. 
CSVD can also operate the full managed fleet centrally + +## How to Read This Documentation + +This doc set currently contains both: + +- **Current or near-term implementation guidance** for the CodeBuild-based rollout +- **Proposed design evolution** for auto-apply, generalized product types, and fleet-scale operations + +Because of that, the best entry point depends on what you need. + +## Recommended Reading Paths + +### 1. "I need the quickest overview" + +Start with: + +- [HOW-IT-WORKS.md](HOW-IT-WORKS.md) — end-to-end explanation of the proposer/executor model, the main infrastructure components, and the current CodeBuild execution flow +- [workflow-flowcharts.md](workflow-flowcharts.md) — visual walkthrough of provisioning, apply-on-merge, and fleet update flows + +### 2. "I need to understand the target generalized architecture" + +Start with: + +- [generalized-terraform-product-architecture.md](generalized-terraform-product-architecture.md) — explains how the system expands from EKS-only into a reusable pattern for any Terraform workload +- [template-management.md](template-management.md) — explains how template repos, Jinja2 rendering, `.sc-automation.yml`, and repo injection work +- [repo-vars-and-secrets.md](repo-vars-and-secrets.md) — explains how SSM and Secrets Manager values are injected into CodeBuild builds + +### 3. "I need to onboard a new Service Catalog product" + +Read in this order: + +- [generalized-terraform-product-architecture.md](generalized-terraform-product-architecture.md) — required moving parts for a new `product_type` +- [template-management.md](template-management.md) — template repo structure and rendering expectations +- [service-catalog-census-integration.md](service-catalog-census-integration.md) — how to register the product in `terraform-service-catalog-census` +- [repo-vars-and-secrets.md](repo-vars-and-secrets.md) — how product-scoped configuration and secrets reach the build + +### 4. 
"I need to understand operations and governance at scale" + +Start with: + +- [fleet-governance-at-scale.md](fleet-governance-at-scale.md) — the `terraform-sc-fleet` operating model, workload inventory structure, maintenance windows, and governance controls +- [cross-account-visibility.md](cross-account-visibility.md) — hub-and-spoke IAM model and options for centralized visibility across accounts +- [workflow-flowcharts.md](workflow-flowcharts.md) — visual summary of fleet-wide operations + +### 5. "I need to understand the webhook auto-apply proposal" + +Read: + +- [decisions/001-webhook-auto-apply.md](decisions/001-webhook-auto-apply.md) — ADR for triggering executor builds automatically from GitHub Enterprise webhook events +- [workflow-flowcharts.md](workflow-flowcharts.md) — flow-level view of the apply-on-merge path +- [template-management.md](template-management.md) — `.sc-automation.yml` schema and executor behavior + +## Document Guide + +### Core system overview + +- [HOW-IT-WORKS.md](HOW-IT-WORKS.md) + - Best for understanding the end-to-end proposer/executor model + - Covers the centralized Lambda, CodeBuild projects, SC products, and step-by-step runtime behavior + - Use this as the main operational baseline + +- [workflow-flowcharts.md](workflow-flowcharts.md) + - Best for stakeholder demos and quick architectural orientation + - Includes flows for provisioning, apply-on-merge, and fleet-wide updates + +### Generalization and product onboarding + +- [generalized-terraform-product-architecture.md](generalized-terraform-product-architecture.md) + - Explains how the platform generalizes to any Terraform workload + - Defines the core onboarding units: template repo, Jinja2 templates, Pydantic model, CFN product template, census registration + +- [template-management.md](template-management.md) + - Canonical guide for template repo usage + - Covers full-repo vs subdirectory templates, Jinja2 rendering, `.sc-automation.yml`, proposer behavior, and executor 
re-rendering into existing account repos + +- [repo-vars-and-secrets.md](repo-vars-and-secrets.md) + - Canonical guide for runtime config injection + - Covers AWS Parameter Store layout, Secrets Manager layout, Lambda IAM, and CodeBuild `environmentVariablesOverride` + +- [service-catalog-census-integration.md](service-catalog-census-integration.md) + - Canonical guide for enterprise product registration + - Covers central vs StackSet vs census-managed resources, launch roles, portfolio/product YAML, and rollout into `terraform-service-catalog-census` + +### Operations, governance, and visibility + +- [fleet-governance-at-scale.md](fleet-governance-at-scale.md) + - Defines the `terraform-sc-fleet` model for operating many workloads across many repos + - Covers workload entry files, account repo layout, update scripts, maintenance windows, CODEOWNERS, and branch protection + +- [cross-account-visibility.md](cross-account-visibility.md) + - Covers read-only access patterns for viewing managed resources across accounts + - Describes the hub-and-spoke IAM role chain and Resource Explorer-first UI approach + +### Architecture decisions + +- [decisions/001-webhook-auto-apply.md](decisions/001-webhook-auto-apply.md) + - ADR for the proposed webhook-triggered executor path + - Useful for understanding why the manual post-merge step should disappear and how `.sc-automation.yml` participates in the design + +## Suggested Canonical Interpretation + +Where multiple docs overlap, use this interpretation: + +- [HOW-IT-WORKS.md](HOW-IT-WORKS.md) is the best **runtime/system overview** +- [template-management.md](template-management.md) is the best **template repo and account repo injection** reference +- [repo-vars-and-secrets.md](repo-vars-and-secrets.md) is the best **config/secrets injection** reference +- [service-catalog-census-integration.md](service-catalog-census-integration.md) is the best **enterprise rollout** reference +- 
[fleet-governance-at-scale.md](fleet-governance-at-scale.md) is the best **day-2 fleet operations** reference +- [decisions/001-webhook-auto-apply.md](decisions/001-webhook-auto-apply.md) is the best **design rationale** for auto-apply on merge + +## Current Gaps and Notes + +This doc set is now broad enough to explain: + +- how template repos are leveraged +- how rendered content is injected into new and existing account repos +- how CodeBuild receives configuration and secrets +- how new products are registered in Census +- how CSVD governs and operates the resulting fleet + +A few documents are still explicitly marked **Proposed** or **Draft**, so treat them as design intent unless and until the code and deployment match them. + +## If You Only Read Three Docs + +Read these first: + +1. [HOW-IT-WORKS.md](HOW-IT-WORKS.md) +2. [template-management.md](template-management.md) +3. [service-catalog-census-integration.md](service-catalog-census-integration.md) diff --git a/docs/account-bootstrap-analysis.md b/docs/account-bootstrap-analysis.md new file mode 100644 index 0000000..8a1a214 --- /dev/null +++ b/docs/account-bootstrap-analysis.md @@ -0,0 +1,856 @@ +# Account Repo Analysis & Bootstrap Automation Proposal + +**Date:** 2026-05-22 +**Status:** Proposed +**Author:** AI analysis of `account-repos` workspace +**Audience:** Platform Engineering / SCT-Engineering + +--- + +## 1. Executive Summary + +This document records a systematic analysis of the ~100 AWS account repositories +cloned under `~/git/account-repos` and maps the common structural elements into +a proposed series of **sc-lambda-ghactions workspaces** (Service Catalog products +backed by template repos) that can automate the full account bootstrap lifecycle. + +The analysis found that every account repo — regardless of account type, partition +(GovCloud vs commercial EW), or team ownership — follows a strictly ordered, +repeatable sequence of Terraform workspaces. 
The content of each workspace is +highly parameterized but structurally identical across accounts. This makes the +entire bootstrap sequence a strong candidate for sc-lambda-ghactions automation. + +A key finding is that the **git-secret / GPG credential system** is the primary +architectural blocker for headless automation of the `common/` layer (IAM foundation). +ADR-002's Vault AWS Secrets Engine, extended to cover Vault KV for provider +credentials, directly unblocks this. Section 6 covers this in detail. + +Where full automation is not possible, this document states so explicitly and +explains why. + +--- + +## 2. Account Repo Structure — Universal Elements + +Every account directory under `account-repos/` follows the top-level layout +below. Entries annotated as conditional (for example `applications/`, +`edl-automation/`, and the legacy `init/` subdirectories) appear only where +noted; no account was missing any of the other items. + +### 2.1 Top-Level Layout + +``` +{account-id}-{account-alias}/ +├── applications/ # app-workspace scaffold (most accounts) +│ └── structure/ # mirrors common/ infrastructure/ vpc/ as templates +├── common/ # IAM: policies, roles, groups, users, SAML, LDAP +├── credentials.d/ # per-region AWS credential .tf files +├── edl-automation/ # EDL-specific automation (EDL accounts only) +├── includes.d/ # shared variable definitions (tags) +├── infrastructure/ # TF state backend, S3 logs, CloudTrail, Config +├── init/ # git repo setup; git-secret/gpg-setup present in legacy repos only +│ ├── git-secret/ # ⚠️ legacy — team-member GPG public keys; eliminated with Vault +│ ├── git-setup/ # IaC to create/configure the GitHub repo +│ └── gpg-setup/ # ⚠️ legacy — account-specific GPG key generation; eliminated with Vault +├── provider_configs.d/ # provider secrets: GitHub, LDAP, Infoblox, DNS +├── variables.d/ # variables.common.tf, variables.tfstate.tf, per-region .tfvars +├── vpc/ # VPC resources per region +├── INF.SETUP.md # step-by-step human bootstrap guide +├── README.md +├── TOP # high-level apply-phase sequence (non-apps repos) +├── outputs.common.tf +├── region.tf # locals {
region = var.region } +└── tf-run.data # orchestration phases (TAG/COMMENT directives) +``` + +### 2.2 Within `common/` + +``` +common/ +├── INF.account-info.tf # module "account_settings" — alias, password policy +├── INF.general-policies.tf # managed + custom IAM policies +├── INF.saml.tf # IAM SAML provider +├── INF.ldap-ou-create.tf # base LDAP OU for the account +├── INF.role.inf-cloud-admin.tf +├── INF.group.inf-cloud-admin.tf +├── INF.role.inf-network-admin.tf +├── INF.role.inf-flowlogs.tf +├── INF.group.inf-ip-restriction.tf +├── INF.remote-roles.tf # additional SAML roles +├── INF.admin-user.{username}.tf # one file per admin user (variable count) +├── inf-cloud-admin.users.tf +├── INF.service.cloudforms.tf +├── sso/ # per-SSO-permission-set subdirectories +├── remote_state.backend.tf # symlink (→ .s3 or .none depending on state) +├── remote_state.common.tf +├── outputs.common.tf +├── region.tf +└── versions.tf +``` + +### 2.3 Within `infrastructure/` + +``` +infrastructure/ +├── INF.tfstate.tf # S3 bucket + DynamoDB for TF state +├── east/ # per-region workspace +│ ├── INF.cloudtrail.tf +│ ├── INF.config.tf +│ ├── INF.s3-access-logs.tf +│ ├── INF.s3-flow-logs.tf +│ ├── INF.object-logs.tf +│ ├── INF.dynamic-route53.tf +│ ├── INF.ses-domain.tf +│ ├── INF.preload-kms.tf +│ ├── INF.splunk-description.tf +│ ├── locals.tf +│ ├── region.tf +│ ├── remote_state.backend.tf +│ └── versions.tf +└── west/ # same structure as east/ +``` + +### 2.4 Provider Configurations (`provider_configs.d/`) + +Present in every account, except the Infoblox files, which exist only in EW accounts: + +| File | Provider | +|---|---| +| `provider.github.tf` | GitHub Enterprise | +| `provider.github.variables.tf` | GitHub provider variables | +| `provider.github.auto.tfvars.secret` | GitHub PAT (git-secret encrypted) | +| `provider.ldap.tf` | LDAP (legacy) | +| `provider.ldap.variables.tf` | LDAP variables | +| `provider.ldap.auto.tfvars.secret` | LDAP bind password (git-secret encrypted) | +| `provider.ldap_new.tf` | LDAP (new
provider) | +| `provider.ldap_new.variables.tf` | LDAP new variables | +| `provider.ldap_new.auto.tfvars.secret` | LDAP new bind password | +| `provider.dns.tf` | DNS (Infoblox/Route53) | +| `provider.infoblox.tf` | Infoblox (EW accounts only) | +| `provider.infoblox.variables.tf` | Infoblox variables (EW only) | +| `provider.infoblox.auto.tfvars.secret` | Infoblox creds (EW only) | +| `tf-run.data` | Phase orchestration for this layer | + +### 2.5 Variable Files (`variables.d/`) + +| File | Purpose | +|---|---| +| `variables.common.tf` | Variable declarations for all common inputs | +| `variables.tfstate.tf` | Variable declarations for state backend | +| `{region}.variables.common.auto.tfvars` | Per-region values (account_id, alias, region) | + +--- + +## 3. Bootstrap Phase Sequence + +All account repos share the same bootstrap execution order, controlled by `TOP` +and `tf-run.data`. The sequence is: + +``` +Phase 0: MANUAL — AWS account creation, initial bootstrap IAM user +Phase 1: init/ — GitHub repo creation (GPG/git-secret eliminated with Vault) +Phase 2: provider_configs.d/ — provider secret initialization +Phase 3: infrastructure/ (partial) — TF state backend (S3 + DynamoDB) +Phase 4: infrastructure/{region}/ — S3 access log buckets per region +Phase 5: common/ — IAM foundation (see ordered sub-steps below) +Phase 6: infrastructure/ (finalize) — flow log buckets, object logging, etc. 
+Phase 7: infrastructure/{region}/ — CloudTrail, Config, SES, Route53 per region +Phase 8: vpc/ — VPC per region +Phase 9: applications/structure/ — (if applicable) app workspace scaffold +``` + +### Phase 5 Sub-Steps (ordered dependencies) + +``` +5.1 general — managed_policies, custom_policies, custom_policy_documents +5.2 account_settings — account alias + IAM password policy +5.3 saml — IAM SAML provider +5.4 ldap_ou — base LDAP OU (prerequisite for all LDAP objects) +5.5 role inf-cloud-admin + group inf-cloud-admin (apply twice: create file, then LDAP object) +5.6 role inf-network-admin + group inf-network-admin +5.7 role inf-flowlogs +5.8 group inf-ip-restriction +5.9 splunk_user +5.10 service_cloudforms +5.11 admin user accounts (one tf file per user; parallel-safe within the group) +5.12 other SAML roles (remote roles — apply twice each) +``` + +--- + +## 4. Account Type Variations + +### 4.1 Partition + +The authoritative discriminator is the `aws_environment` variable in each account's `variables.d/*.variables.common.auto.tfvars`: + +| `aws_environment` value | Meaning | `credentials.d/` contents | Primary region | +|---|---|---|---| +| `"gov"` | AWS GovCloud (US) | `us-gov-east-1.credentials.tf`, `us-gov-west-1.credentials.tf` (2 files) | `us-gov-east-1` | +| `"ew"` | East-West commercial network zone | 17 commercial-region files (one account, `ent-ew-sectools-prod`, has 30 as newer regions were added) | `us-east-1` | + +**The `-gov` vs `-ew` name suffix does not reliably map to partition alone.** Examples: +- `ent-gov-operations-prod` → `aws_environment = "gov"` (GovCloud despite `-prod` suffix) +- `csvd-dev-ew` → has commercial-region credentials but is the GovCloud-linked commercial account for `csvd-dev-gov`, not a standalone commercial workload account +- `do2-prod` (no `-ew` suffix) → `aws_environment = "ew"` + +The `-ew` suffix, when present alongside a corresponding `-gov` account, designates the **GovCloud linked commercial account** — the 
pairing required by AWS for every GovCloud account. These accounts carry the standard 17-region commercial credential set but serve a different operational role than standalone commercial workload accounts. + +**Consequence for bootstrap automation:** `aws_environment` is the correct input field to use, not a derived partition value from the alias name. It must be supplied explicitly in the SC form. + +**Infoblox provider:** present in `provider_configs.d/` for `aws_environment = "ew"` accounts only. + +### 4.2 Program/Team + +| Pattern | Directories added | Notes | +|---|---|---| +| `edl-*` | `edl-automation/` | EDL-specific automation harness | +| `ent-gov-network-*` | `vpc-shared/` instead of `applications/` | Network accounts share VPC | +| `_apps-{stack}` | Separate repo per application stack; `SUBMODULE` file replaces `TOP` | Each stack is its own GitHub repo | + +### 4.3 `_apps-*` Repos + +Accounts with application stacks have companion repos following the naming +convention `{account-id}-{alias}_apps-{stack-name}`. These share the same +`common/`, `credentials.d/`, `infrastructure/`, `provider_configs.d/` scaffolding +as the base account repo but contain a `SUBMODULE` orchestration file rather than +`TOP`. They are registered as Git submodules in the base account repo. + +--- + +## 5. Key Inputs Required Per New Account + +These are the minimum parameterized values needed to generate a complete account +repo from scratch: + +| Input | Examples | Notes | +|---|---|---| +| `account_id` | `001476713248` | 12-digit AWS account number | +| `account_alias` | `edl-core-dev-gov` | Used in all resource naming | +| `aws_environment` | `gov` or `ew` | Sourced directly from the account's tfvars; controls region set + Infoblox presence. Do not derive from the alias name.
| +| `primary_region` | `us-gov-east-1` or `us-east-1` | Drives first-region workspace | +| `secondary_region` | `us-gov-west-1` or `us-west-2` | Drives second-region workspace | +| `program` | `edl`, `ent`, `ma`, `lab`, etc. | Controls edl-automation inclusion | +| `environment` | `dev`, `nonprod`, `prod`, `common` | Tags and policy scoping | +| `admin_users` | `[badra001, dwara001, ...]` | Generates `INF.admin-user.*.tf` files | +| `github_org` | `SCT-Engineering` or specific org | For `init/git-setup/` | +| `github_repo_name` | `{account_id}-{alias}` | Usually derived from above | +| `tfstate_bucket` | `inf-tfstate-{account_id}` | S3 bucket for remote state | +| `app_stacks` | `[]` or `[adsd-eks, tco-imds]` | Whether to create `_apps-*` repos | +| `include_edl_automation` | `true/false` | EDL accounts only | +| `include_vpc_shared` | `true/false` | Network accounts only | + +--- + +## 6. GPG Keys and git-secret — What They Actually Protect + +Understanding the GPG/git-secret system precisely is critical to assessing what +can be automated and what ADR-002 (Vault) can eliminate. + +### 6.1 Two Distinct GPG Key Systems + +There are **two separate GPG key concepts** in every account repo, serving +different purposes: + +#### Account-specific GPG keypair (`init/gpg-setup/`) + +`tf apply` in `init/gpg-setup/` generates a unique GPG keypair for the account +(e.g. `tf-001476713248-edl-core-dev-gov`). Its purpose, from `INF.gpg-setup.md`: + +> "This key is used for encrypting specific resource values, such as **IAM +> passwords** or **IAM access keys**." + +It is **not** used to protect provider credentials. It encrypts the IAM console +passwords and AWS access keys that Terraform generates for admin users (module +`admin_{username}`) so those sensitive values can be committed to the repo and +distributed out-of-band without appearing in plaintext. 
The key artifacts are: + +| File | Contents | How stored | +|---|---|---| +| `tf-{account}.gpg.b64` | Public key (base64) | Plaintext in git; symlinked at `TOP/init/tf-gpg-key.b64` | +| `tf-{account}.gpg.asc` | Public key (ascii-armored) | Plaintext in git | +| `tf-{account}.gpg.secret-key.secret` | **Private key** | Encrypted by git-secret | + +The private key is itself protected by git-secret — you need a team member's +personal GPG key to retrieve it. + +#### Team member GPG public keys (`init/git-secret/*.gpg.asc`) + +One file per engineer, sourced from `terraform/support/keys/gpg-public-keys`. +These are the **recipients** for git-secret's multi-key encryption. Anyone whose +key is in this directory can run `git secret reveal` to decrypt the protected +files in the repo. Adding or removing an engineer requires: importing their key, +running `git-secret tell $EMAIL`, running `git-secret hide` (re-encrypts all +files for all current recipients), and committing the result. + +### 6.2 What git-secret Actually Encrypts + +The `.secret` extension marks a git-secret encrypted file. The plaintext +counterpart (without `.secret`) is gitignored. 
Encrypted files found across all +account repos: + +| Encrypted file | Plaintext contains | Used by | +|---|---|---| +| `provider_configs.d/provider.github.auto.tfvars.secret` | `github_token`, `github_org`, `github_url` | All TF workspaces using GitHub provider | +| `provider_configs.d/provider.ldap.auto.tfvars.secret` | `ldap_user`, `ldap_password` | All TF workspaces creating LDAP objects | +| `provider_configs.d/provider.ldap_new.auto.tfvars.secret` | `ldap_user`, `ldap_password` (new provider) | Same | +| `provider_configs.d/provider.infoblox.auto.tfvars.secret` | Infoblox API credentials | EW accounts only | +| `init/gpg-setup/tf-{account}.gpg.secret-key.secret` | Account GPG private key | Local operators who need to decrypt IAM passwords | +| `vpc/{region}/.../access_key.yml.secret` | IAM access keys for service accounts | Where AWS access keys are stored in VPC workloads | + +### 6.3 Why This Blocks CodeBuild Automation Today + +`git secret reveal` requires a GPG private key present in the local keychain. +CodeBuild has no such key — it was never designed to be a git-secret recipient. +As a result: + +- **Any TF workspace that sources the GitHub or LDAP provider** (i.e., `common/`, + any SSO workspace, any workspace in `provider_configs.d/`) **cannot be run by + the executor in its current form.** The `.auto.tfvars.secret` files would not + be decrypted, the provider would have empty credentials, and the apply would fail. + +- The executor **can** run workspaces that use only the AWS provider (e.g., + `infrastructure/`, `vpc/`) because those rely on STS credentials injected via + environment variables, not on git-secret-managed files. + +This is the core automation gap. It is not just a manual step — it is an +architectural incompatibility between git-secret and headless automation. 
+ +### 6.4 What Vault Can Replace + +ADR-002 is framed around AWS credential issuance (replacing static +`sts:AssumeRole`), but the Vault KV Secrets Engine can trivially extend to +replace git-secret for all provider credentials as well. The mapping is direct: + +| Today (git-secret) | With Vault KV | +|---|---| +| `provider.github.auto.tfvars.secret` → `git secret reveal` | `vault kv get secret/accounts/{alias}/github` → write `.auto.tfvars` at build time | +| `provider.ldap.auto.tfvars.secret` → `git secret reveal` | `vault kv get secret/accounts/{alias}/ldap` → write `.auto.tfvars` at build time | +| `provider.infoblox.auto.tfvars.secret` → `git secret reveal` | `vault kv get secret/accounts/{alias}/infoblox` → write `.auto.tfvars` at build time | +| Account GPG private key (encrypts IAM passwords in repo) | **Eliminated** — admin-user module writes passwords directly to `vault kv put secret/accounts/{alias}/users/{username}`; no GPG, no `.secret` files | + +The executor buildspec adds a `vault kv get` call per needed provider before +running `tf-init`/`tf-run`, injecting the plaintext credentials as temporary +files that are never committed. This replaces the entire `git secret reveal` +ceremony. **The `.gitsecret/` directory, `init/gpg-setup/`, and `init/git-secret/` +are eliminated from all new account repos** — they are artifacts of the old system +and have no role in Vault-managed accounts. + +#### CodeBuild authentication to Vault + +CodeBuild authenticates to Vault using the **AWS auth method** — no credentials +are injected, stored, or rotated. CodeBuild proves its identity via +`sts:GetCallerIdentity`; Vault verifies the IAM role ARN directly with AWS. 
+ +The proposer's `tf apply` provisions the Vault auth role for each new account: + +```hcl +resource "vault_aws_auth_backend_role" "codebuild" { + backend = "aws" + role = "sc-automation-${var.account_id}" + auth_type = "iam" + bound_iam_principal_arns = [ + "arn:${var.partition}:iam::${var.account_id}:role/sc-automation-codebuild-role" + ] + token_policies = ["sc-automation-${var.account_id}"] + token_ttl = 900 +} +``` + +Then in the executor buildspec: + +```bash +vault login -method=aws -no-print role=sc-automation-${ACCOUNT_ID} +GITHUB_TOKEN=$(vault kv get -field=github_token secret/accounts/${ACCOUNT_ALIAS}/github) +``` + +No AppRole, no Secret IDs, nothing in Secrets Manager. The IAM role *is* the credential. + +### 6.5 What Vault Cannot Eliminate + +With Vault managing all secrets and CodeBuild authenticating via the AWS auth +method, item 1 below is eliminated and only item 2 survives as a manual step: + +1. **Account-specific GPG keypair generation (M4): Eliminated.** The + `init/gpg-setup/` directory and the entire `.gitsecret/` tree are dropped from + new account repos. The `admin-user` module delivers IAM passwords directly to + `vault kv put secret/accounts/{alias}/users/{username}` rather than to + GPG-encrypted files. Operators retrieve passwords via `vault kv get` using + their own IAM credentials — no GPG toolchain required at any point. + +2. **Bootstrapping Vault itself with the first account credential:** The very + first time a new account is bootstrapped, Vault does not yet have that + account's LDAP password or GitHub PAT. An operator must do a one-time + `vault kv put` for each credential. This is a single 3-command operation per + credential per account — far simpler than the full git-secret ceremony — and + can be performed by a central platform team without any access to the target + account's GPG keychain. 
+ +### 6.6 Vault Scope Expansion Summary + +If ADR-002 is implemented and extended to cover provider credentials via Vault KV: + +| Manual step | Current status | With Vault | +|---|---|---| +| M4 — GPG keypair generation | Required per account | **Eliminated** — `init/gpg-setup/` and `.gitsecret/` removed from new repos; IAM passwords go to Vault KV | +| M5 — Team member GPG key collection | Required per account per new team member | **Eliminated** — no git-secret recipients; operators access secrets via IAM + Vault policy | +| M6 — `*.auto.tfvars.secret` encryption | Required per credential per account | Replaced by one `vault kv put` per credential (one-time, central team) | +| M10 — LDAP objects in `common/` | Currently blocked for CodeBuild | Unblocked — executor reads LDAP credentials from Vault at build time | + +The practical effect: implementing Vault KV for provider credentials **unlocks +full automation of `common/`** — the largest and most complex bootstrap workspace +— which is currently the hardest manual phase. + +--- + +## 7. Proposed sc-lambda-ghactions Workspace Series + +The following describes each proposed Service Catalog product / template repo +needed to automate the bootstrap sequence. They map directly onto the phase +sequence in section 3. + +Each workspace corresponds to one sc-lambda-ghactions **Proposer invocation** +(one PR, one executor run). They are ordered by dependency. + +--- + +### Workspace 0: `bootstrap-account-repo` + +**Template repo:** `template-bootstrap-account-repo` +**Layer:** `init` (special — not a standard TF layer; this creates the repo itself) +**Purpose:** Create the GitHub repo, set branch protections, configure teams, +write the top-level scaffold files (`TOP`, `tf-run.data`, `region.tf`, +`outputs.common.tf`, `README.md`, `INF.SETUP.md`). 
+ +**Inputs:** +- `account_id`, `account_alias`, `aws_environment`, `program`, `environment` +- `primary_region`, `secondary_region` +- `github_org`, `github_teams` (list with permissions) +- `admin_users` (for `INF.SETUP.md` generation) + +**Rendered outputs:** +- `init/git-setup/INF.repo-setup.tf` — GitHub repo resource + team membership +- `TOP` — apply phase sequence file +- `tf-run.data` — orchestration file +- `README.md`, `INF.SETUP.md` — human documentation + +**What the executor does:** `tf apply` in `init/git-setup/` creates the GitHub +repo via the GitHub Terraform provider. + +> ⚠️ **Cannot be automated:** GPG key generation for `init/gpg-setup/` requires +> a human with GnuPG to generate the account-specific keypair, encrypt it, and +> commit the `.gpg.asc` and `.gpg.b64` artifacts. The private key must be +> distributed out-of-band to account operators. This step remains manual. + +> ⚠️ **Cannot be automated:** Each team member's GPG public key (`init/git-secret/ +> {username}.gpg.asc`) must be provided by the individual. The proposer can +> render the `git-secret` setup script, but the keys themselves must come from +> a known-good source (e.g., a team keyring registry). If such a registry exists +> in Secrets Manager or SSM, this can be automated; otherwise it remains manual. + +--- + +### Workspace 1: `bootstrap-provider-configs` + +**Template repo:** `template-provider-configs` +**Layer:** `provider_configs.d` +**Region dir:** `global` (no region scoping for this layer) +**Purpose:** Render all provider configuration `.tf` files and stub out the +encrypted secret placeholders. Sets up the GitHub, LDAP, LDAP-new, DNS, and +(if `aws_environment = "ew"`) Infoblox providers. 
+ +**Inputs:** `account_id`, `account_alias`, `aws_environment` + +**Rendered outputs:** +- `provider_configs.d/provider.github.tf` +- `provider_configs.d/provider.ldap.tf`, `provider.ldap_new.tf` +- `provider_configs.d/provider.dns.tf` +- `provider_configs.d/provider.infoblox.tf` (EW only) +- `provider_configs.d/tf-run.data` +- All `variables.tf` counterparts + +> ⚠️ **Cannot be automated without Vault KV (see section 6):** The `*.auto.tfvars.secret` +> files (GitHub PAT, LDAP bind password, Infoblox credentials) are git-secret encrypted +> files that CodeBuild cannot decrypt. With ADR-002 extended to Vault KV, the executor +> would instead call `vault kv get secret/accounts/{alias}/{provider}` at build time and +> write the credentials as temporary files before `tf-init` — replacing git-secret entirely. +> Until that is implemented, an operator must manually `vault kv put` (or `git-secret hide`) +> the credentials once per account. This is also what gates full automation of `common/` +> (Workspace 6), which uses both the LDAP and GitHub providers. + +--- + +### Workspace 2: `bootstrap-credentials` + +**Template repo:** `template-credentials` +**Layer:** `credentials.d` +**Region dir:** `global` +**Purpose:** Generate the per-region AWS credential provider `.tf` files. + +**Inputs:** `account_id`, `account_alias`, `aws_environment` (`gov` or `ew`) +- `gov`: generates `us-gov-east-1.credentials.tf`, `us-gov-west-1.credentials.tf` +- `ew`: generates all 17 (or 30 for newer builds) commercial-region credential files + +**Rendered outputs:** `credentials.d/{region}.credentials.tf` for each region + +**What the executor does:** No TF apply needed; this is a file generation step. +The executor can be set to `DRY_RUN=true` for this workspace — the files just +need to be committed. 
+ +--- + +### Workspace 3: `bootstrap-variables` + +**Template repo:** `template-variables` +**Layer:** `variables.d` +**Region dir:** `global` +**Purpose:** Generate `variables.common.tf`, `variables.tfstate.tf`, and per-region +`{region}.variables.common.auto.tfvars` files. + +**Inputs:** `account_id`, `account_alias`, `aws_environment`, `primary_region`, +`secondary_region`, `environment`, `program`, `tfstate_bucket` + +**Rendered outputs:** +- `variables.d/variables.common.tf` +- `variables.d/variables.tfstate.tf` +- `variables.d/{primary_region}.variables.common.auto.tfvars` +- `variables.d/{secondary_region}.variables.common.auto.tfvars` +- `includes.d/variables.account_tags.tf` +- `includes.d/variables.application_tags.tf` +- `includes.d/variables.infrastructure_tags.tf` + +**What the executor does:** `DRY_RUN=true` — file generation only. + +--- + +### Workspace 4: `bootstrap-infrastructure-tfstate` + +**Template repo:** `template-infrastructure-tfstate` +**Layer:** `infrastructure` +**Region dir:** `global` +**Purpose:** Bootstrap the Terraform state backend: S3 bucket + DynamoDB table. +This is the first workspace that touches real AWS infrastructure. + +**Prerequisite:** AWS bootstrap IAM user created manually (see section 8, +manual step M2). This executor run assumes the bootstrap user's credentials. + +**Inputs:** `account_id`, `account_alias`, `aws_environment`, `primary_region`, +`tfstate_bucket` + +**Rendered outputs:** +- `infrastructure/INF.tfstate.tf` +- `infrastructure/remote_state.backend.tf` (→ `.none` initially; executor re-links to `.s3`) +- `infrastructure/tf-run.data` +- `infrastructure/region.tf`, `versions.tf` + +**What the executor does:** +1. `tf apply -target=module.tfstate` — creates S3 + DynamoDB +2. 
Re-links `remote_state.backend.tf` → `.s3` (commits back to `main`) + +> ⚠️ **Cannot be automated:** Initial application of this workspace requires the +> `bootstrap` IAM user's credentials, which exist only in AWS Console and must +> be manually provided to CodeBuild (e.g., via SSM SecureString or injected as +> build-time overrides). One approach: after the account is created, an operator +> stores the bootstrap credentials in Secrets Manager under a known path, the +> executor reads them, applies, then the `common/` phase rotates to real users. +> This is architecturally possible but requires an agreed credential handoff +> convention not yet established. + +--- + +### Workspace 5: `bootstrap-infrastructure-regional-logs` + +**Template repo:** `template-infrastructure-regional-logs` +**Layer:** `infrastructure` +**Region dir:** `{primary_region}` then `{secondary_region}` (two separate runs) +**Purpose:** Create the `inf-logs-{account}-{region}` S3 access log bucket in +each region. Required before any ALB, S3 bucket, or object-log resources can be +configured. + +**Inputs:** `account_id`, `account_alias`, `aws_environment`, target region + +**Rendered outputs:** +- `infrastructure/{region}/INF.s3-access-logs.tf` +- `infrastructure/{region}/region.tf`, `remote_state.backend.tf`, `versions.tf` + +**What the executor does:** `tf apply -target=module.logs` per region. + +--- + +### Workspace 6: `bootstrap-common` + +**Template repo:** `template-common` +**Layer:** `common` +**Region dir:** `global` (common layer has no per-region split) +**Purpose:** Render and apply all IAM foundation resources in dependency order. + +This is the largest and most complex bootstrap workspace. Because of the +ordered dependency chain within `common/` (see section 3, Phase 5 sub-steps), +the executor must respect `TF_RUN_START_TAG` to resume from a given step. 
+ +**Inputs:** +- `account_id`, `account_alias`, `aws_environment`, `environment`, `program` +- `admin_users` list (one `INF.admin-user.{username}.tf` per entry) +- `saml_provider_metadata` (SAML XML metadata from identity provider) +- `ldap_base_dn`, `ldap_account_ou` + +**Rendered outputs:** +- `common/INF.account-info.tf` +- `common/INF.general-policies.tf` +- `common/INF.saml.tf` +- `common/INF.ldap-ou-create.tf` +- `common/INF.role.inf-cloud-admin.tf` +- `common/INF.group.inf-cloud-admin.tf` +- `common/INF.role.inf-network-admin.tf` +- `common/INF.role.inf-flowlogs.tf` +- `common/INF.group.inf-ip-restriction.tf` +- `common/INF.service.cloudforms.tf` +- `common/INF.admin-user.{username}.tf` for each user in `admin_users` +- `common/inf-cloud-admin.users.tf` +- `common/remote_state.backend.tf`, `remote_state.common.tf` +- `common/outputs.common.tf`, `region.tf`, `versions.tf` +- `common/tf-run.data` (ordered TAG sequence matching section 3 Phase 5) + +**What the executor does:** +Runs `tf-run apply` which walks the `TAG` sequence in `common/tf-run.data`. +The `TF_RUN_START_TAG` env var allows resuming after a partial failure. + +> ⚠️ **Partially automatable — SAML metadata:** The SAML provider metadata XML +> must be obtained from the identity provider (e.g., Okta or ADFS) and passed +> as an input. If the IdP is Okta and an API exists, this can be automated. If +> the metadata is managed manually, it must be provided by an operator. + +> ⚠️ **Partially automatable — LDAP:** The `INF.ldap-ou-create.tf` module +> requires LDAP bind credentials in `provider_configs.d/` to be decryptable at +> runtime. Until ADR-002 (Vault AWS Secrets Engine) is implemented, these +> credentials must already be git-secret encrypted and present in the repo from +> Workspace 1. If that was done, LDAP steps are fully automated. If not, they +> require manual intervention. 
+ +> ⚠️ **Two-pass apply for SAML roles:** Each `INF.role.*.tf` that creates LDAP +> objects requires two sequential `tf apply` calls (first creates a local file, +> second creates the LDAP object). The executor's `tf-run.data` TAG sequence +> handles this natively — no special tooling needed — but the operator must +> ensure the `common/tf-run.data` TAG ordering encodes the two-pass pattern. + +--- + +### Workspace 7: `bootstrap-infrastructure-finalize` + +**Template repo:** `template-infrastructure-finalize` +**Layer:** `infrastructure` +**Region dir:** `{primary_region}` and `{secondary_region}` (two runs) +**Purpose:** Apply the remaining infrastructure resources after `common/` is +complete (which provides the SAML roles required for flow-log and object-log +bucket policies). + +**Inputs:** `account_id`, `account_alias`, `aws_environment`, region + +**Rendered outputs per region:** +- `infrastructure/{region}/INF.s3-flow-logs.tf` +- `infrastructure/{region}/INF.object-logs.tf` +- `infrastructure/{region}/INF.cloudtrail.tf` +- `infrastructure/{region}/INF.config.tf` +- `infrastructure/{region}/INF.dynamic-route53.tf` +- `infrastructure/{region}/INF.ses-domain.tf` +- `infrastructure/{region}/INF.preload-kms.tf` +- `infrastructure/{region}/INF.splunk-description.tf` +- `infrastructure/{region}/locals.tf` + +**What the executor does:** `tf-run apply` walking the TAG sequence in the +regional `tf-run.data`. + +--- + +### Workspace 8: `bootstrap-vpc` + +**Template repo:** `template-vpc` +**Layer:** `vpc` +**Region dir:** `{primary_region}` and `{secondary_region}` (two runs) +**Purpose:** Create the VPC and associated networking resources in each region. 
+ +**Inputs:** +- `account_id`, `account_alias`, `aws_environment`, region +- `vpc_cidr`, `subnet_cidrs` (map of AZ → CIDR) +- `vpc_name` (usually derived from account alias) +- Network account ID for VPC sharing (if applicable) + +**Rendered outputs:** +- `vpc/{region}/INF.vpc.tf` +- `vpc/{region}/INF.subnets.tf` +- `vpc/{region}/INF.tgw-attachment.tf` (if transit gateway) +- `vpc/{region}/region.tf`, `remote_state.backend.tf`, `versions.tf` + +**What the executor does:** `tf-run apply` applying VPC resources. + +> ⚠️ **Cannot be fully automated:** VPC CIDR allocation must be coordinated with +> the network team's IPAM system. The CIDRs cannot be derived automatically +> without an IPAM API integration. An operator must supply them via SC form +> inputs or the allocation must be read from an external registry (e.g., Infoblox +> or an internal IPAM). + +--- + +### Workspace 9 (optional): `bootstrap-applications-structure` + +**Template repo:** `template-applications-structure` +**Layer:** `applications` +**Region dir:** `structure` +**Purpose:** Scaffold the `applications/structure/` directories that mirror +`common/`, `infrastructure/`, and `vpc/` as templates for app teams. Only needed +for accounts that will host application stacks. + +**Inputs:** `account_id`, `account_alias`, `aws_environment`, `primary_region`, +`secondary_region`, `app_stacks` (list of stack names for `_apps-*` repos) + +**Rendered outputs:** +- `applications/structure/common/`, `infrastructure/`, `vpc/` scaffold files +- Symlinks matching the base account repo pattern + +--- + +### Workspace 10 (optional): `bootstrap-apps-repo` + +**Template repo:** `template-apps-repo` +**Layer:** `init` (creates a new GitHub repo) +**Purpose:** Create and scaffold a `{account-id}-{alias}_apps-{stack-name}` repo +for each application stack, registering it as a submodule of the base account repo. + +Repeat once per stack name in `app_stacks`. + +--- + +## 8. 
Manual Steps That Cannot Be Automated + +The following steps are explicitly outside the scope of sc-lambda-ghactions +automation in its current form. Each has the reason stated. + +| # | Step | Why It Cannot Be Automated | Potential Future Path | +|---|---|---|---| +| M1 | AWS account creation | Account Vending Machine / AWS Organizations automation is out of scope for this system | Integrate with AWS Control Tower or an internal AVM product | +| M2 | `bootstrap` IAM user creation | Requires AWS Console + AdminAccess before any IaC exists | Control Tower / account vending pre-creates a bootstrap role | +| M3 | Bootstrap IAM credentials handoff | Access key + secret for the bootstrap user must be securely handed to the first executor run | Store in Secrets Manager during AVM; executor reads from known path | +| M4 | GPG keypair generation (`init/gpg-setup/`) | Generates keypair used to encrypt Terraform-created IAM passwords in the repo | Eliminated if admin-user module is updated to write passwords to Vault KV instead of GPG-encrypting them in the repo (see section 6.5) | +| M5 | Team member GPG public key collection | `git secret tell` requires each engineer's public key as a git-secret recipient | **Fully eliminated by ADR-002 + Vault KV** — no git-secret recipients needed when secrets live in Vault (see section 6.4) | +| M6 | `*.auto.tfvars.secret` encryption | git-secret requires the account GPG key on the operator's keychain; CodeBuild cannot decrypt these files | **Substantially eliminated by ADR-002 extended to Vault KV** — replaced by one `vault kv put` per credential; this also unblocks `common/` automation (see section 6.3–6.4) | +| M7 | SAML provider metadata XML | Must be retrieved from the IdP (Okta, ADFS, etc.) 
| Automate if IdP has an API; otherwise operator pastes metadata into SC form | +| M8 | VPC CIDR allocation | CIDRs must come from an IPAM system | Automate via Infoblox API or internal IPAM product integration | +| M9 | `bootstrap` user rotation | After admin users are created, the bootstrap user's access key must be disabled and the TF import performed | Low complexity; could be a separate SC product (`import-bootstrap-user`) | +| M10 | Two-pass SAML role applies | LDAP objects require `tf apply` twice per role | Already handled by `tf-run.data` TAG sequence; not a manual step if executor is running cleanly | +| M11 | Initial `git checkout -b initial-setup` push | Per `common/INF.SETUP.md` — the first clean `git push` after `common/` complete | Could be part of executor post-apply commit; low risk to automate | + +--- + +## 9. Mapping to sc-lambda-ghactions Concepts + +### 9.1 `.sc-automation.yml` per workspace + +Each workspace PR committed by the Proposer includes a `.sc-automation.yml` +written to the account repo root. Because multiple workspaces touch the same +repo, the convention must be to write workspace-specific YAML -- or scope the +file to the workspace layer using the `.sc-automation.yml` path scoping +already built into the webhook handler. + +Proposed convention: +```yaml +# .sc-automation.yml written by template-bootstrap-common proposer +account_repo: 001476713248-edl-core-dev-gov +layer: common +region_dir: global +target_account_id: "001476713248" +dry_run: false +tf_run_start_tag: "" # set to a TAG label to resume from a partial failure +``` + +### 9.2 Cross-account IAM role + +Every PR-merge-triggered executor run for a target account requires +`sc-automation-codebuild-role` to exist in that account. During bootstrap, +this role does not yet exist at the time Workspace 4 runs. 
Two options: + +**Option A (recommended):** The AWS account vending process (Control Tower or +AVM) pre-creates `sc-automation-codebuild-role` as part of the account baseline +before any bootstrap workspace runs. This is the cleanest design. + +**Option B:** Workspace 4 (`bootstrap-infrastructure-tfstate`) runs in the target +account using the bootstrap user's static credentials injected via Secrets Manager +rather than `sts:AssumeRole`. After `common/` creates the real admin users and +the `sc-automation-codebuild-role` can itself be applied as a module in `common/`, +all subsequent workspaces use the standard assume-role path. + +### 9.3 Template repo versioning + +Each template repo (section 7) should be tagged (`v1.0.0`, `v1.1.0`, etc.) and +the CFN product template for that workspace should pin `template_repo_ref`. This +ensures that re-running a bootstrap workspace on an existing account (e.g., to +add a new admin user) uses the exact same templates that created the account. + +### 9.4 Ordered product invocation via the SC console + +The 11 workspace products should be presented in an SC portfolio named +**Account Bootstrap** with a display order that mirrors the dependency sequence. +There is no current mechanism in sc-lambda-ghactions to enforce ordering between +products — the human operator is responsible for launching them in sequence. + +A future enhancement could add a dependency state machine (Step Functions or +DynamoDB tracking) to block Workspace N from launching until Workspace N-1 +has a successful executor commit status. This is out of scope for the initial +implementation. + +--- + +## 10. 
Template Repo Summary + +| Workspace | Template Repo | TF Layer | Primary SC Input Fields | +|---|---|---|---| +| 0 | `template-bootstrap-account-repo` | `init/git-setup` | account_id, alias, aws_environment, github_teams, admin_users | +| 1 | `template-provider-configs` | `provider_configs.d` | account_id, alias, aws_environment | +| 2 | `template-credentials` | `credentials.d` | account_id, alias, aws_environment | +| 3 | `template-variables` | `variables.d` | account_id, alias, aws_environment, regions, environment, program | +| 4 | `template-infrastructure-tfstate` | `infrastructure` | account_id, alias, aws_environment, primary_region, tfstate_bucket | +| 5 | `template-infrastructure-regional-logs` | `infrastructure/{region}` | account_id, alias, region | +| 6 | `template-common` | `common` | account_id, alias, aws_environment, admin_users, saml_metadata, ldap config | +| 7 | `template-infrastructure-finalize` | `infrastructure/{region}` | account_id, alias, region | +| 8 | `template-vpc` | `vpc/{region}` | account_id, alias, region, vpc_cidr, subnet_cidrs | +| 9 | `template-applications-structure` | `applications/structure` | account_id, alias, aws_environment, regions, app_stacks | +| 10 | `template-apps-repo` | `init/git-setup` (new repo) | account_id, alias, stack_name | + +--- + +## 11. Phased Implementation Recommendation + +Given the complexity of the full sequence, this implementation plan stages the +work into three phases aligned with the manual blockers: + +### Phase 1 — Structural scaffolding (no executing Terraform) +Workspaces 0, 1, 2, 3 — these produce only committed files and repo configuration. +They do not require the bootstrap IAM user or any running AWS infrastructure. +High confidence of automation; implement first. + +### Phase 2 — Infrastructure foundation (requires bootstrap credential handoff) +Workspaces 4 and 5 — these apply Terraform against the new account. +Blocked on establishing the bootstrap credential convention (M1–M3). 
+Implement after the credential handoff pattern is agreed. + +### Phase 3 — IAM, VPC, app structure +Workspaces 6, 7, 8, 9, 10 — these are the most account-specific and depend on +secrets (LDAP, SAML metadata) and IPAM allocation. +The git-secret dependency (M6) is the largest blocker; see section 6 for a full +analysis. Retiring git-secret in favor of Vault KV (extending ADR-002) is a +prerequisite for full unattended automation of `common/`. With Vault KV in place, +the only remaining manual inputs for `common/` are SAML metadata and VPC CIDRs. + +--- + +## 12. Comparison to Current State + +| Aspect | Today (manual `INF.SETUP.md`) | With proposed sc-lambda-ghactions products | +|---|---|---| +| Repo creation | Manual `git init` + GitHub API | Automated — Workspace 0 PR + executor | +| Provider file generation | Hand-edit per account | Automated — Workspace 1 | +| Credentials file generation | Hand-edit per region | Automated — Workspace 2 | +| TF state bootstrap | Manual CLI commands | Automated — Workspace 4 | +| IAM roles/groups/users | Ordered manual `tf apply` per module | Automated — Workspace 6 TAG sequence | +| VPC | Manual `tf apply` | Automated — Workspace 8 (requires CIDR input) | +| Secrets management | git-secret (manual encrypt cycle) | Manual until ADR-002 | +| Time to first usable account | Days | Hours (Phase 1+2 only); minutes if Phase 3 secrets are available | +| Auditability | `git log` in account repo | PR per workspace in GitHub; CodeBuild logs; GHE commit status | +| Repeatability | Operator knowledge / `INF.SETUP.md` | SC product form fields; idempotent Proposer | diff --git a/docs/cross-account-visibility.md b/docs/cross-account-visibility.md new file mode 100644 index 0000000..91cf421 --- /dev/null +++ b/docs/cross-account-visibility.md @@ -0,0 +1,353 @@ +# Cross-Account Fleet Visibility — Credentials and Console UI + +**Date:** 2026-05-19 +**Status:** Proposed +**Scope:** Read-only visibility across all accounts managed by 
sc-lambda-ghactions + +--- + +## Problem + +The `terraform-sc-fleet` manifest and `update_fleet.py` give CSVD a single operational +view of all managed workloads at the Terraform / GHE layer. But engineers also need to +locate and inspect those resources in the **AWS console** — CloudFormation stacks, +Service Catalog provisioned products, Lambda functions, S3 buckets, EKS clusters — +across all accounts simultaneously, without switching console sessions or holding +long-lived credentials for each account. + +--- + +## Credential Model — Hub-and-Spoke IAM Role Chain + +The UI server and any tooling that reads across accounts **never holds long-lived +credentials**. It uses `sts:AssumeRole` to obtain temporary credentials scoped to +each target account on demand. + +``` +csvd-dev (229685449397) — hub + └─> sc-fleet-ui-server role (instance profile / ECS task role) + └─> sts:AssumeRole ─────────────────────────────────────────────┐ + ▼ + Any spoke account + └─> sc-fleet-readonly role + └─> ReadOnlyAccess (AWS managed policy) +``` + +Temporary credentials are cached for up to 1 hour (the STS session duration). +Rotation is automatic. No keys are stored in environment variables, SSM, or Secrets Manager. + +--- + +## Infrastructure + +### 1. Spoke role — deployed to every target account via StackSet + +One role per account, deployed automatically via the existing +`CensusServiceCatalog-RoleAndAction` StackSet alongside the SC launch roles. 
+ +**CFN role template** (`templates/role-templates/sc-fleet-readonly-role.yaml`): + +```yaml +Type: AWS::IAM::Role +Properties: + RoleName: sc-fleet-readonly + AssumeRolePolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: Allow + Principal: + AWS: !Sub "arn:${AWS::Partition}:iam::${HubAccountId}:role/sc-fleet-ui-server" + Action: sts:AssumeRole + Condition: + StringEquals: + "sts:ExternalId": !Ref ExternalId # optional but recommended + ManagedPolicyArns: + - !Sub "arn:${AWS::Partition}:iam::aws:policy/ReadOnlyAccess" + Tags: + - Key: managed-by + Value: sc-lambda-ghactions +``` + +**`roles.yaml.tftpl` entry** (census repo): + +```yaml +- template: sc-fleet-readonly-role.yaml + parameters: + - parameter: HubAccountId + value: "229685449397" + - parameter: ExternalId + value: "sc-fleet-ui" +``` + +This propagates to all OU-shared accounts automatically. New accounts joining the OU +receive the role via `auto_deployment { enabled = true }`. + +### 2. Hub role — deployed in csvd-dev + +Lives in `sc-lambda-ghactions/deploy/iam.tf`. This is the role assumed by the UI server +(ECS task, Lambda, or EC2 instance profile). + +```hcl +resource "aws_iam_role" "sc_fleet_ui_server" { + name = "sc-fleet-ui-server" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { Service = "ecs-tasks.amazonaws.com" } + Action = "sts:AssumeRole" + }] + }) + + tags = { + managed-by = "sc-lambda-ghactions" + } +} + +resource "aws_iam_role_policy" "assume_spoke_roles" { + name = "assume-sc-fleet-readonly" + role = aws_iam_role.sc_fleet_ui_server.id + + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = "sts:AssumeRole" + Resource = "arn:${data.aws_partition.current.partition}:iam::*:role/sc-fleet-readonly" + # Restrict to org accounts only + Condition = { + StringEquals = { + "aws:ResourceOrgID" = var.org_id + } + } + }] + }) +} +``` + +### 3. 
Python helper — per-account session factory + +Used by the fleet dashboard, `update_fleet.py`, and any other tooling that needs +cross-account AWS API access: + +```python +# scripts/aws_session.py +import boto3 +from functools import lru_cache + +READONLY_ROLE = "sc-fleet-readonly" +PARTITION = "aws-us-gov" +REGION = "us-gov-west-1" + +@lru_cache(maxsize=64) +def session_for(account_id: str) -> boto3.Session: + """Return a boto3 Session scoped to account_id via sts:AssumeRole. + Credentials are cached for the lifetime of the process. + For long-running processes, evict the cache before the 1-hour STS expiry. + """ + sts = boto3.client("sts", region_name=REGION) + assumed = sts.assume_role( + RoleArn=f"arn:{PARTITION}:iam::{account_id}:role/{READONLY_ROLE}", + RoleSessionName="sc-fleet-ui", + ExternalId="sc-fleet-ui", + DurationSeconds=3600, + ) + creds = assumed["Credentials"] + return boto3.Session( + aws_access_key_id=creds["AccessKeyId"], + aws_secret_access_key=creds["SecretAccessKey"], + aws_session_token=creds["SessionToken"], + region_name=REGION, + ) + +def sc_client(account_id: str): + return session_for(account_id).client("servicecatalog") + +def cfn_client(account_id: str): + return session_for(account_id).client("cloudformation") +``` + +--- + +## Centralized UI Options + +Three options in order of implementation cost: + +### Option A — AWS Resource Explorer (recommended first step) + +Resource Explorer with a **multi-account aggregator index** provides a single search +across all accounts with built-in console deep-links. No custom UI to build or maintain. 
+ +#### Setup + +Enable Resource Explorer org-wide with an aggregator in the management (or delegated +admin) account: + +```hcl +# In the management/delegated-admin account +resource "aws_resourceexplorer2_index" "aggregator" { + type = "AGGREGATOR" +} + +resource "aws_resourceexplorer2_view" "sc_fleet" { + name = "sc-fleet" + default_view = true + + filters { + filter_string = "tag:managed-by=sc-lambda-ghactions" + } +} +``` + +Each member account needs a local index (can be enabled via AWS Organizations policy +or Terraform deployed via StackSet): + +```hcl +resource "aws_resourceexplorer2_index" "local" { + type = "LOCAL" +} +``` + +#### Tagging convention + +Every resource provisioned through sc-lambda-ghactions must carry these tags so +Resource Explorer can surface them: + +| Tag key | Example value | Purpose | +|---------|--------------|---------| +| `managed-by` | `sc-lambda-ghactions` | Scope the aggregator view | +| `product-type` | `eks_cluster` | Filter by workload type | +| `workload-name` | `csvd-dev-mcm` | Find a specific workload | +| `team` | `csvd` | Filter by owning team | +| `lifecycle` | `dev` | Filter by environment tier | +| `account-repo` | `229685449397-csvd-dev-gov_apps-adsd-eks` | Trace back to GHE repo | + +The Proposer CodeBuild buildspec applies these tags when rendering HCL files that +create tagged resources. For resources that don't support tags (e.g. some IAM), the CFN +stack itself is tagged and the stack's console link is sufficient.
+ +#### Example Resource Explorer queries + +``` +# All sc-lambda-ghactions resources +tag:managed-by=sc-lambda-ghactions + +# All EKS provisioned products +tag:managed-by=sc-lambda-ghactions tag:product-type=eks_cluster + +# Specific workload across all resource types +tag:workload-name=csvd-dev-mcm + +# CloudFormation stacks managed by the system (stack status is not searchable; check it in the console) +resourcetype:cloudformation:stack tag:managed-by=sc-lambda-ghactions +``` + +Results include a direct "Open in console" link to each resource in its native account. + +--- + +### Option B — Custom Fleet Dashboard + +A lightweight read-only web app when Resource Explorer is insufficient — e.g. you need +to show fleet diff state (pending PRs, last apply status, maintenance windows) alongside +AWS resource state. + +#### Architecture + +``` +csvd-dev + └─> ECS Fargate task (or Lambda + Function URL) + ├─> Assumes sc-fleet-ui-server hub role + ├─> Reads terraform-sc-fleet workloads/** (GHE API) + ├─> Calls sts:AssumeRole per account → reads SC/CFN/resource state + └─> Renders HTML dashboard with console deep-links +``` + +#### Console deep-link construction + +Direct links into the GovCloud console for each resource type: + +```python +BASE = "https://console.amazonaws-us-gov.com" + +def cfn_stack_link(region: str, stack_name: str) -> str: + return f"{BASE}/cloudformation/home?region={region}#/stacks?filteringText={stack_name}" + +def sc_product_link(region: str, product_id: str) -> str: + return f"{BASE}/servicecatalog/home?region={region}#/provisioned-products/{product_id}" + +def lambda_link(region: str, function_name: str) -> str: + return f"{BASE}/lambda/home?region={region}#/functions/{function_name}" + +def eks_link(region: str, cluster_name: str) -> str: + return f"{BASE}/eks/home?region={region}#/clusters/{cluster_name}" +``` + +#### Fleet status aggregation + +```python +from scripts.aws_session import sc_client, cfn_client + +def fleet_status(accounts: list[str]) -> list[dict]: + """Return
provisioned product status across all accounts.""" + results = [] + for account_id in accounts: + sc = sc_client(account_id) + products = sc.search_provisioned_products( + Filters={"SearchQuery": ["tag:managed-by:sc-lambda-ghactions"]} + )["ProvisionedProducts"] + for p in products: + results.append({ + "account_id": account_id, + "product_name": p["Name"], + "product_type": next((t["Value"] for t in p.get("Tags", []) if t["Key"] == "product-type"), None), + "status": p["Status"], + "status_message": p.get("StatusMessage"), + "console_link": sc_product_link(sc.meta.region_name, p["Id"]), + }) + return results +``` + +--- + +### Option C — AWS Systems Manager Explorer + +SSM Fleet Manager and Explorer aggregate resource data, OpsItems, and compliance across +accounts out of the box — zero custom code, built-in console UI. Less flexible than +Options A/B but worth evaluating before building anything custom. + +Enable via AWS Organizations in the SSM console of the management account. No Terraform +changes needed beyond ensuring SSM is activated in all member accounts (already required +for StackSet operations).
+ +--- + +## Recommended Rollout + +| Phase | Work | Outcome | +|-------|------|---------| +| **1** | Add tags to all sc-lambda-ghactions provisioned resources (Proposer CodeBuild templates) | Every resource carries `managed-by`, `product-type`, `workload-name`, `team`, `lifecycle`, `account-repo` | +| **2** | Deploy `sc-fleet-readonly` spoke role via StackSet entry in census repo | CSVD hub can assume into any org account with one `sts:AssumeRole` call | +| **3** | Enable Resource Explorer aggregator index via management account | Single console search across all accounts with deep-links; zero custom UI | +| **4** | Add `aws_session.py` session factory to `terraform-sc-fleet/scripts/` | `update_fleet.py` and any future tooling can query any account with one helper call | +| **5** | *(optional)* Build fleet dashboard if Resource Explorer + GHE PR state is insufficient | Custom ECS task with per-account SC/CFN reads + console deep-link generation | + +Phases 1–3 are the minimum viable set. Phase 4 is a development convenience. Phase 5 +is only needed if the built-in console tools don't cover the operational queries CSVD +actually needs to make.
+ +--- + +## Security Notes + +- The `sc-fleet-readonly` spoke role grants `ReadOnlyAccess` — it cannot create, modify, + or delete any resource in any spoke account +- The `ExternalId` condition on `sts:AssumeRole` prevents confused-deputy attacks — only + callers that know the external ID can assume the role +- The hub role `sc-fleet-ui-server` is scoped to `sts:AssumeRole` on `*/sc-fleet-readonly` + only — it cannot assume any other role in spoke accounts +- The org condition (`aws:ResourceOrgID`) on the hub policy prevents the server from + assuming the role name in accounts outside the Census org +- No long-lived credentials are stored anywhere; STS temporary credentials expire + automatically after at most 1 hour diff --git a/docs/decisions/001-webhook-auto-apply.md b/docs/decisions/001-webhook-auto-apply.md new file mode 100644 index 0000000..f445862 --- /dev/null +++ b/docs/decisions/001-webhook-auto-apply.md @@ -0,0 +1,224 @@ + +# ADR-001: Webhook-Triggered Auto-Apply on Merge to Main + +## In Plain Language + +This document explains a new way to make our automation system easier and faster. Right now, after someone reviews and merges a pull request (PR) in GitHub, a person has to go into AWS Service Catalog and start the next step by hand. This is slow and can lead to mistakes or delays. + +We want to change this so that when a PR is merged to the main branch, our system will automatically start the next step without anyone having to do it manually. We will do this by using a GitHub webhook, which is a tool that tells our system when something important happens in the repo. When the webhook sees a new change on main, it will trigger our automation to run the apply step right away. + +This paper describes how this automatic process will work, what files and settings are needed, and what changes we have to make to our system. The goal is to make things smoother, faster, and less error-prone for everyone who uses our platform. 
+ +**Status:** Accepted +**Date:** 2026-05-11 +**Supersedes:** the two-product model (proposer SC product + executor SC product) + +--- + +## Context + +An earlier design split the workflow into two Service Catalog products — a +**Proposer** product to render templates and open a PR, and a separate +**Executor** product to run `tf-run apply` after the PR was merged. While the +Proposer SC product is a natural fit for self-service provisioning (users fill +a form, get a PR URL back), the Executor SC product is not: it requires a +platform engineer to return to Service Catalog, find the product, re-enter the +same parameters already specified at propose time, and click Launch. + +This step is pure operational overhead with no review value — the review already +happened when the PR was merged to `main`. The information needed to start the +executor build (account repo, layer, region dir, target account) is already +recorded in `.sc-automation.yml` in the repo itself. + +**The Executor SC product is removed.** Apply is triggered automatically by a +GHE webhook on merge to `main`. The only user-facing Service Catalog product +remains the Proposer. + +--- + +## Decision + +Add a **GitHub Enterprise webhook handler** to the Lambda that automatically +starts an executor CodeBuild build whenever a push event lands on `main` in a +watched account repo. + +Target apply configuration is stored in a `.sc-automation.yml` file committed to +the root of each account repo by the Proposer (or manually by a platform engineer). + +--- + +## Proposed Design + +### `.sc-automation.yml` — committed to the account repo root + +```yaml +# Written by the Proposer CodeBuild build or manually by a platform engineer. +# Each entry triggers one executor CodeBuild build when changes land on main. 
+apply_on_merge: + - layer: infrastructure + region_dir: west + target_account_id: "229685449397" + - layer: infrastructure + region_dir: east + target_account_id: "229685449397" + - layer: vpc + region_dir: west + target_account_id: "229685449397" +``` + +Fields per entry: + +| Field | Required | Description | +|---|---|---| +| `layer` | yes | `common`, `infrastructure`, or `vpc` | +| `region_dir` | yes | `east`, `west`, or `global` | +| `target_account_id` | no | 12-digit AWS account ID; omit to run in csvd-dev | +| `cross_account_role` | no | IAM role name to assume in `target_account_id` (default: `r-inf-terraform`) | +| `tf_run_start_tag` | no | tf-run TAG label to start from | +| `dry_run` | no | `true` to plan only (default: `false`) | + +### Lambda changes + +Add a `/webhook` path handler alongside the existing CFN handler in +`lambda/app.py`. + +**Invocation:** Lambda Function URL (no API Gateway needed — GHE can POST to +a Function URL directly). The URL is added to the GHE org or repo webhook +settings. 
+ +### Webhook payload — what GHE sends + +The GHE `push` event payload contains everything the Lambda needs to identify +the repo without any out-of-band mapping: + +```json +{ + "ref": "refs/heads/main", + "after": "abc123def456...", + "repository": { + "name": "229685449397-csvd-dev-platform-dev-gov", + "full_name": "SCT-Engineering/229685449397-csvd-dev-platform-dev-gov", + "clone_url": "https://github.e.it.census.gov/SCT-Engineering/229685449397-csvd-dev-platform-dev-gov.git" + } +} +``` + +- `repository.name` → `ACCOUNT_REPO` passed to CodeBuild +- `after` → merge commit SHA used for GHE commit status writeback +- No repo→callback URL map is needed or maintained + +**Request flow:** + +``` +GHE push event (main branch, account repo) + → Lambda Function URL POST / + → verify HMAC-SHA256 signature (secret in SM: ghe-runner/webhook-secret) + → parse X-GitHub-Event: push + → filter: ref == refs/heads/main AND repository.name matches account repo pattern + → fetch .sc-automation.yml from main via GHE API (no clone — single API call) + → if .sc-automation.yml missing: post ❌ commit status "no .sc-automation.yml on main" and exit + → post ⏳ "pending" commit status on merge SHA + → for each entry in apply_on_merge: + start_codebuild_build( + action="apply", + account_repo=payload["repository"]["name"], # from webhook + layer=entry["layer"], # from .sc-automation.yml + region_dir=entry["region_dir"], # from .sc-automation.yml + target_account_id=entry.get("target_account_id", ""), + commit_sha=payload["after"] # for status writeback + ) + (fire-and-forget — do NOT poll CodeBuild) + → return HTTP 200 immediately +``` + +**Executor buildspec writeback:** +The executor CodeBuild build receives `COMMIT_SHA` as an env var. In its +POST_BUILD phase it calls `gh api` to post a GHE commit status (`success` or +`failure`) back to the merge commit. Teams see ✅ or ❌ directly on the commit +in the PR history — no CloudWatch required. 
+ +**Key differences from the CFN handler:** + +- **No polling.** The webhook handler starts builds and returns immediately. + Build results are visible in CodeBuild logs and CloudWatch. There is no CFN + stack to signal. +- **No CFN resource.** Webhook-triggered executor runs bypass Service Catalog + entirely. For manual one-off runs (re-apply from a TAG, dry-run), the executor + build can be started directly via the CodeBuild console or AWS CLI — no SC + product is needed or maintained. +- **Retry-tolerant, not deduplicated.** If GHE retries the webhook (network blip), a duplicate build + is started. This is acceptable — `tf-run apply` on an already-applied state is + a no-op. + +### Infrastructure changes + +| Resource | Change | +|---|---| +| Lambda Function URL | Add `aws_lambda_function_url` resource in `deploy/lambda.tf` | +| Lambda invoke permission | Add `aws_lambda_permission` allowing `lambda:InvokeFunctionUrl` from `*` (HMAC signature is the auth mechanism) | +| Secrets Manager | Add a `ghe-runner/webhook-secret` secret for HMAC verification | +| Lambda IAM | No change — existing `codebuild:StartBuild` permission covers webhook-triggered builds | +| GHE Webhook | Manual one-time setup: org or per-repo webhook → Function URL, content-type `application/json`, events: `push` | + +### `.sc-automation.yml` lifecycle + +- **Proposer writes it** on the first run for a branch, if the file doesn't + already exist on `main`. The Proposer knows `layer`, `region_dir`, and + `target_account_id` from its CodeBuild env vars. It commits `.sc-automation.yml` + alongside the rendered template files so the file is reviewed in the same PR. +- **Proposer does NOT overwrite it** on subsequent runs — it checks whether the + file already exists on `main` and skips writing if so, preserving any manual + edits made by platform engineers. +- **Platform engineers edit it** directly via PR to add, remove, or reorder + apply targets.
+- **`.sc-automation.yml` missing → blocked** — if `.sc-automation.yml` is not + present on `main` when a push webhook fires, the Lambda posts a `failure` + commit status and does not start any builds. This surfaces the problem + immediately without a silent no-op. + +--- + +## Consequences + +### Benefits + +- Eliminates the manual "provision executor product" step after PR merge +- Apply is fully traceable: GHE push event → CloudWatch Logs → CodeBuild build ID +- GHE commit status writeback gives teams ✅/❌ feedback directly on the merge commit +- No new infrastructure services (no EventBridge, no SQS, no API Gateway) +- No repo→callback URL map to maintain — repo identity comes from the webhook payload +- Manual one-off executor runs (re-apply from a TAG, dry-run) are done directly + via `aws codebuild start-build` — no separate SC product is needed or maintained + +### Trade-offs + +- Build results are no longer surfaced in a CloudFormation stack output — users + must check CodeBuild or CloudWatch Logs directly +- GHE webhook requires a one-time manual setup per org (or per repo for + fine-grained control) +- A merge to `main` that does not involve Terraform changes (e.g. README edit) + will still trigger executor builds.
Mitigation: add a `paths` filter in + `.sc-automation.yml` (future enhancement) or rely on `tf-run apply` being a + safe no-op + +### Out of scope for this ADR + +- SNS / Slack / email notification after a webhook-triggered apply — tracked separately +- Path filtering (only trigger on changes under `{layer}/{region_dir}/`) — tracked separately +- Idempotency guard against GHE webhook retries firing duplicate builds — `tf-run apply` + on an already-converged state is a safe no-op, so this is deferred + +--- + +## Alternatives Considered + +**CodeStar connection + CodePipeline watch:** Requires CodePipeline infrastructure +per repo, CodeStar connector host setup for GHE on-prem, and loses the per-run +environment variable flexibility that the Lambda `StartBuild` override model +provides. Rejected. + +**EventBridge + S3 source:** Would require mirroring the GHE repo to CodeCommit +or S3 to get an EventBridge trigger. Adds a sync layer with no benefit. Rejected. + +**Poll-based apply (Lambda on schedule):** Adds latency and unnecessary API calls. +Rejected. diff --git a/docs/decisions/002-vault-aws-secrets-engine.md b/docs/decisions/002-vault-aws-secrets-engine.md new file mode 100644 index 0000000..1d1bd4c --- /dev/null +++ b/docs/decisions/002-vault-aws-secrets-engine.md @@ -0,0 +1,345 @@ +# ADR-002: HashiCorp Vault AWS Secrets Engine for Dynamic Cross-Account Credentials + +## In Plain Language + +Right now, when our automation runs Terraform in an account repo it needs AWS +credentials to assume a role in the target account. Those credentials come from +a long-lived IAM role attached to the CodeBuild service role — a role that +exists permanently and can be used at any time. + +This document proposes replacing those static, always-on IAM credentials with +**short-lived, on-demand credentials** issued by a HashiCorp Vault cluster +running the [AWS Secrets Engine](https://developer.hashicorp.com/vault/docs/secrets/aws). 
+When a build starts, it authenticates to Vault (using its own AWS identity), +asks for credentials scoped to the target account and the specific Vault role +defined in the product workspace, gets back temporary AWS keys that expire in +minutes, and then runs Terraform. There are no long-lived keys to rotate or +accidentally expose. + +Because the Vault role is a Terraform resource declared inside the product +workspace, the exact IAM permissions granted to any automation run are visible +as a reviewable diff in the same PR that makes the infrastructure change. Review +the code, review the access policy — one approval covers both. + +**Status:** Proposed +**Date:** 2026-05-19 + +--- + +## Context + +The current cross-account credential model works as follows: + +1. The CodeBuild service role (`sc-automation-codebuild-role` in csvd-dev) has + `sts:AssumeRole` permission for `*:role/sc-automation-codebuild-role`. +2. A matching role with the same name is pre-created in each target account and + trusts the csvd-dev CodeBuild role. +3. The executor buildspec calls `aws sts assume-role` and exports `AWS_*` env + vars before running Terraform. + +This works but has the following drawbacks: + +- **Static trust relationship.** The csvd-dev CodeBuild role can assume the + target-account role at any time, not just during a sanctioned automation run. + If the CodeBuild service role or its credentials were ever misused, an attacker + could assume any target-account role without any build being underway. +- **No per-run scope.** Every executor build gets the same level of access, + regardless of what the product workspace actually needs. There is no way to + restrict a build to, say, only VPC-layer permissions. +- **Permission review is disconnected.** The IAM role in the target account is + managed separately from the product workspace. A reviewer approving a product + PR has no visibility into what IAM permissions the automation will use. 
+- **Static role pre-creation.** Every new account requires a platform engineer + to pre-create the `sc-automation-codebuild-role` role before the first + automation run can succeed. + +The Vault AWS Secrets Engine addresses all four of these gaps. + +--- + +## Decision + +Deploy a HashiCorp Vault cluster (or use an existing Census-managed Vault) with +the **AWS Secrets Engine** enabled. Each SC product workspace declares a +`vault_aws_secret_backend_role` Terraform resource specifying the exact IAM +permissions the automation run requires. The executor buildspec authenticates to +Vault using the **AWS auth method** (the CodeBuild task's own IAM identity) and +requests short-lived STS credentials scoped to that role before running Terraform. + +--- + +## Proposed Design + +### Vault AWS Secrets Engine — how it works + +``` +Vault cluster (csvd-dev or shared platform) + └── secrets/aws/ (AWS Secrets Engine mount) + └── roles/ + └── {vault_aws_role} + credential_type = assumed_role + role_arns = ["arn:aws-us-gov:iam::{account_id}:role/{role}"] + default_ttl = 900s + max_ttl = 1800s +``` + +When the executor build calls `vault read aws/creds/{vault_aws_role}`, Vault +calls `sts:AssumeRole` on its own behalf and returns temporary +`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` that +expire when the TTL elapses. The credentials are scoped to exactly the role ARNs +listed in the Vault role — nothing wider. + +### `.sc-automation.yml` — new field + +```yaml +apply_on_merge: + - layer: infrastructure + region_dir: west + target_account_id: "229685449397" + vault_aws_role: "sc-infra-west-229685449397" # ← new +``` + +The `vault_aws_role` value is the name of the Vault role to read credentials +from. It is written by the Proposer (derived from the product workspace) and +committed to the account repo alongside the rendered HCL files. + +### Product workspace — Vault role as a Terraform resource + +Each SC product workspace (e.g. 
a VPC product, an EKS product) declares the +Vault role it needs alongside its other infrastructure: + +```hcl +# vault_role.tf — committed inside the product workspace, reviewed in the PR + +resource "vault_aws_secret_backend_role" "automation" { + backend = "aws" + name = "sc-infra-west-${var.target_account_id}" + credential_type = "assumed_role" + + role_arns = [ + "arn:${var.aws_partition}:iam::${var.target_account_id}:role/sc-automation-infra-west" + ] + + default_ttl = 900 + max_ttl = 1800 +} +``` + +**Why this matters for review:** The Proposer PR diff includes `vault_role.tf`. +A reviewer can see exactly which IAM role the automation will assume and in which +account. Access policy and infrastructure change are approved in the same PR +— there is no separate IAM role PR to chase down or forget. + +### CodeBuild authentication to Vault — AWS auth method + +The executor CodeBuild task authenticates to Vault using the +[AWS auth method](https://developer.hashicorp.com/vault/docs/auth/aws). The +CodeBuild service role's IAM identity is used as the authentication credential +— no long-lived Vault token is stored anywhere. + +```bash +# executor buildspec — PRE_BUILD phase +vault login -method=aws \ + -path=auth/aws \ + role=sc-automation-executor \ + header_value=vault.example.census.gov + +# Read dynamic credentials for this specific run +CREDS=$(vault read -format=json aws/creds/${VAULT_AWS_ROLE}) +export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r .data.access_key) +export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r .data.secret_key) +export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r .data.security_token) +``` + +The CodeBuild task's IAM role is added to a Vault auth policy that permits only +`read` on `aws/creds/*` — it cannot create new Vault roles, modify policies, or +read credentials for roles it is not permitted to access. 
+ +### Vault Terraform resources — managed in deploy/ + +```hcl +# deploy/vault.tf + +resource "vault_aws_secret_backend" "aws" { + path = "aws" + # Vault's own IAM user/role that calls sts:AssumeRole on behalf of requestors + # must have sts:AssumeRole on the target roles. +} + +resource "vault_auth_backend" "aws" { + type = "aws" + path = "auth/aws" +} + +resource "vault_aws_auth_backend_role" "codebuild_executor" { + backend = vault_auth_backend.aws.path + role = "sc-automation-executor" + auth_type = "iam" + bound_iam_principal_arns = [aws_iam_role.codebuild_service_role.arn] + token_policies = ["sc-automation-executor"] + token_ttl = 900 +} + +resource "vault_policy" "codebuild_executor" { + name = "sc-automation-executor" + + policy = <<-EOT + path "aws/creds/*" { + capabilities = ["read"] + } + EOT +} +``` + +### Infrastructure summary + +| Component | Location | Purpose | +|---|---|---| +| Vault cluster | Census-managed or csvd-dev | Issues dynamic AWS credentials | +| AWS Secrets Engine | `aws/` mount on Vault | Calls `sts:AssumeRole` and returns short-lived keys | +| AWS auth method | `auth/aws/` mount on Vault | Lets CodeBuild authenticate using its own IAM identity | +| `vault_aws_secret_backend_role` | Product workspace Terraform | Per-product IAM scope, reviewed in the Proposer PR | +| Vault endpoint env var | `deploy/codebuild.tf` | `VAULT_ADDR` set on the executor CodeBuild project | +| Vault IAM user | `deploy/vault.tf` | Has `sts:AssumeRole` on all target-account roles | +| Target-account IAM roles | Per-account Terraform | Trust Vault IAM user; scoped to minimum permissions | + +--- + +## Integration with the Proposer Flow + +The key insight is that the Vault role declaration is **part of the product +workspace**, not managed out-of-band. + +When the Proposer CodeBuild build runs Terraform (`tf apply`) to render and +commit files to the account repo, it also applies `vault_role.tf`. The result: + +1. User fills SC product form → Proposer starts. 
+2. Proposer runs `terraform apply` in the product workspace → creates + `vault_aws_secret_backend_role` in Vault. +3. Proposer renders HCL templates → opens PR on the account repo. +4. PR includes `.sc-automation.yml` with `vault_aws_role: sc-infra-west-{account_id}`. +5. Reviewer merges PR. +6. Webhook fires executor build with `VAULT_AWS_ROLE=sc-infra-west-{account_id}`. +7. Executor authenticates to Vault, reads credentials for that role, runs Terraform. + +The Vault role and the target-account IAM role both exist by the time the +executor runs because the Proposer created them before the PR was even opened. + +### Account baseline prerequisite + +For the Proposer to create the target-account IAM role, it needs an initial +foothold in that account. A single **proposer-access role** must exist in each +target account before the first product is provisioned into it: + +```hcl +# Created once per account as part of account baseline / landing-zone +resource "aws_iam_role" "sc_automation_proposer" { + name = "sc-automation-proposer" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Principal = { AWS = "arn:${var.aws_partition}:iam::229685449397:role/tf-run-proposer-codebuild" } + Action = "sts:AssumeRole" + }] + }) +} + +# Inline policy (not a permissions boundary): limits this role's IAM actions +# to roles in the sc-automation-* namespace +resource "aws_iam_role_policy" "sc_automation_proposer" { + role = aws_iam_role.sc_automation_proposer.name + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = ["iam:CreateRole", "iam:PutRolePolicy", "iam:AttachRolePolicy", + "iam:DeleteRole", "iam:DeleteRolePolicy", "iam:GetRole"] + Resource = "arn:${var.aws_partition}:iam::*:role/sc-automation-*" + }] + }) +} +``` + +This role is **not** a Vault-specific concept — it is the account-level trust +grant that allows the automation platform (csvd-dev) to manage its own
IAM +footprint in a target account. It belongs in the account vending / landing-zone +baseline alongside other platform roles (e.g. Break-Glass, Config recorder, +SSO permission sets). Once created at account birth it never needs to change. + +--- + +## Consequences + +### Benefits + +- **Short-lived credentials.** Dynamic STS credentials expire within the TTL + (default 15 min). A leaked credential is useless after expiry. +- **Per-run scope.** Each executor build reads credentials for the specific + Vault role defined in `.sc-automation.yml`. A build cannot access credentials + for a role it was not explicitly given. +- **Review parity.** IAM permissions (`vault_role.tf`) are changed in the same + PR as infrastructure. No separate IAM PR; no forgotten permission cleanup. +- **No static cross-account trust.** The existing "CodeBuild role can assume + any `sc-automation-codebuild-role` at any time" is replaced with "CodeBuild + can only read credentials for Vault roles it is permitted to access, and only + during an active build." +- **Automatic Vault role and IAM role provisioning.** The Proposer's + `terraform apply` creates both the Vault role and the target-account IAM + role the Vault secrets engine will assume — in the same apply, before the + PR is opened. No manual per-product setup in the target account. +- **Audit log.** Vault logs every credential issuance with the requesting + entity, timestamp, and lease ID. Each executor build's credential request is + independently auditable in Vault audit logs, separate from CloudTrail. + +### Trade-offs + +- **Vault dependency.** The automation chain now requires a healthy Vault + cluster. If Vault is unavailable, executor builds cannot obtain credentials + and will fail. Mitigation: Vault HA, periodic health checks, runbook for + Vault outage. +- **Vault provider version pinning.** The product workspace requires the + `hashicorp/vault` Terraform provider. 
This must be available via the Census + proxy (or mirrored in the internal provider cache) and pinned to a tested + version. +- **One landing-zone role required per account.** The Proposer needs a + `sc-automation-proposer` role in each target account (see _Account baseline + prerequisite_ above) to create the per-product executor IAM role. This is a + one-time setup per account, lives in the account vending baseline, and is + narrower than today's equivalent (`iam:CreateRole` on `sc-automation-*` only). +- **Executor buildspec changes required.** `vault login` and `vault read` + calls must be added to the PRE_BUILD phase and the prior + `aws sts assume-role` pattern removed. + +### Out of scope for this ADR + +- Vault cluster sizing, HA topology, and DR strategy — tracked separately +- Census Vault namespace design (shared cluster vs. dedicated) — tracked separately +- Migration path for existing accounts already using the static-role model — tracked separately +- Slack / SNS notification on Vault credential issuance failures — tracked separately + +--- + +## Alternatives Considered + +**AWS IAM Roles Anywhere:** Lets workloads outside AWS obtain short-lived +credentials by presenting a certificate signed by a private CA. Requires +managing a private CA and distributing certificates to CodeBuild tasks. +More complex than Vault AWS auth (which reuses the existing IAM identity +already on the CodeBuild task) with no meaningful benefit in this context. +Rejected. + +**Keep static cross-account role assumption + add SCPs to restrict usage to +CodeBuild source IPs:** SCPs cannot restrict by source service (CodeBuild vs +an operator workstation with the same credentials), only by IP range. IP +ranges for CodeBuild are not stable or exclusive. Rejected. + +**AWS Secrets Manager dynamic secrets plugin:** AWS Secrets Manager does not +natively generate STS-assumed-role credentials on demand. The only supported +dynamic rotation pattern is for database passwords. Rejected. 
+ +**OIDC federation (GitHub Actions model):** GHE on-prem does not expose an +OIDC discovery endpoint compatible with the AWS IAM OIDC provider without +additional infrastructure. Vault AWS auth with the existing CodeBuild IAM +identity is simpler and requires no GHE configuration changes. Rejected. diff --git a/docs/decisions/003-vault-cluster-topology.md b/docs/decisions/003-vault-cluster-topology.md new file mode 100644 index 0000000..78fbd0c --- /dev/null +++ b/docs/decisions/003-vault-cluster-topology.md @@ -0,0 +1,134 @@ +# ADR-003: Vault Cluster Topology for SC Automation + +## In Plain Language + +Before we can implement ADR-002 (dynamic AWS credentials from Vault), we need to decide +*which* Vault cluster the SC automation system will talk to, how that cluster is organized, +and how CodeBuild builds will authenticate to it. + +This document records the topology decision: existing shared cluster vs. dedicated cluster, +namespace layout, and the auth method CodeBuild will use to prove its identity to Vault. + +**Status:** Proposed +**Date:** 2026-05-28 +**Depends on:** ADR-002 (`002-vault-aws-secrets-engine.md`) +**Jira:** [CSC-1346](https://jira.it.census.gov/browse/CSC-1346) + +--- + +## Context + +ADR-002 specifies that the CodeBuild executor will authenticate to Vault and request +short-lived AWS credentials from the Vault AWS Secrets Engine. But it deliberately +defers the question of *which* Vault cluster to use. Three viable topologies exist: + +### Option A — Shared Census Vault cluster, dedicated namespace + +Use an existing Census-managed Vault cluster (e.g. the platform Vault in csvd-prod +or a shared non-prod instance). Create a dedicated namespace (`sc-automation/`) so +that all SC automation policies, roles, and secrets engine mounts are isolated from +other tenants. 
+ +**Pros:** +- No new cluster to operate or HA-tune +- Shared cluster is already monitored, patched, and backed up +- Cost is shared across all tenants + +**Cons:** +- Dependency on another team's change-management cadence +- Namespace-level isolation is good but not complete cluster isolation +- Shared cluster outage affects all tenants simultaneously + +### Option B — Dedicated Vault cluster in csvd-dev + +Deploy a standalone Vault cluster (Integrated Storage / Raft, 3-node) in csvd-dev +`us-gov-west-1` specifically for SC automation. + +**Pros:** +- Full operational control; can tune lease TTLs, auth policies, and HA config + without coordinating with other teams +- Complete isolation — a misconfiguration in SC automation cannot affect other workloads +- Can be versioned and upgraded on our own schedule + +**Cons:** +- New operational burden: cluster patching, unseal key rotation, backup scheduling +- Requires 3 EC2 instances (or ECS tasks) and associated IAM/networking +- Higher cost for a single-tenant cluster + +### Option C — Vault on Kubernetes (ECS/EKS sidecar pattern) + +Run Vault as a sidecar container alongside CodeBuild tasks (dev/agent pattern), using +`vault agent` injector to deliver credentials to the build environment. + +**Pros:** No persistent cluster to manage +**Cons:** CodeBuild does not support sidecars natively; requires workaround; substantially +more complex than Options A or B. **Not recommended.** + +--- + +## Auth Method Decision + +Regardless of cluster topology, CodeBuild will authenticate to Vault using the +**AWS IAM auth method** (`auth/aws`). The CodeBuild service role ARN +(`arn:${AWS::Partition}:iam::229685449397:role/sc-automation-codebuild-role`) is +bound to a Vault role. When the executor build starts, `vault login` presents the +current IAM identity (via `GetCallerIdentity`) — no static tokens or secrets are +needed inside the build environment. 
+ +```hcl +# Vault IAM auth role (managed in sc-lambda-ghactions deploy/) +resource "vault_aws_auth_backend_role" "codebuild_executor" { + backend = "aws" + role = "sc-automation-codebuild" + auth_type = "iam" + bound_iam_principal_arns = ["arn:aws-us-gov:iam::229685449397:role/sc-automation-codebuild-role"] + token_ttl = 900 # 15 min — matches max CodeBuild build window + token_policies = ["sc-automation-executor"] +} +``` + +--- + +## Decision + +> **TO BE DECIDED** — this ADR is in Proposed state pending discussion with the +> platform / Vault operations team. + +Questions to answer before closing this ADR: + +1. Is there an existing Census Vault cluster available for non-prod workloads that + the SC automation team can use? What is its SLA? +2. Does the Census Vault team support dedicated namespaces for product teams? +3. What is the blast-radius / approval process for cluster-level changes on a + shared cluster that affect us? +4. Are there cost / account placement constraints that favour one topology? + +**Recommended default (pending discussion): Option A** — shared Census cluster with +a dedicated `sc-automation/` namespace. This avoids new operational burden while +still providing tenant isolation. Revisit if the shared cluster proves too slow to +change or if an outage directly impacts SC automation SLA. 
+ +--- + +## Consequences + +### If Option A (shared cluster, dedicated namespace) + +- Platform team must grant namespace admin rights to the SC automation team +- SC automation `deploy/` Terraform must include Vault provider config pointing at + the shared cluster +- Vault cluster URL and namespace become required Terraform variables + +### If Option B (dedicated cluster in csvd-dev) + +- New Terraform module required to stand up 3-node Raft cluster in csvd-dev +- Unseal key escrow procedure must be documented and tested +- Adds ~$X/month to csvd-dev bill (to be estimated) + +--- + +## Related + +- [ADR-002: Vault AWS Secrets Engine](./002-vault-aws-secrets-engine.md) — upstream decision +- [CSC-1345](https://jira.it.census.gov/browse/CSC-1345) — ADR-002 implementation ticket +- [CSC-1346](https://jira.it.census.gov/browse/CSC-1346) — this topology decision ticket diff --git a/docs/decisions/004-account-baseline-iam-role.md b/docs/decisions/004-account-baseline-iam-role.md new file mode 100644 index 0000000..bdb1fed --- /dev/null +++ b/docs/decisions/004-account-baseline-iam-role.md @@ -0,0 +1,164 @@ +# ADR-004: Account Baseline IAM Role for SC Automation Cross-Account Access + +## In Plain Language + +For our automation to run Terraform in a target AWS account, it needs AWS credentials +in that account. Right now the plan is a pre-created IAM role called +`sc-automation-codebuild-role` in every target account that trusts the CodeBuild +service role in csvd-dev. + +This document records what that role must look like, how it gets created at scale +across the org, and the lifecycle rules around updates and removal. + +**Status:** Accepted +**Date:** 2026-05-28 +**Jira:** [CSC-1344](https://jira.it.census.gov/browse/CSC-1344) +**Note:** If ADR-002/ADR-003 are fully implemented (Vault AWS Secrets Engine), the +`sts:AssumeRole` trust in this role is eventually replaced by a Vault-issued +credential. 
This role definition remains the correct minimum-privilege baseline +regardless of which credential mechanism is used. + +--- + +## Context + +The executor CodeBuild build runs in csvd-dev. To apply Terraform changes in a +target account (e.g. `123456789012-some-team-workload-dev-gov`), it must assume +a role in that account. + +### Current mechanism (static AssumeRole) + +``` +csvd-dev CodeBuild role (229685449397) + └─ sts:AssumeRole ─────────────────────────────────> + sc-automation-codebuild-role (in target account) + └─ trusts 229685449397 CodeBuild role +``` + +### Future mechanism (Vault dynamic credentials — ADR-002) + +``` +csvd-dev CodeBuild → vault login (IAM auth) → Vault AWS Secrets Engine + └─ Vault generates short-lived creds for sc-automation-codebuild-role +``` + +In both cases the **target-account role definition is the same** — only the +mechanism for obtaining credentials to it changes. + +--- + +## Role Specification + +### Role name + +`sc-automation-codebuild-role` + +Consistent name across all accounts enables a single `sts:AssumeRole` permission +in the csvd-dev CodeBuild role without per-account ARN enumeration. + +### Trust policy (static AssumeRole model) + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "AllowSCAutomationCodeBuild", + "Effect": "Allow", + "Principal": { + "AWS": "arn:aws-us-gov:iam::229685449397:role/sc-automation-codebuild-role" + }, + "Action": "sts:AssumeRole", + "Condition": { + "StringEquals": { + "sts:ExternalId": "${account_id}" + } + } + } + ] +} +``` + +The `ExternalId` condition requires the CodeBuild build to pass the target account ID, +preventing confused-deputy attacks across the org. + +### Permissions (minimum viable) + +The role needs enough access for the product workspaces it will run. 
For the initial +baseline, the following managed policies cover the standard layers: + +| Layer | Managed Policy | +|-------|---------------| +| `infrastructure/` | `arn:aws-us-gov:iam::aws:policy/AdministratorAccess` (scoped later) | +| `common/` | `arn:aws-us-gov:iam::aws:policy/IAMFullAccess` + `PowerUserAccess` | +| `vpc/` | `arn:aws-us-gov:iam::aws:policy/AmazonVPCFullAccess` | + +> **Future hardening:** Replace `AdministratorAccess` with a least-privilege +> customer-managed policy once product workspace IAM requirements are stable. +> Track in a follow-up ADR. + +### Permission boundary (recommended) + +Attach a permission boundary policy that prevents the role from escalating +privileges beyond the SC automation scope: + +```hcl +resource "aws_iam_role" "sc_automation" { + name = "sc-automation-codebuild-role" + assume_role_policy = data.aws_iam_policy_document.trust.json + permissions_boundary = "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:policy/sc-automation-boundary" +} +``` + +--- + +## Rollout Strategy + +Three options for provisioning this role at org scale: + +### Option A — CloudFormation StackSet (recommended) + +Use a `SERVICE_MANAGED` StackSet targeting the whole OU. Each new account +automatically gets the role on vending. Updates are propagated automatically. + +```yaml +# stackset template fragment +Resources: + SCAutomationRole: + Type: AWS::IAM::Role + Properties: + RoleName: sc-automation-codebuild-role + AssumeRolePolicyDocument: ... + ManagedPolicyArns: + - !Sub "arn:${AWS::Partition}:iam::aws:policy/AdministratorAccess" +``` + +### Option B — Terraform module via account-vending pipeline + +Add the role to the account baseline Terraform module that runs at account +creation time (if one exists). Requires a retroactive apply for existing accounts. + +### Option C — Manual per-account creation (not recommended) + +Acceptable only for the initial csvd-dev E2E test. Does not scale. 
+ +**Decision: Option A (StackSet)** for org-wide rollout. Option B as secondary +mechanism if a Terraform account-vending pipeline is in place. + +--- + +## Consequences + +- Every new account joining the target OU automatically gets `sc-automation-codebuild-role` +- The StackSet stack instance must be removed before an account can be decommissioned +- Changes to the role (policy updates, ExternalId rotation) are propagated via + StackSet stack instance updates — average propagation time ~15 min for large OUs + +--- + +## Related + +- [ADR-002: Vault AWS Secrets Engine](./002-vault-aws-secrets-engine.md) +- [ADR-003: Vault Cluster Topology](./003-vault-cluster-topology.md) +- [CSC-1344](https://jira.it.census.gov/browse/CSC-1344) — provisioning ticket +- [CSC-1348](https://jira.it.census.gov/browse/CSC-1348) — OU sharing / StackSet ticket diff --git a/docs/decisions/005-portfolio-org-sharing.md b/docs/decisions/005-portfolio-org-sharing.md new file mode 100644 index 0000000..19d2e60 --- /dev/null +++ b/docs/decisions/005-portfolio-org-sharing.md @@ -0,0 +1,141 @@ +# ADR-005: Service Catalog Portfolio Org-Wide Sharing Strategy + +## In Plain Language + +The sc-lambda-ghactions SC products live in a portfolio in csvd-dev. For teams in +other AWS accounts to provision those products, they need to see the portfolio. + +This document records how the portfolio is shared org-wide — whether through AWS +Resource Access Manager (RAM), CloudFormation StackSets, or a combination — and +what access controls govern who can see and launch products. + +**Status:** Proposed +**Date:** 2026-05-28 +**Jira:** [CSC-1348](https://jira.it.census.gov/browse/CSC-1348) + +--- + +## Context + +AWS Service Catalog supports **portfolio sharing** to other accounts or entire +organizational units via: + +1. **AWS RAM portfolio share** — shares the portfolio to an OU or the entire org.
+ Member accounts see the portfolio and must explicitly **accept** the share, unless it is + auto-accepted at the org level because RAM auto-accept is enabled for the OU. + +2. **CloudFormation StackSet (SERVICE_MANAGED)** — uses a CFN StackSet to deploy + an `AWS::ServiceCatalog::AcceptedPortfolioShare` + `AWS::ServiceCatalog::PortfolioPrincipalAssociation` + resource into every member account, granting the end-user principal (e.g. an + IAM role) launch permissions automatically. + +3. **Hybrid** — RAM share provides the underlying portfolio visibility; + a StackSet handles the per-account `AcceptedPortfolioShare` + principal + association so users don't have to accept manually. + +### Current state + +`deploy/service_catalog.tf` in this repo creates the portfolio and products in +csvd-dev but does not yet configure cross-account sharing. The only currently +working provisioning path is from within csvd-dev itself. + +--- + +## Decision + +**Use the hybrid approach (RAM share + StackSet for acceptance and principal binding).** + +### Step 1 — RAM portfolio share to the OU + +Add to `deploy/service_catalog.tf`: + +```hcl +resource "aws_servicecatalog_portfolio_share" "org" { + portfolio_id = aws_servicecatalog_portfolio.main.id + type = "ORGANIZATIONAL_UNIT" + principal_id = var.target_ou_arn # e.g.
"arn:aws-us-gov:organizations::111111111111:ou/o-xxxxxxxxxx/ou-xxxx-yyyyyyyy" + share_tag_options = false + wait_for_acceptance = false # StackSet handles acceptance +} +``` + +### Step 2 — StackSet for acceptance + principal association + +Deploy a `SERVICE_MANAGED` StackSet into the target OU from the management account +(or csvd-dev if delegated admin is configured): + +```yaml +Resources: + AcceptShare: + Type: AWS::ServiceCatalog::AcceptedPortfolioShare + Properties: + PortfolioId: !Ref PortfolioId # passed as StackSet parameter + + LaunchPermission: + Type: AWS::ServiceCatalog::PortfolioPrincipalAssociation + DependsOn: AcceptShare + Properties: + PortfolioId: !Ref PortfolioId + PrincipalARN: !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/SC-ProductLauncher" + PrincipalType: IAM +``` + +This gives any account in the OU automatic access to the portfolio without a +platform engineer having to accept shares manually. + +### Launch constraint + +Each product in the portfolio has a `LAUNCH` constraint that specifies the IAM +role used to provision the product in the member account. This role is the Lambda +`ServiceToken` principal in csvd-dev — no per-account launch role is needed since +the Lambda runs centrally. + +--- + +## Access Control + +| Who | Access | +|-----|--------| +| Any account in the target OU | Can see and provision products | +| Principal: `SC-ProductLauncher` IAM role | Has launch permission in their account | +| csvd-dev only | Portfolio admin / product version management | + +### Tag-based visibility (future) + +If products should only be visible to specific teams within an account, AWS Service +Catalog Tag Options can filter the portfolio view. Not implemented in initial rollout. + +--- + +## StackSet Placement + +The StackSet must be deployed from an account with **delegated administrator** rights +for Service Catalog (or from the management account). If csvd-dev is the delegated +admin for Service Catalog in this org, the StackSet can be managed from +`deploy/service_catalog.tf`.
+ +Confirm delegated admin status: +```bash +aws organizations list-delegated-administrators \ + --service-principal servicecatalog.amazonaws.com \ + --region us-gov-west-1 +``` + +--- + +## Consequences + +- New accounts joining the target OU automatically receive portfolio access (via + StackSet auto-deployment) +- The `target_ou_arn` variable must be added to `deploy/variables.tf` +- A separate CFN StackSet template file will live in `service-catalog/portfolio-share-stackset.yaml` +- If the OU changes (accounts move), RAM share scoping does not need to change — + the StackSet handles new instances automatically + +--- + +## Related + +- [ADR-004: Account Baseline IAM Role](./004-account-baseline-iam-role.md) +- [CSC-1344](https://jira.it.census.gov/browse/CSC-1344) — baseline IAM role (must exist before members can launch products) +- [CSC-1348](https://jira.it.census.gov/browse/CSC-1348) — implementation ticket for this decision diff --git a/docs/fleet-governance-at-scale.md b/docs/fleet-governance-at-scale.md new file mode 100644 index 0000000..0ac2483 --- /dev/null +++ b/docs/fleet-governance-at-scale.md @@ -0,0 +1,403 @@ +# Infrastructure Fleet Governance at Scale + +**Ported and generalized from:** `lambda-template-repo-generator/design-docs/EKS_CLUSTER_GOVERNANCE_AT_SCALE.md` +**Generalized from:** EKS-only → any Terraform workload managed through sc-lambda-ghactions +**Date:** 2026-05-19 +**Status:** DRAFT + +--- + +## Summary + +This document defines the governance model and work plan for operating the +sc-lambda-ghactions system at scale — across many provisioned workloads, many account +repos, and many product types (EKS clusters, S3 buckets, RDS instances, VPCs, etc.). + +The three requirements that drive the design: + +1. **Individual workloads** can be modified and updated granularly, without touching others. +2. **All workloads** can be managed centrally by CSVD — CSVD retains governance even as + provisioning is self-service for customers. +3. 
**Workload state lives in the customer's account repo**, in a dedicated folder per workload, + using a consistent Terragrunt structure. + +The overarching constraint: customers cannot realistically manage complex Terraform +infrastructure themselves. If CSVD gives up governance, they will be called in to +remediate. The solution must scale to many workloads while keeping CSVD in control of +configuration correctness and lifecycle. + +--- + +## The Fleet Repository: `terraform-sc-fleet` + +### Why a dedicated fleet repo + +The sc-lambda-ghactions Lambda and CodeBuild builds are the _provisioning plane_ — they +create repos and open initial PRs. GitHub Actions workflows are planned for a later +rollout phase and will replace the CodeBuild executor builds at that point. The _operations plane_ — applying ongoing changes, +fleet-wide version bumps, governance policy enforcement — belongs in a separate repo +that CSVD controls directly. + +**`SCT-Engineering/terraform-sc-fleet`** is this operations plane. It contains one +folder per managed workload instance, each of which is a Terraform module call pointing +at the relevant product workspace. 
+ +### Fleet repo structure + +``` +terraform-sc-fleet/ +├── workloads/ +│ ├── eks_cluster/ +│ │ ├── dev/ +│ │ │ ├── csvd/ +│ │ │ │ ├── csvd-dev-mcm/main.tf +│ │ │ │ └── csvd-lab-dja/main.tf +│ │ │ └── adsd/ +│ │ │ └── adsd-tools-dev/main.tf +│ │ └── prod/ +│ │ └── ois/ +│ │ └── ois-cribl-prod/main.tf +│ ├── s3_bucket/ +│ │ ├── dev/ +│ │ │ └── csvd/ +│ │ │ └── csvd-artifacts/main.tf +│ │ └── prod/ +│ └── {product_type}/ +│ └── {lifecycle}/ +│ └── {team}/ +│ └── {workload-name}/main.tf +├── scripts/ +│ ├── update_fleet.py # Fleet-wide apply runner +│ ├── maintenance_check.py # Window-aware update eligibility +│ └── fleet_query.py # Structured inventory queries +├── .github/ +│ └── workflows/ +│ └── regenerate-workspace.yml # Auto-updates fleet.code-workspace on push +├── fleet.code-workspace # Auto-generated VS Code workspace (all managed repos) +└── README.md +``` + +The directory tree encodes two dimensions: +- **Product type** (`eks_cluster`, `s3_bucket`, etc.) — matches `product_type` in the SC form +- **Lifecycle / team** (`dev/csvd`, `prod/ois`, etc.) — controls blast radius of fleet operations + +--- + +## Per-Workload Entry Files + +Each `workloads/{product_type}/{lifecycle}/{team}/{name}/main.tf` calls the corresponding +Terraform product workspace as a versioned external module: + +```hcl +# workloads/eks_cluster/dev/csvd/csvd-dev-mcm/main.tf +module "workload" { + source = "github.e.it.census.gov/SCT-Engineering/terraform-eks-deployment///?ref=v1.2.0" + + repo_name = "229685449397-csvd-dev-gov_apps-adsd-eks" # account repo + cluster_name = "csvd-dev-mcm" # folder inside that repo + account_name = "csvd-dev-gov" + aws_account_id = "229685449397" + aws_region = "us-gov-west-1" + vpc_name = "csvd-dev-ew-vpc-01" + # ... 
cluster-specific overrides +} + +locals { + maintenance_window = { + allowed_days = ["Tuesday", "Wednesday"] + allowed_hours = { start = 2, end = 6 } # UTC + blackout_dates = [] + } +} +``` + +Each entry file serves two roles simultaneously: +1. **Workload metadata** — authoritative record of the configuration CSVD intends for + this workload instance (versions, account, region, VPC, overrides) +2. **Injection location map** — specifies which account repo this workload's rendered HCL + was written into, and under which subfolder + +The `workloads/` tree as a whole is the **fleet map**: every workload CSVD manages has +an entry here. No external database, no spreadsheet. The source files are the inventory. + +--- + +## Account Repo Layout + +Each provisioned workload writes its rendered HCL into a folder inside a per-account +GHE repo. The folder path follows the account repo layer conventions: + +``` +{account-id}-{account-name}_apps-{team}/ +└── {product_type}/ + └── {workload-name}/ + ├── .sc-automation.yml # Written by Proposer; drives webhook executor + ├── config.json # Workload metadata (product_type, version pinned) + └── {region}/ + ├── remote_state.yml + └── {rendered HCL files} +``` + +**One account repo per account per team prefix** (e.g. `_apps-adsd-eks`, `_apps-csvd-platform`). +Multiple workload types and multiple instances of the same type can coexist in the same +account repo in separate subdirectories. + +--- + +## Separation of Concerns + +| Layer | Owner | Purpose | +|-------|-------|---------| +| Account repo (`{account}_apps-{team}/`) | Tenant team (read), CSVD (write via PR) | Source of truth for workload HCL config | +| `terraform-sc-fleet/workloads/` | CSVD | Central manifest; drives `tf apply` per workload | +| Product workspace (`terraform-eks-deployment`, etc.) 
| CSVD | Shared rendering logic and version defaults per product type | +| sc-lambda-ghactions Lambda + CodeBuild | CSVD | Provisioning UI; creates repo + initial config; webhook executor (initial rollout) | + +--- + +## Fleet Operations + +### Single-workload update + +```bash +cd terraform-sc-fleet/workloads/eks_cluster/dev/csvd/csvd-dev-mcm +source ~/aws-creds && tf apply +``` + +Opens a PR in that workload's account repo with the updated rendered HCL. Zero blast +radius to other workloads. + +### Fleet-wide update (`update_fleet.py`) + +```bash +# All workloads (dry run first) +python scripts/update_fleet.py --dry-run + +# All EKS clusters, dev lifecycle only +python scripts/update_fleet.py --product-type eks_cluster --lifecycle dev + +# Production workloads (requires --force) +python scripts/update_fleet.py --lifecycle prod --force + +# Filter by team +python scripts/update_fleet.py --team adsd + +# Filter by name substring +python scripts/update_fleet.py --filter csvd-lab +``` + +The script: +1. Walks `workloads/**/**/main.tf` recursively +2. Applies `--product-type`, `--lifecycle`, `--team`, `--filter` selectors +3. Checks `maintenance_window` locals — skips workloads outside their window unless `--force` +4. Runs `tf apply` (or `tf plan` for `--dry-run`) per workload +5. Reports per-workload success/failure with PR URLs + +**A version bump across 20 clusters is a one-liner.** Every additional workload costs CSVD +zero marginal effort for fleet-wide operations. + +### Maintenance windows + +Each entry file declares an optional `maintenance_window` local: + +```hcl +locals { + maintenance_window = { + allowed_days = ["Tuesday", "Wednesday"] + allowed_hours = { start = 2, end = 6 } # UTC + blackout_dates = ["2026-06-15", "2026-06-16"] + } +} +``` + +`update_fleet.py` reads this before each apply and skips out-of-window workloads. 
+Customers request a blackout window by opening a PR to their account repo modifying +`.sc-automation.yml` or by contacting CSVD to update the entry file. No out-of-band +emails or calendar coordination required. + +--- + +## Governance Controls + +### CODEOWNERS in provisioned account repos + +The Proposer build commits a `CODEOWNERS` file into every account repo it creates, +via `managed_extra_files` in the Terraform product workspace: + +``` +# CSVD owns all managed workload configuration +{product_type}/ @SCT-Engineering/csvd-platform-admins +``` + +Platform engineers in other teams may open PRs but cannot merge without CSVD approval. + +### Branch protection + +Branch protection (require PR, require CSVD review, no direct push to `main`) is set +at provisioning time via the `CSVD/terraform-github-repo` module call in each product +workspace. Every repo provisioned through sc-lambda-ghactions automatically gets these +rules at creation. + +### CODEOWNERS in `terraform-sc-fleet` + +The fleet repo itself uses a hierarchy-aware CODEOWNERS: + +``` +# Production workloads require senior review +workloads/*/prod/ @SCT-Engineering/csvd-senior-platform-admins + +# Dev/sandbox workloads can be approved by any CSVD engineer +workloads/*/dev/ @SCT-Engineering/csvd-platform-admins +``` + +--- + +## Fleet Workspace (`fleet.code-workspace`) + +A VS Code workspace file that includes all managed account repos and the fleet manifest +gives a CSVD engineer a full fleet view in a single editor window: + +```json +{ + "folders": [ + { "name": "fleet-manifest", "path": "." }, + { "name": "eks: csvd-dev-mcm", "path": "~/git/account-repos/229685449397-csvd-dev-gov_apps-adsd-eks" }, + { "name": "eks: adsd-tools-dev", "path": "~/git/account-repos/066884702657-ent-gov-shared-sa_apps-adsd-eks" }, + { "name": "s3: csvd-artifacts", "path": "~/git/account-repos/229685449397-csvd-dev-gov_apps-csvd-platform" } + // ... 
one entry per managed workload + ] +} +``` + +**This file is auto-generated** by a script in `terraform-sc-fleet` that is triggered +on every push to `main`. The script walks `workloads/**/**/main.tf`, extracts `repo_name` +and `workload_name`, and writes `fleet.code-workspace`. No operator ever edits it manually. + +> In the initial rollout this is a CodeBuild project triggered by a webhook on +> push to `main` in the `terraform-sc-fleet` repo. + +With this workspace open, a CSVD engineer can: +- See all workload configs side-by-side in the Explorer without navigating repos +- Ask Copilot fleet questions across all files at once: + _"Which EKS clusters are not on version 1.31?"_ + _"Show me all prod workloads and their maintenance windows"_ +- Grep across all workload configs simultaneously +- Open PRs to specific workload folders directly from the editor + +--- + +## AI Agents for Fleet Operations + +Because all workload config is declarative files in structured repos, AI agents can answer +operational questions without any custom database or API — **the workspace is the inventory**. + +### `sc-fleet` — Fleet Operator Agent + +Scoped to `fleet.code-workspace`. Answers operational questions across all managed workloads. + +Representative prompts: +- _"Which EKS clusters are not on version 1.31?"_ +- _"Show me all workloads in us-gov-east-1 and their account names"_ +- _"What's the maintenance window for adsd-tools-dev?"_ +- _"Which workloads have a pending update PR open right now?"_ + +### `sc-upgrade` — Version Bump Planning Agent + +Scoped to the relevant product workspace (e.g. `terraform-eks-deployment`). Plans and +validates fleet-wide or targeted version changes before applying. 
+ +Representative prompts: +- _"Plan an upgrade of EKS to 1.31 for all dev clusters"_ +- _"Which workloads can receive an update today based on their maintenance windows?"_ +- _"Show me the tf plan diff for bumping the S3 module version fleet-wide"_ + +### `sc-pr-reviewer` — Customer PR Review Agent + +Injected into each account repo via `managed_extra_files` as a `.github/copilot-instructions.md`. +Automatically summarizes incoming customer PRs and flags governance violations before +a CSVD engineer reviews. + +Representative uses (triggered by a CodeBuild build on PR open, or invoked manually): +- Classifies all changed fields and flags any that are CSVD-owned +- Determines if the change requires a maintenance window +- Produces a one-sentence plain-English summary for the CSVD reviewer + +### `sc-provisioner` — Provisioning Debug Agent + +Scoped to `sc-lambda-ghactions`. Helps debug provisioning failures and validate SC inputs. + +Representative prompts: +- _"The SC product failed — here's the CFN error. 
What went wrong?"_ +- _"Validate these SC input parameters before I submit"_ +- _"What HCL files would be generated for this cluster config?"_ + +--- + +## Proposed Skills (for `~/.copilot/skills/`) + +| Skill | Trigger phrases | What it does | +|-------|----------------|-------------| +| `sc-fleet-query` | "fleet status", "which workloads", "show me all" | Parses `workloads/**/**/main.tf`, returns structured inventory; accepts `--product-type`, `--filter`, `--field` | +| `sc-maintenance-check` | "maintenance window", "can I update", "what's due today" | Reads `maintenance_window` locals, returns workloads eligible for update on a given date | +| `sc-upgrade-planner` | "plan upgrade", "bump version" | Calls `update_fleet.py --dry-run`, returns per-workload plan summary; flags closed maintenance windows | +| `sc-pr-summary` | "review PR", "summarize this diff" | Fetches PR diff via GHE API, classifies changed fields, returns one-sentence summary + governance flag list | + +--- + +## Work Plan + +### Phase 1 — Create `terraform-sc-fleet` repo + +- [ ] Create `SCT-Engineering/terraform-sc-fleet` +- [ ] Move existing `terraform-eks-deployment/clusters/` entries into + `workloads/eks_cluster/{lifecycle}/{team}/{name}/main.tf` +- [ ] Update module source paths from `../../` to versioned external module reference +- [ ] Add `README.md`, `scripts/update_fleet.py` skeleton +- [ ] Add CodeBuild project to regenerate `fleet.code-workspace` on push to `main` *(GHA workflow planned for later rollout)* + +### Phase 2 — Wire sc-lambda-ghactions Proposer to write fleet entries + +- [ ] After Proposer creates the account repo and opens the PR, also commit a new + `workloads/{product_type}/{lifecycle}/{team}/{name}/main.tf` entry to `terraform-sc-fleet` +- [ ] SC form adds optional `team` and `lifecycle` parameters (default: `dev` + name-prefix heuristic) +- [ ] Lambda threads `team` and `lifecycle` to the Proposer CodeBuild build as `environmentVariablesOverride` + +### Phase 3 — 
Governance controls at provisioning time + +- [ ] Add `CODEOWNERS` and branch protection to every provisioned account repo + via `managed_extra_files` in each product workspace +- [ ] Add CODEOWNERS to `terraform-sc-fleet` scoped by lifecycle + +### Phase 4 — Fleet-wide update automation + +- [ ] Complete `scripts/update_fleet.py` with `--product-type`, `--lifecycle`, `--team`, + `--filter`, `--dry-run`, `--force` flags +- [ ] Add maintenance window parsing (`maintenance_window` locals) +- [ ] Add `scripts/maintenance_check.py` for window-aware eligibility reporting +- [ ] Wire a CodeBuild project as headless fleet runner (optional) + +### Phase 5 — AI agents and skills + +- [ ] Add `fleet.code-workspace` auto-generation CodeBuild project *(GHA workflow planned for later rollout)* +- [ ] Add copilot instructions to `terraform-sc-fleet` scoped for fleet operator queries +- [ ] Define `sc-fleet-query` and `sc-maintenance-check` skills under `~/.copilot/skills/` +- [ ] Add `.github/copilot-instructions.md` to provisioned account repos via `managed_extra_files` + +--- + +## Open Questions + +| # | Question | Owner | +|---|----------|-------| +| 1 | One account repo per workload type, or one per account? | Manuel / Don | +| 2 | Auto-merge for fleet version bumps in dev lifecycle, or always require review? | Matthew / Manuel | +| 3 | Who is CODEOWNER on `main` for each product type — a team or named individuals? | Manuel | +| 4 | Fleet-wide updates: CodeBuild headless runner, or CSVD engineer runs `update_fleet.py` manually? 
| David / Matthew | + +--- + +## Non-Goals + +- Customers self-managing Terraform — CSVD owns all Terraform execution +- Per-customer forks of product workspaces — single central workspace per product type +- Moving workload config to a database or external registry — `workloads/**` is the registry diff --git a/docs/generalized-terraform-product-architecture.md b/docs/generalized-terraform-product-architecture.md new file mode 100644 index 0000000..94610e6 --- /dev/null +++ b/docs/generalized-terraform-product-architecture.md @@ -0,0 +1,445 @@ +# Generalized Terraform Product Architecture + +**Date:** 2026-05-19 +**Status:** Proposed +**Audience:** Platform Engineering stakeholders +**Context:** Expanding the Service Catalog automation system beyond EKS to support any arbitrary Terraform template repo + +--- + +## Summary + +The Service Catalog (SC) automation system was originally built to create EKS cluster +GitHub repositories. This document describes a path to generalize that system so that +**any Terraform workload** — S3 buckets, RDS databases, VPCs, IAM roles, etc. — can be +onboarded as a new SC product with minimal engineering effort. + +The core Lambda infrastructure, webhook handler, and CodeBuild executor are already +workload-agnostic. The changes required to support a new product type are scoped to: + +1. A **template repo** on GitHub Enterprise +2. A set of **Jinja2 HCL/YAML templates** for the rendered files +3. A **Pydantic config model** (in the template repo's `handler.py`) describing the product's inputs +4. A **CloudFormation product template** for the Service Catalog form +5. A **`var.sc_products` entry** in `deploy_products/terraform.tfvars` to register the product in the portfolio + +No changes to the Lambda runtime, CodeBuild projects, or webhook infrastructure +are needed after the initial generalization work is complete. 
+ +--- + +## Current State (EKS-only) + +``` +SC Console (user fills EKS form) + └─> CFN Stack (Custom::GitHubRepository) + └─> Lambda (eks-terragrunt-repo-gen-template-automation) + ├─> Validates EKS-specific inputs (Pydantic model) + ├─> Fetches GHE token from Secrets Manager + ├─> Triggers executor CodeBuild build + └─> Polls build → returns repo URL + PR URL to CFN +``` + +The Lambda and CodeBuild executor are tightly coupled to EKS field names +(`cluster_name`, `vpc_name`, `vpc_domain_name`, etc.) and the +`template-eks-cluster` template repo. + +--- + +## Target State (Any Terraform Workload) + +``` +SC Console (user fills product form — any workload type) + └─> CFN Stack (Custom::TerraformRepo) + └─> Lambda (sc-template-automation) [shared, central] + ├─> Reads product_type from CFN properties + ├─> Routes to the correct Pydantic model + template set + ├─> Triggers executor CodeBuild build + └─> Returns repo URL + PR URL to CFN + +GitHub Enterprise (any account repo) + └─> push to main + └─> Lambda webhook handler (existing, already generic) + └─> Reads .sc-automation.yml → starts executor build +``` + +The Lambda becomes a **dispatcher**: `product_type` is a single new field in the +CFN `Properties` block that routes the request to the correct handler. 
+ +--- + +## What Is Already Generic + +The following components require **no changes** to support new product types: + +| Component | Why it is already generic | +|---|---| +| Webhook handler | Reads `.sc-automation.yml` from any repo; no workload awareness | +| `.sc-automation.yml` schema | `layer`, `region_dir`, `target_account_id` are workload-agnostic | +| Executor CodeBuild project | Runs `tf apply` in any Terraform workspace; env vars are injected at build time | +| HMAC signature verification | Workload-agnostic GHE push event handling | +| GHE commit status writeback | Writes ✅/❌ to any repo's merge commit | +| Lambda Function URL | Single entry point; no per-product URLs needed | + +--- + +## What Changes for Each New Product + +### 1. Template repo on GHE + +Create a new repo under `SCT-Engineering/` (e.g. `template-s3-bucket`) containing +**only the workload-specific files** — nothing else. + +Account repos already carry all standard scaffolding from initial setup: +`.tf-control`, `.tf-control.tfrc`, `region.tf`, `credentials.d/`, `variables.d/`, +and layer-level `remote_state.yml` files with account-specific values. +Duplicating any of that in a template repo would overwrite working values with +stubs and make the template non-reusable across accounts. + +A minimal template repo looks like: + +``` +template-s3-bucket/ +├── INF.s3-standard.tf.j2 # S3 bucket + policy resources +├── tf-run.data # REMOTE-STATE + tf-directory-setup + ALL steps +└── .sc-automation.yml.j2 # optional; Proposer writes a default if absent +``` + +The files are **flat**. `LAYER` and `REGION_DIR` are env vars already known to the +Proposer build — it writes the rendered files into `${ACCOUNT_REPO}/${LAYER}/${REGION_DIR}/` +at copy time. There is no reason to encode layer or region as directory structure +inside the template repo. 
+ +If the target layer does not yet exist in the account repo, the Lambda Pydantic +model constructs the layer-level `remote_state.yml` from SC form inputs +(`account_id`, `account_alias`, `bucket`, `profile`, `region`) and passes it to +the Proposer via `EXTRA_FILES`. The template repo never carries this file. + +### 2. Jinja2 templates + +Jinja2 template files (`.tf.j2`) live **in the template repo itself** — flat, alongside +`tf-run.data`. There is no separate `lambda/templates/` directory tree. The Proposer +build clones the template repo and renders every `.j2` file it finds, writing the +result (minus the `.j2` extension) into `${LAYER}/${REGION_DIR}/` in the account repo. + +Example for an S3 product: + +``` +template-s3-bucket/ +├── INF.s3-standard.tf.j2 # rendered → infrastructure/west/INF.s3-standard.tf +├── tf-run.data +└── .sc-automation.yml.j2 +``` + +Subdirectory variants (e.g. `standard/` vs `encrypted/`) are supported via the +`source_path` parameter — the Proposer copies only the named subdirectory's contents, +stripped of the subdirectory prefix. + +### 3. `handler.py` in the template repo + +`handler.py` lives **at the root of the template repo** alongside the Jinja2 templates. +It is the single place that defines everything the Lambda needs to know about the product: +required inputs, defaults, and any computed `EXTRA_FILES`. No files inside the Lambda +repository are created or modified. + +#### Contract + +| Symbol | Type | Purpose | +|--------|------|---------| +| `PRODUCT_TYPE` | `str` | Unique key; must match the `product_type` field in the CFN Properties block | +| `handle(props: dict) -> dict` | callable | Receives normalized CFN props; returns (possibly modified) props ready for `TfRunRequest` | + +`handle()` is called before `TfRunRequest` is constructed. 
It should: +- Apply product-specific defaults (`layer`, `region_dir`, `template_repo`, …) +- Validate required inputs (via a Pydantic model or plain assertions) +- Inject computed `extra_files` entries (e.g. a layer-level `remote_state.yml`) + +#### Example — complete `handler.py` for an S3 bucket product + +```python +# template-s3-bucket/handler.py +from __future__ import annotations +from pydantic import BaseModel +from typing import Literal + +PRODUCT_TYPE = "s3_bucket" + + +class _Config(BaseModel): + bucket_name: str + account_name: str + aws_account_id: str + account_alias: str + environment: Literal["dev", "test", "prod"] + layer: str = "infrastructure" + region_dir: str = "west" + aws_region: str = "us-gov-west-1" + team: str + workload: str + tier: str + + +def handle(props: dict) -> dict: + cfg = _Config(**{k: v for k, v in props.items() if k in _Config.model_fields}) + props.setdefault("layer", cfg.layer) + props.setdefault("region_dir", cfg.region_dir) + props.setdefault("template_repo", "template-s3-bucket") + # Inject layer-level remote_state.yml if the layer is new + props.setdefault("extra_files", {}) + props["extra_files"].setdefault( + f"{cfg.layer}/remote_state.yml", + _render_remote_state(cfg), + ) + return props + + +def _render_remote_state(cfg: _Config) -> str: + return ( + f"directory: \"{cfg.layer}\"\n" + f"profile: \"{cfg.aws_account_id}-{cfg.account_alias}\"\n" + f"bucket: \"inf-tfstate-{cfg.aws_account_id}\"\n" + f"bucket_region: \"us-gov-east-1\"\n" + f"region: \"{cfg.aws_region}\"\n" + f"account_id: \"{cfg.aws_account_id}\"\n" + f"account_alias: \"{cfg.account_alias}\"\n" + f"aws_environment: \"gov\"\n" + ) +``` + +Because `handler.py` is versioned in the template repo, the Pydantic model and defaults +evolve alongside the templates — no Lambda redeploy required. + +### 4. Lambda dispatcher — runtime fetch from template repo + +The Lambda has no `handlers/` directory and no handler registry. 
Instead, it fetches +`handler.py` directly from the template repo via the GHE API at request time and loads +it dynamically. The template repo is identified by the `template_repo` field already +present in the CFN Properties. + +``` +lambda/ +└── app.py ← one-time change: fetch + exec handler.py, then call handle(props) + no lambda/handlers/ directory, no lambda/models/ directory +``` + +#### How it works (design intent for `app.py`) + +```python +# 1. Read template_repo + optional ref from CFN props +template_repo = normalized.get("template_repo") # e.g. "template-s3-bucket" +template_repo_ref = normalized.get("template_repo_ref", "") # e.g. "v2.0.0"; empty = default branch +if not template_repo: + raise ValueError("template_repo is required") + +# 2. Fetch handler.py from GHE via the contents API, at the pinned ref +github_org = os.environ.get("GITHUB_ORG_NAME", "SCT-Engineering") +github_api = os.environ.get("GITHUB_API", "https://github.e.it.census.gov/api/v3") +ref_param = f"?ref={template_repo_ref}" if template_repo_ref else "" +handler_url = f"{github_api}/repos/{github_org}/{template_repo}/contents/handler.py{ref_param}" +# ...fetch with Authorization header, base64-decode the content... + +# 3. Load the module dynamically +import types +mod = types.ModuleType("_handler") +exec(compile(handler_source, "handler.py", "exec"), mod.__dict__) + +# 4. Validate the contract and dispatch +if not (callable(getattr(mod, "handle", None)) and getattr(mod, "PRODUCT_TYPE", None)): + raise ValueError(f"{template_repo}/handler.py must define PRODUCT_TYPE and handle()") +normalized = mod.handle(normalized) +# template_repo_ref is preserved in normalized; CodeBuild receives it as TEMPLATE_REPO_REF +tf_req = TfRunRequest(**normalized) +``` + +#### Security boundary + +The Lambda only fetches `handler.py` from repos whose name is in an allow-list prefix +(`template-*` within `SCT-Engineering`). 
The GHE token used has **read-only** scope on +template repos, so a compromised template repo cannot write to account repos via this +path. Handler execution is the only place arbitrary code runs — this is intentional and +auditable (every template repo change is a PR in the SCT-Engineering org). + +**Adding a new product type requires only creating a new template repo with a `handler.py`. +No Lambda code changes, no Lambda redeployment, no registry entries.** + +#### Version pinning + +The `template_repo_ref` property pins the exact git ref (tag, branch, or SHA) that both +the Lambda and the Proposer CodeBuild use. This is how template repo changes are gated +from production use. + +| Value | Behaviour | +|-------|-----------| +| `v2.0.0` (SemVer tag) | Lambda fetches `handler.py` at that tag; Proposer clones and checks out that tag. **Recommended for production.** | +| `main` (branch) | Always latest; appropriate for development and testing. | +| `abc1234` (SHA) | Exact commit; maximally stable but requires manual update. | +| *(absent / empty)* | GHE API returns the default branch; same as `main`. | + +**The ref is set as a static string in the CFN product template — it is not a +user-facing form parameter.** Bumping to a new version is an operator action: + +1. Tag the template repo: `git tag v2.0.0 && git push origin v2.0.0` +2. Update `template_repo_ref` in `service-catalog/{product_type}-product-template.yaml` +3. Run `tf apply` in `deploy_products/` with a bumped `version` key — this creates a new + SC provisioning artifact. Existing provisioned products are unaffected until they are + updated or re-provisioned. + +Because `template_repo_ref` flows through the Lambda to the CodeBuild `TEMPLATE_REPO_REF` +env var, the Lambda and the Proposer always run the **exact same version** of `handler.py` +and the Jinja2 templates — there is no split-brain risk between the two. + +### 5. 
CloudFormation product template + +The CFN template for a product type lives in `service-catalog/{product_type}-product-template.yaml` +inside the `sc-lambda-ghactions` repo. It follows the same pattern as the existing EKS template: + +- Parameters for user-facing form fields +- A single `Custom::TerraformRepo` resource +- Properties in `snake_case` (avoids PascalCase normalizer edge cases) +- `product_type` as a static string property +- `aws_account_id` and `aws_region` resolved via `!Sub` — not user-facing parameters + +```yaml +Properties: + ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:sc-template-automation" + product_type: s3_bucket + template_repo: template-s3-bucket + template_repo_ref: v2.0.0 # pinned; bump here + new SC artifact version to release + bucket_name: !Ref BucketName + account_name: !Ref AccountName + account_alias: !Ref AccountAlias + aws_account_id: !Sub "${AWS::AccountId}" + aws_region: !Sub "${AWS::Region}" + environment: !Ref Environment + team: !Ref Team + workload: !Ref Workload + tier: !Ref Tier +``` + +`template_repo` and `template_repo_ref` are static strings — they are not user-facing +parameters and do not appear on the SC product form. Changing them requires creating a +new SC provisioning artifact version (managed by `deploy_products/`). + +### 6. `deploy_products/` — dedicated Terraform workspace for SC product management + +SC portfolio and product registration live in a **dedicated `deploy_products/` workspace**, +separate from `deploy/` (which manages the Lambda and CodeBuild engine). This mirrors the +pattern established by `lambda-template-repo-generator`, which has `deploy/` for the +Lambda infrastructure and `deploy_product/` for the SC product registration. 
+ +``` +sc-lambda-ghactions/ +├── deploy/ ← Lambda, ECR, CodeBuild, IAM (engine — rarely touched) +└── deploy_products/ ← SC portfolio, products, S3, launch roles, OU sharing +``` + +The workspace is driven by two variables: + +```hcl +# deploy_products/terraform.tfvars + +sc_products = { + eks_cluster = { + name = "EKS Cluster Repository Creator" + description = "Creates an EKS cluster account repo from template-eks-cluster." + template = "${path.module}/../service-catalog/eks-cluster-product-template.yaml" + version = "2.0.0" + } + s3_bucket = { + name = "S3 Bucket Repository Creator" + description = "Creates an S3 bucket account repo from template-s3-bucket." + template = "${path.module}/../service-catalog/s3-bucket-product-template.yaml" + version = "1.0.0" + } +} + +# AWS Organizations OU IDs to share the portfolio with. +# Every account in these OUs will see the portfolio in their SC console automatically. +share_ous = [ + "ou-xxxx-xxxxxxxx", # platform-engineering + "ou-xxxx-yyyyyyyy", # app-teams +] +``` + +> **Version alignment:** the `version` key in `sc_products` is the **SC provisioning artifact +> label** displayed in the SC console (e.g. `"2.0.0"`). It is independent of, but should match, +> the `template_repo_ref` property baked into the CFN template file. Convention: +> bump `version` in tfvars and update `template_repo_ref` in the CFN YAML at the same time. + +Terraform iterates `var.sc_products` with `for_each` to create the S3 object, SC product, +provisioning artifact, and launch constraint for each entry. A single shared portfolio +(`aws_servicecatalog_portfolio`) is created once and shared to the OUs listed in +`var.share_ous` via `aws_servicecatalog_portfolio_share` — no per-account work required. + +**Adding a new product type:** +1. Create the template repo and CFN template (steps above) +2. Add one entry to `var.sc_products` in `deploy_products/terraform.tfvars` +3. 
Run `tf apply` in `deploy_products/` + +No census pipeline PR, no YAML config files, no Terragrunt, no AWS Console clicks. +The `deploy_products/` workspace replaces the `terraform-service-catalog-census` dependency +for this system entirely. Any account in the configured OUs gets access to new products +immediately after apply. + +--- + +## Onboarding Checklist for a New Product Type + +The following checklist can be handed to a product team or platform engineer to +onboard any new Terraform workload without Lambda or CodeBuild changes: + +- [ ] Create `SCT-Engineering/template-{product_type}` containing: + - `handler.py` — `PRODUCT_TYPE`, Pydantic model, `handle()` function + - flat `.tf.j2` file(s) rendered by the Proposer + - `tf-run.data` + - `.sc-automation.yml.j2` (optional; Proposer writes a default if absent) +- [ ] Tag the initial release: `git tag v1.0.0 && git push origin v1.0.0` +- [ ] Add `service-catalog/{product_type}-product-template.yaml` to `sc-lambda-ghactions`, + setting `template_repo_ref: v1.0.0` as a static property +- [ ] Add one entry to `var.sc_products` in `deploy_products/terraform.tfvars` + with `version = "1.0.0"` matching the tag +- [ ] Run `tf apply` in `deploy_products/` — creates S3 artifact, SC product, provisioning + artifact, launch constraint; all OU-member accounts see the new product immediately +- [ ] Validate end-to-end via `scripts/test_service_catalog.py` + +--- + +## Example: S3 Bucket Product + +An S3 bucket product would work as follows end-to-end: + +1. Platform engineer opens Service Catalog, selects **S3 Bucket Repository Creator** +2. Fills in: `bucket_name`, `team`, `workload`, `environment`, `tier` +3. CloudFormation creates a `Custom::TerraformRepo` stack with `product_type: s3_bucket` +4. Lambda validates inputs against `S3BucketConfig`, renders S3 Jinja2 templates +5. Proposer CodeBuild clones `template-s3-bucket`, commits rendered HCL, opens PR +6. CFN stack outputs: `repository_url`, `pull_request_url` +7. 
Platform engineer reviews and merges PR +8. Webhook fires → Lambda reads `.sc-automation.yml` → starts executor build +9. Executor runs `tf-run apply` on the rendered S3 workspace; posts ✅ commit status on merge commit + +The platform engineer never leaves GitHub or Service Catalog — there is no manual executor step. + +--- + +## Migration Path for Existing EKS Product + +The EKS product continues to work without modification. The `product_type` field defaults +to `eks_cluster` when absent, preserving backward compatibility with any existing +CloudFormation stacks or SC provisioned products. + +--- + +## Infrastructure Cost of Generalization + +| Resource | Current | After generalization | +|---|---|---| +| Lambda functions | 1 (EKS-only) | 1 (shared dispatcher) | +| CodeBuild projects | 2 (builder + creator) | 2 (no change) | +| Secrets Manager secrets | 2 (GHE tokens) + 1 (webhook) | No change | +| Lambda Function URL | 1 | No change | +| ECR repositories | 1 | No change | +| SC portfolios | 1 (EKS) | 1 (shared across all product types) | +| SC products | 1 | +1 per new product type (S3 object + SC product resource) | +| CFN StackSets | 0 | 1 (launch role deployed to all OU member accounts via `deploy_products/`) | + +Each new product type adds a single SC product + S3 artifact. No new Lambda functions, +no new CodeBuild projects, no new secrets — and no dependency on external pipeline teams. diff --git a/docs/repo-vars-and-secrets.md b/docs/repo-vars-and-secrets.md new file mode 100644 index 0000000..9af8576 --- /dev/null +++ b/docs/repo-vars-and-secrets.md @@ -0,0 +1,246 @@ +# Repository Variables and Secrets Management + +**Ported from:** `lambda-template-repo-generator/design-docs/REPO_VARS_AND_SECRETS.md` +**Updated for:** sc-lambda-ghactions (CodeBuild-based initial rollout; GHA planned for later) + +This document describes how environment variables and secrets are made available +to CodeBuild builds started by the sc-lambda-ghactions Lambda. 
+ +In the initial CodeBuild-based rollout, secrets and configuration values are +injected directly as CodeBuild environment variable overrides at build-start time +(via `environmentVariablesOverride` in the `StartBuild` API call). AWS Parameter +Store and Secrets Manager values are fetched by the Lambda and passed through, or +read directly by the CodeBuild buildspec at runtime. + +--- + +## Overview + +The Proposer CodeBuild build has access to: + +1. **Secrets** — read from AWS Secrets Manager; injected as CodeBuild env var overrides at build-start time or fetched in the buildspec via `aws secretsmanager get-secret-value` +2. **Configuration values** — read from AWS Parameter Store; fetched in the buildspec via `aws ssm get-parameter` + +Both are scoped by: +- **Global** — applied to every account repo regardless of product type +- **By product type** — applied only to repos of a specific `product_type` + +--- + +## AWS Parameter Store Structure + +``` +/sc-template-automation/ + ├── variables/ + │ ├── global/ # Variables set on every new repo + │ │ ├── AWS_REGION # e.g. us-gov-west-1 + │ │ └── TERRAFORM_VERSION # e.g. 1.9.1 + │ └── by-type/ # Variables by product_type + │ ├── eks_cluster/ + │ │ ├── CLUSTER_VERSION + │ │ └── NODE_TYPE + │ └── s3_bucket/ + │ └── ... +``` + +## AWS Secrets Manager Structure + +``` +sc-template-automation/ + ├── secrets/global/ # Secrets set on every new repo + │ └── AWS_ACCESS_KEY_ID # (if needed by CodeBuild buildspec) + └── secrets/by-type/ # Secrets by product_type + ├── eks_cluster/ + │ └── KUBECONFIG + └── s3_bucket/ + └── ... 
+``` + +--- + +## Lambda Infrastructure + +### IAM Permissions + +The Lambda execution role requires: + +```hcl +data "aws_iam_policy_document" "secrets_access" { + statement { + effect = "Allow" + actions = ["secretsmanager:GetSecretValue"] + resources = [ + "arn:${data.aws_partition.current.partition}:secretsmanager:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:secret:sc-template-automation/*" + ] + } + + # ListSecrets does not support resource-level permissions; it must be granted on "*". + statement { + effect = "Allow" + actions = ["secretsmanager:ListSecrets"] + resources = ["*"] + } +} + +data "aws_iam_policy_document" "ssm_access" { + statement { + effect = "Allow" + actions = ["ssm:GetParameter", "ssm:GetParameters", "ssm:GetParametersByPath"] + resources = [ + "arn:${data.aws_partition.current.partition}:ssm:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:parameter/sc-template-automation/*" + ] + } +} +``` + +### Lambda Environment Variables + +```hcl +environment { + variables = { + PARAM_STORE_PREFIX = "/sc-template-automation" + SECRETS_PREFIX = "sc-template-automation" + } +} +``` + +--- + +## Implementation — Building CodeBuild `environmentVariablesOverride` + +> **Note:** In the CodeBuild-based rollout, there is **no GHE repo secrets/variables API involved**. +> Secrets and configuration values are fetched by the Lambda at invocation time and passed +> directly to CodeBuild as `environmentVariablesOverride`. The GHE repo secrets approach +> is only relevant to the planned later GHA-based rollout. 
+The helper `build_env_overrides()` in `lambda/env_builder.py` assembles the override list: + +```python +import boto3 + +ssm = boto3.client("ssm", region_name="us-gov-west-1") +secretsmanager = boto3.client("secretsmanager", region_name="us-gov-west-1") + +PARAM_PREFIX = "/sc-template-automation" +SECRET_PREFIX = "sc-template-automation" + + +def _get_ssm_path(path: str) -> dict[str, str]: + """Return {name: value} for all SSM parameters under the given path.""" + paginator = ssm.get_paginator("get_parameters_by_path") + result = {} + for page in paginator.paginate(Path=f"{PARAM_PREFIX}/{path}", WithDecryption=True): + for p in page["Parameters"]: + name = p["Name"].split("/")[-1] + result[name] = p["Value"] + return result + + +def _get_secrets_arns(path: str) -> dict[str, str]: + """Return {name: ARN} for all Secrets Manager secrets under the given prefix.""" + paginator = secretsmanager.get_paginator("list_secrets") + result = {} + for page in paginator.paginate(Filters=[{"Key": "name", "Values": [f"{SECRET_PREFIX}/{path}"]}]): + for s in page["SecretList"]: + name = s["Name"].split("/")[-1] + # Record only the ARN; the secret value itself is never fetched by the Lambda + result[name] = s["ARN"] + return result + + +def build_env_overrides(product_type: str) -> list[dict]: + """ + Return a list of CodeBuild environmentVariablesOverride dicts containing: + - All global SSM variables + - All product-type SSM variables + - All global Secrets Manager secrets (type=SECRETS_MANAGER passed by ref) + - All product-type Secrets Manager secrets + """ + overrides = [] + + # Plain-text variables from SSM (fetched by Lambda, passed as PLAINTEXT) + for name, value in { + **_get_ssm_path("variables/global"), + **_get_ssm_path(f"variables/by-type/{product_type}"), + }.items(): + overrides.append({"name": name, "value": value, "type": "PLAINTEXT"}) + + # Secrets — passed as a Secrets Manager ARN reference so CodeBuild fetches at build time + # This avoids the Lambda ever holding 
plaintext secret values in memory beyond SSM calls. + for name, arn in { + **_get_secrets_arns("secrets/global"), + **_get_secrets_arns(f"secrets/by-type/{product_type}"), + }.items(): + overrides.append({"name": name, "value": arn, "type": "SECRETS_MANAGER"}) + + return overrides +``` + +> **`SECRETS_MANAGER` type:** When CodeBuild receives an env var with `type=SECRETS_MANAGER`, +> it resolves the value (an ARN) at build-start time using the CodeBuild service role — +> the Lambda never sees the plaintext secret value. + +### Integration in the Lambda Handler + +```python +def handle_create(props: dict): + product_type = props["product_type"] + # ... validate inputs (Pydantic), identify template repo ... + + # Build env var overrides from SSM + Secrets Manager + env_overrides = build_env_overrides(product_type) + + # Add per-invocation values from CFN properties + env_overrides += [ + {"name": "PRODUCT_TYPE", "value": product_type, "type": "PLAINTEXT"}, + {"name": "REPO_NAME", "value": props["project_name"], "type": "PLAINTEXT"}, + {"name": "ENVIRONMENT", "value": props["environment"], "type": "PLAINTEXT"}, + {"name": "AWS_ACCOUNT_ID","value": props["aws_account_id"],"type": "PLAINTEXT"}, + {"name": "AWS_REGION", "value": props["aws_region"], "type": "PLAINTEXT"}, + ] + + codebuild.start_build( + projectName=PROPOSER_PROJECT, + environmentVariablesOverride=env_overrides, + ) +``` + +--- + +## Populating Secrets and Variables + +### Add a global variable (all repos) + +```bash +export AWS_DEFAULT_REGION=us-gov-west-1 +aws ssm put-parameter \ + --name "/sc-template-automation/variables/global/TERRAFORM_VERSION" \ + --value "1.9.1" \ + --type "String" +``` + +### Add a product-type-specific secret + +```bash +export AWS_DEFAULT_REGION=us-gov-west-1 +aws secretsmanager create-secret \ + --name "sc-template-automation/secrets/by-type/eks_cluster/KUBECONFIG" \ + --secret-string "..." 
+``` + +--- + +## Security Considerations + +- **Encryption at rest:** All secrets are encrypted at rest with AWS-managed keys in Secrets Manager +- **Least privilege:** Lambda role scoped to `sc-template-automation/*` prefix only +- **Audit trail:** CloudTrail records all `GetSecretValue` and `GetParameter` calls +- **Repository isolation:** Secrets are injected per build via CodeBuild `environmentVariablesOverride`; they are not + set through the GHE repo secrets API, stored in the Lambda, or committed to the repo +- **No plaintext in Lambda env:** Secrets are fetched at runtime, not baked into + the container image or Lambda environment variables + +--- + +## Future Enhancements + +- **Secret rotation:** Implement automatic rotation for long-lived credentials +- **Environment-scoped secrets:** Dev/test/prod variants of secrets per repo +- **Organization-level SSM parameters:** Consolidate shared variables (e.g. `TERRAFORM_VERSION`) + into a single SSM path read once at Lambda invocation rather than duplicated across + every `by-type/` subtree, reducing SSM API call volume +- **Validation rules:** Reject variable names that conflict with CodeBuild reserved + names (e.g. `CODEBUILD_*`, `AWS_*` built-ins) diff --git a/docs/service-catalog-census-integration.md b/docs/service-catalog-census-integration.md new file mode 100644 index 0000000..a0cd9bf --- /dev/null +++ b/docs/service-catalog-census-integration.md @@ -0,0 +1,242 @@ +# Service Catalog Product Deployment + +**Previously titled:** Service Catalog Census Integration +**Updated for:** sc-lambda-ghactions +**Date:** 2026-05-20 +**Status:** DRAFT + +--- + +## Executive Summary + +All Service Catalog infrastructure for this system is managed from a single +**`deploy_products/`** Terraform workspace inside `sc-lambda-ghactions`. There is no +dependency on `terraform-service-catalog-census` or any external pipeline. 
+ +The workspace handles the complete deployment lifecycle: +- S3 upload of CFN product templates (versioned) +- SC portfolio + `aws_servicecatalog_portfolio_share` to org OUs +- SC products, provisioning artifacts, and launch constraints +- A CloudFormation StackSet that deploys the IAM launch role to every account in the shared OUs automatically + +Adding a new product type = one entry in `terraform.tfvars` + `tf apply`. No census repo +PRs, no Terragrunt, no YAML config files, no AWS Console clicks. + +--- + +## System Layout + +### sc-lambda-ghactions (all in one repo) + +``` +sc-lambda-ghactions/ +├── lambda/app.py ← Lambda handler (fetches handler.py from template repo at runtime) +├── service-catalog/{type}-template.yaml ← CFN product templates (one per product type) +├── deploy/ ← Engine: Lambda, ECR, CodeBuild, IAM, Function URL +└── deploy_products/ ← SC: portfolio, products, OU sharing, launch role StackSet + +SCT-Engineering/template-{product_type}/ ← one repo per product type; fully self-contained +├── handler.py ← PRODUCT_TYPE + Pydantic model + handle() +├── {workload}.tf.j2 ← Jinja2 HCL templates (flat) +├── tf-run.data ← tf-run orchestration steps +└── .sc-automation.yml.j2 ← optional webhook config template +``` + +### Two workspaces, two responsibilities + +| Workspace | Contains | Apply frequency | +|-----------|----------|-----------------| +| `deploy/` | Lambda, ECR, CodeBuild projects, IAM execution roles, Function URL | Rarely — only when the engine changes | +| `deploy_products/` | S3 templates, SC portfolio, products, artifacts, OU sharing, launch role StackSet | Whenever a new product type is added or a template version bumps | + +--- + +## Resource Classification + +| Resource | Workspace | Scope | +|----------|-----------|-------| +| Lambda function | `deploy/` | csvd-dev only (invoked cross-account via ServiceToken) | +| ECR repository | `deploy/` | csvd-dev only | +| CodeBuild projects (proposer + executor) | `deploy/` | csvd-dev only 
| +| Lambda cross-account invocation policy | `deploy/` | Org-wide via `aws:PrincipalOrgID` condition | +| S3 bucket + product template objects | `deploy_products/` | csvd-dev only | +| SC portfolio | `deploy_products/` | csvd-dev (shared to OUs) | +| `aws_servicecatalog_portfolio_share` | `deploy_products/` | All OU member accounts | +| SC products + provisioning artifacts | `deploy_products/` | csvd-dev (visible in shared accounts) | +| Launch role (IAM) | `deploy_products/` via CFN StackSet | All OU member accounts | +| Launch constraint | `deploy_products/` | Per product, references launch role ARN pattern | + +--- + +## Step 1 — Engine Infrastructure (`deploy/`) + +The Lambda is centralized in csvd-dev. CloudFormation in any org account invokes it +cross-account via the `ServiceToken` ARN. A single resource policy covers the whole org: + +```hcl +resource "aws_lambda_permission" "cloudformation_org" { + statement_id = "AllowCloudFormationOrgInvoke" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.sc_automation.function_name + principal = "cloudformation.amazonaws.com" + # aws_lambda_permission has no `condition` block; `principal_org_id` adds the + # `aws:PrincipalOrgID` condition to the resource policy statement. + principal_org_id = var.org_id +} +``` + +No per-account Lambda deployment. This resource lives in `deploy/` and is applied once. + +--- + +## Step 2 — SC Products & OU Sharing (`deploy_products/`) + +The `deploy_products/` workspace manages all SC resources from a single `tf apply`. + +### `var.sc_products` — product registry + +```hcl +# deploy_products/terraform.tfvars +sc_products = { + eks_cluster = { + name = "EKS Cluster Repository Creator" + description = "Creates an EKS cluster account repo from template-eks-cluster." + template = "${path.module}/../service-catalog/eks-cluster-product-template.yaml" + version = "2.0.0" + } + s3_bucket = { + name = "S3 Bucket Repository Creator" + description = "Creates an S3 bucket account repo from template-s3-bucket." 
+ template = "${path.module}/../service-catalog/s3-bucket-product-template.yaml" + version = "1.0.0" + } +} + +# AWS Organizations OU IDs — every account in these OUs sees the portfolio automatically +share_ous = [ + "ou-xxxx-xxxxxxxx", # platform-engineering + "ou-xxxx-yyyyyyyy", # app-teams +] +``` + +Terraform iterates `var.sc_products` with `for_each` to create: +- `aws_s3_object` — versioned CFN template in the artifacts bucket +- `aws_servicecatalog_product` — SC product backed by the S3 object +- `aws_servicecatalog_provisioning_artifact` — the working artifact version (the initial one created by `create-product` is deprecated automatically, as learned from `lambda-template-repo-generator`) +- `aws_servicecatalog_product_portfolio_association` — links product to the shared portfolio +- `aws_servicecatalog_constraint` — attaches the launch role + +### OU portfolio sharing + +```hcl +resource "aws_servicecatalog_portfolio_share" "ou" { + for_each = toset(var.share_ous) + + portfolio_id = aws_servicecatalog_portfolio.this.id + type = "ORGANIZATIONAL_UNIT" + principal_id = each.value + + share_principals = true # member accounts inherit principal associations +} +``` + +Every account in the listed OUs immediately sees all products in the portfolio — no +per-account work required. + +### IAM launch role — StackSet + +The IAM launch role must exist in every member account that will provision products. 
+`deploy_products/` manages a CloudFormation StackSet that deploys it org-wide: + +```hcl +resource "aws_cloudformation_stack_set" "launch_role" { + name = "sc-automation-launch-role" + permission_model = "SERVICE_MANAGED" + capabilities = ["CAPABILITY_NAMED_IAM"] + template_body = file("${path.module}/cfn/sc-launch-role.yaml") + + parameters = { + CentralAccountId = data.aws_caller_identity.current.account_id + LambdaRegion = data.aws_region.current.name + LambdaName = var.lambda_function_name + } + + auto_deployment { + enabled = true + retain_stacks_on_account_removal = false + } +} + +resource "aws_cloudformation_stack_set_instance" "ou" { + for_each = toset(var.share_ous) + + stack_set_name = aws_cloudformation_stack_set.launch_role.name + deployment_targets { + organizational_unit_ids = [each.value] + } +} +``` + +The CFN template for the launch role is a static file at +`deploy_products/cfn/sc-launch-role.yaml`. It is the same role as in +`lambda-template-repo-generator/deploy_product/main.tf` — CFN + CloudFormation +permissions + `lambda:InvokeFunction` on the central Lambda ARN — but parameterized +so it works across any account without hard-coding the Lambda account ID. + +### Launch constraint + +The constraint references the role deployed by the StackSet by name (`LocalRoleName`) rather than by ARN: + +```hcl +resource "aws_servicecatalog_constraint" "launch" { + for_each = var.sc_products + + portfolio_id = aws_servicecatalog_portfolio.this.id + product_id = aws_servicecatalog_product.this[each.key].id + type = "LAUNCH" + + parameters = jsonencode({ + # LocalRoleName is account-agnostic — Service Catalog resolves it to the role with this name in whichever member account provisions the product + LocalRoleName = "sc-automation-launch-role" + }) +} +``` + +--- + +## Adding a New Product Type + +1. Create `SCT-Engineering/template-{product_type}` with `handler.py`, `.tf.j2` files, `tf-run.data` +2. Add `service-catalog/{product_type}-product-template.yaml` to `sc-lambda-ghactions` +3. 
Add one entry to `var.sc_products` in `deploy_products/terraform.tfvars` +4. Run `tf apply` in `deploy_products/` — product is live in all OU member accounts immediately + +No census repo PRs. No Terragrunt. No YAML config files. + +--- + +## Moving the Lambda to a Different Account + +If the central Lambda needs to move accounts, only `deploy/` changes. The StackSet +launch roles reference the Lambda by name + account, managed via the `CentralAccountId` +and `LambdaName` StackSet parameters — a single `tf apply` in `deploy_products/` after +re-deploying `deploy/` propagates the updated ARN to all member accounts automatically. + +--- + +## Validation Checklist + +### After `deploy/` apply: +- [ ] Lambda resource policy allows org-wide CloudFormation invocation +- [ ] Cross-account test: invoke Lambda from a different account via CFN Custom Resource + +### After `deploy_products/` apply: +- [ ] StackSet instances show `CURRENT` for all target OUs +- [ ] Launch role exists in at least 2–3 spot-check workload accounts +- [ ] Portfolio visible in SC console in csvd-dev +- [ ] Portfolio shared to target OUs (verify from a workload account) +- [ ] Each product has an active provisioning artifact; initial broken artifact is deprecated +- [ ] Launch constraint references the correct role name +- [ ] End-to-end test: provision from a **workload account** (not csvd-dev) diff --git a/docs/template-management.md b/docs/template-management.md new file mode 100644 index 0000000..64584a3 --- /dev/null +++ b/docs/template-management.md @@ -0,0 +1,317 @@ +# Template Management + +This document describes how template repositories are structured and consumed by +the sc-lambda-ghactions system to add new workloads to existing account repos. + +--- + +## Core Principle: Templates are Delta Overlays + +Template repos do **not** contain a full account repo scaffold. 
Account repos +already carry all of the standard boilerplate from their initial setup: + +``` +{account-id}-{alias}/ +├── .tf-control # already there — toolchain version pin +├── .tf-control.tfrc # already there — plugin cache / provider mirror +├── .gitignore # already there +├── region.tf # already there +├── credentials.d/ # already there — per-region AWS credential files +├── variables.d/ # already there — profile + region auto.tfvars +├── common/ # existing layer with remote_state.yml, variables, etc. +├── infrastructure/ # existing layer ... +│ ├── remote_state.yml # already there — account-specific bucket/profile/account_id +│ ├── variables.common.tf # already there +│ └── west/ # existing workspace ... +└── vpc/ # existing layer ... +``` + +A template repo provides **only the new files** the Proposer writes into +that existing structure. If the template were to include `.tf-control`, +`region.tf`, `credentials.d/`, or `variables.d/`, it would: + +- **Overwrite working account-specific values** with placeholders or wrong defaults +- Be **non-reusable** across accounts (different profiles, regions, account IDs) +- Duplicate governance already managed by the `terraform/support` repo + +--- + +## What Belongs in a Template Repo + +A template repo is **flat**. It contains only the files that will be written +into `${LAYER}/${REGION_DIR}/` in the target account repo. The Proposer +already knows the destination path from the `LAYER` and `REGION_DIR` env vars +passed by the Lambda — there is no need to encode that in the template structure. 
+ +``` +template-{product_type}/ +├── {workload}.tf.j2 # workload resources — rendered into LAYER/REGION_DIR/ +├── tf-run.data # apply step sequence — copied into LAYER/REGION_DIR/ +└── .sc-automation.yml.j2 # optional — written to repo root (dotfiles are special-cased) +``` + +### Minimal real example — `template-s3-bucket` + +``` +template-s3-bucket/ +├── INF.s3-standard.tf.j2 # S3 bucket + policy resources +├── tf-run.data # REMOTE-STATE + tf-directory-setup + ALL +└── .sc-automation.yml.j2 +``` + +When the Proposer runs with `LAYER=infrastructure REGION_DIR=west`, these three +files land at: +``` +{account-repo}/infrastructure/west/INF.s3-standard.tf +{account-repo}/infrastructure/west/tf-run.data +{account-repo}/.sc-automation.yml ← dotfiles go to repo root +``` + +The same template repo works unchanged for any account, any region, any layer. +No account-specific values. No directory nesting. + +### Multiple variants in one repo (`TEMPLATE_SOURCE_PATH`) + +When a template repo holds more than one product variant, use subdirectories +and set `TEMPLATE_SOURCE_PATH` to select the one to use: + +``` +template-s3/ +├── standard/ +│ ├── INF.s3-standard.tf.j2 +│ └── tf-run.data +└── encrypted/ + ├── INF.s3-encrypted.tf.j2 + └── tf-run.data +``` + +With `TEMPLATE_SOURCE_PATH=encrypted`, the Proposer uses `encrypted/` as the +root and the nesting is stripped — files still land flat in `LAYER/REGION_DIR/`. + +### When the target layer does not yet exist + +If the workload requires adding a **brand-new layer** to the account repo +(e.g. adding `infrastructure/` to an account that only has `common/`), the +template still does not provide the layer-level `remote_state.yml`. 
Instead, +the Lambda's Pydantic model builds it from SC form inputs and passes it via +`EXTRA_FILES`: + +```python +# Inside the Lambda handler for this product type: +extra_files = { + f"{layer}/remote_state.yml": render_remote_state_yml( + directory=layer, + account_id=req.aws_account_id, + account_alias=req.account_alias, + bucket=f"inf-tfstate-{req.aws_account_id}", + bucket_region="us-gov-east-1", + profile=f"{req.aws_account_id}-{req.account_alias}", + region=req.aws_region, + aws_environment="gov", + ) +} +``` + +`EXTRA_FILES` are written by the Proposer **after** template rendering, so they +can never be accidentally provided by the template repo. The account-specific +values come from the validated Pydantic model, not from a `.j2` file. + +--- + +## Template Repository Conventions + +### `tf-run.data` — required, placed at the workspace root + +The template must include a `tf-run.data` at its root (it lands in +`LAYER/REGION_DIR/` after the Proposer copies it). Minimum content: + +``` +VERSION 1.0 +REMOTE-STATE +COMMAND tf-directory-setup.py --link none +TAG apply-start +ALL +``` + +- `REMOTE-STATE` instructs the Proposer to derive the workspace `remote_state.yml` + from the layer-level one (appending `/{workspace_name}` to `directory`). +- `COMMAND tf-directory-setup.py --link none` causes the Proposer to generate + `remote_state.backend.tf` + the three variant files. `--link none` is the + bootstrap state; the Executor re-links to `--link s3` after first apply. +- `TAG apply-start` lets an operator re-run from this point without re-running + the setup directives. + +### `.sc-automation.yml.j2` — optional, written to repo root + +Files whose names start with `.` at the template root are written to the +**account repo root**, not into `LAYER/REGION_DIR/`. This is how +`.sc-automation.yml.j2` ends up at the right place without a separate +mechanism. If absent, the Proposer writes a default `.sc-automation.yml` +from the Lambda's `TfRunRequest` model. 
+ +### `.terraform.lock.hcl` — include if possible + +Include a pre-generated `.terraform.lock.hcl` at the template root; it lands +in `LAYER/REGION_DIR/`. This avoids a from-scratch provider resolution on +first `tf-init` and gives reviewers visibility into locked provider versions. + +If omitted, the Executor generates it on first `tf-init` and commits it back +to `main` (tagged `[skip ci]`). + +--- + +## CFN Product Template Usage + +```yaml +Resources: + WorkloadRepo: + Type: Custom::TfRunPropose + Properties: + ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:sc-template-automation" + product_type: s3_bucket + account_repo: !Ref AccountRepo # e.g. 229685449397-csvd-dev-platform-dev-gov + layer: infrastructure + region_dir: west + aws_account_id: !Sub "${AWS::AccountId}" + aws_region: !Sub "${AWS::Region}" + # product-type-specific inputs (vary by Pydantic model): + bucket_name: !Ref BucketName + versioning_enabled: "true" +``` + +The Lambda's Pydantic model for `s3_bucket` validates the product-specific +inputs and builds `TEMPLATE_VARS` + any `EXTRA_FILES` (e.g. a new +`remote_state.yml` if the layer doesn't exist). The template repo supplies +only the generic `.tf.j2` and `tf-run.data`; the Lambda supplies all +environment-specific values. + +--- + +## Subdirectory Templates (`TEMPLATE_SOURCE_PATH`) + +See the [Multiple variants](#multiple-variants-in-one-repo-template_source_path) +section above. The `TEMPLATE_SOURCE_PATH` env var tells the Proposer which +subdirectory of the template repo to treat as the root. The selected subtree +is still rendered flat into `LAYER/REGION_DIR/` — the subdirectory path is +stripped entirely. + +--- + +## Proposer Build — What It Does + +The Proposer CodeBuild build clones the **existing account repo** and writes the +template delta on top of it. Steps in order: + +1. `git clone` the account repo; `git checkout -B ${GIT_BRANCH}` +2. 
If `TEMPLATE_REPO` is set: clone it (optionally at `source_path`), render all `.j2` + files with Jinja2 `StrictUndefined`, copy non-`.j2` files as-is, all at the same + relative paths. Account repo files that the template does not touch are left unchanged. +3. Write any `EXTRA_FILES` entries (path → content map from the Lambda model; overrides + template output). Typical use: new layer-level `remote_state.yml` when the target + layer does not yet exist in the account repo. +4. **REMOTE-STATE bootstrap** — for every `tf-run.data` found that contains a `REMOTE-STATE` + directive: read the layer-level `remote_state.yml` already present in the account repo + (or just written via `EXTRA_FILES`), append `/{workspace_name}` to the `directory` field, + write the result as `remote_state.yml` in the workspace directory. This mirrors exactly + what `tf-run.sh` does at apply time. +5. **`tf-directory-setup.py --link none`** — for every workspace directory that now has a + `remote_state.yml`, run `tf-directory-setup.py --link none` to generate: + - `remote_state.backend.tf` — S3 backend block + - `remote_state.{dir}.tf.s3` / `.local` / `.none` variant files + - Symlink `remote_state.{dir}.tf → remote_state.{dir}.tf.none` (bootstrap state) +6. Write `.sc-automation.yml` at the repo root if absent on `main`. +7. `git add -A && git commit && git push && gh pr create` + +> **Principle: the PR diff is the complete truth.** Every file the Executor will see +> at apply time is committed in the Proposer PR. The Executor never silently creates +> files; its `REMOTE-STATE` and `tf-directory-setup.py` steps are idempotent overwrites. + +--- + +## `.sc-automation.yml` — Automation Config File + +Every account repo that participates in sc-lambda-ghactions automation must have a +`.sc-automation.yml` file at the repo root. The Proposer writes this file when it +creates the initial PR if it does not already exist on `main`. 
+ +### Schema + +```yaml +# .sc-automation.yml +product_type: eks_cluster # Must match PRODUCT_TYPE in template repo's handler.py +executor_project: sc-executor # CodeBuild project name for the Executor build +dry_run: true # If true, Executor runs tf plan only (no apply) +template_repo: SCT-Engineering/template-eks-cluster # Source template repo +template_source_path: "" # Subdirectory within template repo; empty = root +fleet_entry: workloads/eks_cluster/prod/my-cluster/main.tf # Path in terraform-sc-fleet +variables: # Extra key/value pairs injected as CodeBuild env vars + CLUSTER_VERSION: "1.29" + NODE_TYPE: m5.xlarge +``` + +| Field | Required | Description | +|---|---|---| +| `product_type` | ✅ | Routes to the correct Pydantic model and template directory | +| `executor_project` | ✅ | CodeBuild project started by the webhook on PR merge | +| `dry_run` | ✅ | `true` → `tf plan` only; `false` → `tf apply` | +| `template_repo` | ✅ | GHE repo used as the Jinja2 template source; the Lambda fetches it at the ref specified by `template_repo_ref` in the CFN template | +| `template_source_path` | ❌ | Subdirectory within `template_repo`; omit for whole-repo templates | +| `fleet_entry` | ❌ | Relative path of this workload's entry in `terraform-sc-fleet` | +| `variables` | ❌ | Product-type-specific overrides; merged with SSM global defaults | + +> **Versioning:** The Executor reads `.sc-automation.yml` from `main` at build time, not from the +> PR branch, so changes to it take effect on the next automation run without requiring a re-render. + +--- + +## Executor Build — What It Does + +The Executor does **not** render templates or open PRs. It only runs Terraform. + +After a platform engineer merges the Proposer PR to `main`: +1. GHE push webhook → Lambda reads `.sc-automation.yml` → starts `tf-run-executor` +2. Executor clones the account repo at `main` (all files already committed by the Proposer PR) +3. 
Optionally assumes cross-account IAM role (`TARGET_ACCOUNT_ID`) +4. `cd ${LAYER}/${REGION_DIR}`; runs `tf-run plan` or `tf-run apply` +5. After successful apply: commits `remote_state.{dir}.tf` symlink re-link + + `.terraform.lock.hcl` updates directly to `main` with `[skip ci]` + +The Executor does not touch any file that wasn't already committed in the PR. +It carries no template-repo knowledge and no Jinja2 dependencies. + +### Idempotency + +The Executor is safe to re-run. If `tf-run apply` produces no infrastructure +changes and the post-apply file diff is empty, the commit step is skipped. + +--- + +## Security Considerations + +- **Source path validation:** The Proposer validates that `source_path` (if provided) + exists in the template repo before proceeding. Path traversal (`../`) is rejected. +- **File type restrictions:** Only `.tf`, `.hcl`, `.yml`, `.yaml`, `.md`, `.j2`, + and standard dotfiles are copied. Binary files and executables are rejected. +- **Template repo access:** The GHE token injected into the CodeBuild environment + has read-only access to `SCT-Engineering/template-*` repos and read-write access + only to the target account repo. 
+ +--- + +## Adding a New Product Type + +Checklist when onboarding a new product type: + +- [ ] Create `SCT-Engineering/template-{product_type}` containing: + - `handler.py` — `PRODUCT_TYPE`, Pydantic model, `handle()` function + - flat `.tf.j2` file(s) (rendered into `${LAYER}/${REGION_DIR}/` by the Proposer) + - `tf-run.data` + - `.sc-automation.yml.j2` (optional) + **No Lambda handler code (`lambda/app.py`) needs to be created or modified.** +- [ ] Tag the initial release: `git tag v1.0.0 && git push origin v1.0.0` +- [ ] Add `service-catalog/{product_type}-product-template.yaml` to `sc-lambda-ghactions`, + setting `template_repo_ref: v1.0.0` as a static property in the CFN `Properties` block +- [ ] Add one entry to `var.sc_products` in `deploy_products/terraform.tfvars` + with `version = "1.0.0"` matching the tag +- [ ] Run `tf apply` in `deploy_products/` — portfolio, product, artifact, launch roles, + and OU sharing are updated automatically; all OU-member accounts see the change immediately diff --git a/docs/vault-aws-secrets-engine.md b/docs/vault-aws-secrets-engine.md new file mode 100644 index 0000000..af67caf --- /dev/null +++ b/docs/vault-aws-secrets-engine.md @@ -0,0 +1,464 @@ +# HashiCorp Vault for Cross-Account Automation at Census +**Audience:** CSVD Engineering / sc-lambda-ghactions Stakeholders +**Date:** June 2026 +**Author:** David Arnold (`arnol377`) +**Related Jira:** [CSC-1345](https://jira.it.census.gov/browse/CSC-1345) · [CSC-1346](https://jira.it.census.gov/browse/CSC-1346) + +--- + +## 1. The Problem + +The `sc-lambda-ghactions` system automates AWS Service Catalog provisioning by running +`tf-run apply` via CodeBuild. To do that across multiple AWS accounts, CodeBuild needs +temporary credentials for each target account. + +**Current state:** no cross-account credential mechanism exists. 
+ +**The naive fix (and why we rejected it):** +Add a trust policy to `r-inf-terraform` in every account that allows the CodeBuild IAM +role from `csvd-dev` to assume it. This requires: + +- A change to the management account StackSet `allow_assume_role_tf` parameter +- Trust policy propagation to **every org account** — ~450+ and growing +- Each new account onboarded requires the trust to already be in place +- Long-lived STS sessions (up to 1 hour) with no per-use audit trail + +**The right fix:** Vault AWS Secrets Engine. + +--- + +## 2. What Is HashiCorp Vault? + +Vault is a secrets management platform that controls access to tokens, passwords, +certificates, and cloud credentials. Its core value propositions are: + +| Capability | What It Means | +|---|---| +| **Dynamic Secrets** | Credentials generated on demand, expire automatically | +| **Centralized Policy** | One policy engine controls access across all secret types | +| **Audit Log** | Every read, write, and auth event logged with identity + metadata | +| **Identity-Based Access** | "Who are you?" not "What password do you know?" | +| **Encryption as a Service** | Encrypt/decrypt data without exposing keys | + +For our immediate use case, we care about two specific features: +- **AWS Secrets Engine** — generates dynamic IAM credentials per target account +- **IAM Auth Method** — lets CodeBuild authenticate using its own AWS IAM identity (no static creds) + +--- + +## 3. How It Solves the Cross-Account Problem + +``` +CodeBuild (csvd-dev, tf-run-executor-codebuild role) + │ + │ 1. "I am tf-run-executor-codebuild in 229685449397" + │ (signed by AWS STS — no password, no token) + ▼ +Vault Server (IAM Auth Method) + │ + │ 2. Validates identity via AWS STS GetCallerIdentity + │ 3. Checks policy: "executor role may request adsd-dev creds" + │ 4. 
Generates short-lived IAM key pair for target account + ▼ +CodeBuild receives: + AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY (TTL: 15 min) + │ + ▼ +tf-run apply runs in target account +Credentials expire automatically — nothing to rotate, nothing to leak +``` + +### What changes vs. the StackSet approach + +| Concern | StackSet Trust | Vault | +|---|---|---| +| Per-account setup | Trust policy in every account | Vault AWS backend role per account | +| New account onboarding | StackSet propagation (slow, blast radius) | Add one Vault role (seconds) | +| Credential lifetime | STS session: up to 1 hour | Configurable: 15 min recommended | +| Audit trail | CloudTrail (account-level) | Vault audit log (every access, centralized) | +| Revocation | Cannot revoke active STS session | Vault can revoke any lease instantly | +| Policy changes | StackSet → CloudFormation → IAM (slow) | `vault policy write` (instant) | + +--- + +## 4. Security Benefits + +### 4.1 Dynamic Credentials — Nothing to Rotate + +Static IAM access keys are a top attack vector (OWASP A02: Cryptographic Failures / +misconfigured credentials). 
With Vault: + +- No long-lived keys stored anywhere — not in Parameter Store, not in environment variables +- Every invocation gets a **unique, time-limited key pair** +- Expiry is enforced by Vault, not by developer discipline +- Compromise of one set of credentials is contained to a 15-minute window and one build job + +### 4.2 Every Access Is Audited + +Vault's audit device logs every auth event, secret read, and policy check with: + +```json +{ + "time": "2026-06-02T14:32:01Z", + "type": "response", + "auth": { + "client_token": "...", + "accessor": "...", + "display_name": "aws-tf-run-executor-codebuild", + "policies": ["default", "sc-automation-executor"], + "metadata": { + "account_id": "229685449397", + "iam_principal_arn": "arn:aws-us-gov:iam::229685449397:role/tf-run-executor-codebuild" + } + }, + "request": { + "path": "aws/creds/adsd-dev", + "operation": "read" + } +} +``` + +This gives you a **complete, tamper-evident record** of which automation job +requested credentials for which account, at what time — satisfying NIST 800-53 +AU-2, AU-3, AU-9 audit requirements. + +### 4.3 Principle of Least Privilege — Enforced Centrally + +Vault policies are written in HCL and version-controlled. Access to any given +account's credentials requires an explicit policy grant: + +```hcl +# Only allow executor to request creds for accounts it's authorized for +path "aws/creds/adsd-*" { + capabilities = ["read"] +} + +path "aws/creds/csvd-*" { + capabilities = ["read"] +} + +# Deny everything else explicitly +path "aws/*" { + capabilities = ["deny"] +} +``` + +No IAM policy sprawl, no StackSet blast radius. One file, version-controlled, +reviewed like any other code change. + +### 4.4 Break-Glass Revocation + +If a CodeBuild build is compromised mid-run, a Vault admin can: + +```bash +vault lease revoke -prefix aws/creds/adsd-dev +``` + +All active credentials for that backend are instantly invalidated — faster than +rotating an IAM key pair manually. 
+ +### 4.5 Alignment with NIST 800-53 Controls + +| NIST Control | Requirement | Vault Feature | +|---|---|---| +| **IA-5** | Authenticator Management — no long-lived passwords | Dynamic secrets, auto-expiry | +| **AC-3** | Access Enforcement | Policy engine per path | +| **AC-17** | Remote Access | IAM Auth — cryptographic identity | +| **AU-2/3/9** | Audit Events, Content, Protection | Audit devices, tamper-evident log | +| **SC-12** | Cryptographic Key Establishment | Transit Secrets Engine (if needed later) | +| **CM-6** | Configuration Settings | Policies in version control | + +--- + +## 5. Automation Benefits + +### 5.1 Zero-Touch Account Onboarding + +When a new AWS account is bootstrapped, the only Vault step is: + +```bash +vault write aws/roles/new-account-name \ + credential_type=iam_user \ + policy_arns="arn:aws-us-gov:iam::aws:policy/AdministratorAccess" + +vault policy write sc-executor-new-account - << EOF +path "aws/creds/new-account-name" { capabilities = ["read"] } +EOF +``` + +Two commands. No StackSet, no CFN stack, no trust policy update. The executor +immediately has the ability to provision into that account. 
+ +### 5.2 CodeBuild Integration Is Simple + +In `buildspec-executor.yml`, the existing `sts:AssumeRole` block becomes: + +```yaml +pre_build: + commands: + - | + if [ -n "$TARGET_ACCOUNT_ID" ]; then + # Authenticate to Vault using this CodeBuild job's IAM identity + VAULT_TOKEN=$(vault write -field=token auth/aws/login \ + role="sc-automation-executor" \ + iam_http_request_method="POST" \ + iam_request_url="$(base64 <<< 'https://sts.us-gov-west-1.amazonaws.com/')" \ + iam_request_body="$(base64 <<< 'Action=GetCallerIdentity&Version=2011-06-15')" \ + iam_request_headers="$(vault-aws-auth-header)") + + # Request short-lived credentials for the target account + CREDS=$(vault read -format=json aws/creds/${CROSS_ACCOUNT_ROLE}) + export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.data.access_key') + export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.data.secret_key') + export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.data.security_token') + fi +``` + +No secrets stored in environment variables. No secrets in SSM Parameter Store. +The only trust relationship needed is between CodeBuild's IAM role and Vault's +IAM auth endpoint — which is a single Vault config entry, not a per-account change. + +### 5.3 The `cross_account_role` Field Is Already Wired + +The `sc-lambda-ghactions` system already passes `CROSS_ACCOUNT_ROLE` from the +`.sc-automation.yml` file through to CodeBuild. Vault just becomes the consumer +of that field name — no CFN template changes, no Lambda changes needed. + +--- + +## 6. Government / Compliance Considerations + +> **Important:** This section is critical for Census. Read before approving deployment. + +### 6.1 Vault OSS License: Business Source License (BSL 1.1) + +In August 2023, HashiCorp changed Vault (and Terraform) from MPL 2.0 to +**Business Source License (BSL) 1.1**. 
+ +**Key BSL terms:** +- Free to use for **any internal purpose**, including government automation +- Restriction applies only to building a **competing product** (a commercial secrets + management service sold to others) +- After **4 years**, the code converts to MPL 2.0 automatically +- No per-seat or per-server fees for self-hosted OSS usage + +**For Census Bureau use:** ✅ BSL is acceptable. Census is not building a competing +commercial secrets management product. Using Vault to automate internal AWS +infrastructure is squarely within permitted BSL use. + +**However:** Legal should formally bless this before production deployment, as BSL +is a relatively new license and some agencies have blanket policies against non-OSI-approved +licenses (BSL is **not** OSI-approved). + +### 6.2 OpenBao — The True OSS Alternative + +If BSL creates a legal / policy blocker, **OpenBao** is a drop-in replacement: + +| | Vault OSS (BSL) | OpenBao | +|---|---|---| +| License | BSL 1.1 (not OSI-approved) | **MPL 2.0** (OSI-approved) | +| Fork basis | — | Vault 1.14.x | +| API compatibility | — | 100% compatible | +| Governance | IBM/HashiCorp | Linux Foundation | +| FIPS build | Enterprise only | Community FIPS build available | +| Support | HashiCorp Enterprise contract | Community + vendors | + +OpenBao is the recommended path if legal flags BSL. The implementation is identical — +same API, same SDK, same `vault` CLI commands. + +### 6.3 FIPS 140-2 / 140-3 Requirement + +Federal systems processing sensitive data must use **FIPS 140-2 validated cryptographic +modules** (NIST SP 800-131A, OMB M-19-17). 
+ +| Build | FIPS Status | +|---|---| +| Vault OSS | ❌ No FIPS-validated modules | +| Vault Enterprise + FIPS build | ✅ FIPS 140-2 validated (NSS/BoringCrypto) | +| OpenBao (FIPS build) | ⚠️ FIPS-mode crypto via Go FIPS fork — module is **not** FIPS 140-2 validated | +| OpenBao + BoringCrypto | 🔄 Community working on validated build | + +**Recommendation:** +- **Non-production / dev:** Vault OSS or OpenBao standard build is fine +- **Production ATO:** Vault Enterprise with FIPS build, OR work with your ISSO to + determine if OpenBao's BoringCrypto build satisfies the ATO boundary + +### 6.4 FedRAMP + +**FedRAMP is for cloud service providers (CSPs), not agencies.** +Census Bureau does not need Vault itself to be FedRAMP authorized. You need: + +1. Vault to run on **FedRAMP-authorized infrastructure** → ✅ AWS GovCloud (us-gov-west-1) + is FedRAMP High authorized +2. Vault to be included in the **system boundary** of an agency ATO +3. The ISSO and AO to authorize Vault as a software component under FISMA + +**HCP Vault Dedicated** (HashiCorp's cloud-hosted Vault) does hold FedRAMP Moderate +authorization — but that is a separate product and would require routing traffic to +HashiCorp's infrastructure, which may not be acceptable for GovCloud workloads. + +**Recommended path:** Self-hosted Vault OSS/Enterprise on AWS GovCloud, included in +the existing Census ATO boundary. This is the same pattern used by other FISMA-High +federal agencies running Vault on GovCloud. 
+ +### 6.5 Compliance Summary + +| Requirement | Vault OSS | Vault Enterprise | OpenBao | Notes | +|---|---|---|---|---| +| BSL license | ✅ internal use OK | ✅ | ✅ MPL 2.0 | Legal sign-off needed for BSL | +| FIPS 140-2 | ❌ | ✅ FIPS build | 🔄 in progress | Required for production ATO | +| FedRAMP (self-hosted) | ✅ via agency ATO | ✅ via agency ATO | ✅ | Not a Vault property — agency ATO | +| AWS GovCloud compatible | ✅ | ✅ | ✅ | Runs on any compute | +| NIST 800-53 audit controls | ✅ audit log | ✅ | ✅ | All builds have audit devices | +| No long-lived credentials | ✅ | ✅ | ✅ | Core Vault capability | + +--- + +## 7. Deployment Architecture + +``` +AWS GovCloud (us-gov-west-1) +└── VPC: csvd-dev-gov + + ┌─────────────────────────────────────┐ + │ ECS Fargate / EC2 (TBD: CSC-1346) │ + │ │ + │ Vault Server (HA cluster) │ + │ ├── Backend: S3 (encrypted) │ + │ ├── HA: DynamoDB lock │ + │ ├── Unseal: AWS KMS auto-unseal │ + │ ├── Audit: CloudWatch Logs │ + │ └── TLS: ACM Private CA │ + └──────────────────┬──────────────────┘ + │ + ┌──────────▼──────────┐ + │ Vault Backends │ + ├─────────────────────┤ + │ aws/ ← dynamic │ + │ roles/adsd-dev │ → IAM credentials for 015325649777 + │ roles/csvd-dev │ → IAM credentials for 229685449397 + │ roles/ditd-prod │ → IAM credentials for ... + │ ... │ + ├─────────────────────┤ + │ auth/aws/ │ + │ roles/sc-executor│ → trusts tf-run-executor-codebuild + └─────────────────────┘ + │ + ┌──────────▼──────────────────────────────┐ + │ CodeBuild: tf-run-executor │ + │ (229685449397, us-gov-west-1) │ + │ │ + │ 1. vault login (IAM auth) │ + │ 2. vault read aws/creds/${ROLE} │ + │ 3. 
tf-run apply with dynamic creds │ + └─────────────────────────────────────────┘ +``` + +**Cluster topology is an open question (CSC-1346).** Options: +- **Single-node ECS Fargate** — simplest, lowest cost, acceptable for dev/non-prod +- **3-node ECS Fargate HA** — recommended for production +- **EC2 Auto Scaling Group** — most resilient, more ops overhead +- **HCP Vault Dedicated (GovCloud)** — managed, FedRAMP Moderate, but cost and + network routing to HashiCorp infra needs evaluation + +--- + +## 8. What We Are NOT Proposing + +To keep scope realistic: + +- ❌ Replacing AWS Secrets Manager for application secrets +- ❌ PKI / certificate management (yet) +- ❌ Database credentials (yet) +- ❌ Transit encryption as a service (yet) +- ❌ Org-wide Vault rollout — CSVD first, expand after buy-in + +The initial scope is **one thing:** dynamic AWS credentials for the `sc-lambda-ghactions` +executor. Everything else is future potential. + +--- + +## 9. Risks and Mitigations + +| Risk | Likelihood | Impact | Mitigation | +|---|---|---|---| +| Vault cluster goes down, blocking deployments | Medium | High | HA cluster + runbook; circuit-breaker in buildspec | +| ISSO does not authorize Vault for ATO boundary | Medium | High | Engage ISSO early; start with dev account only | +| BSL license rejected by legal | Low | Medium | Switch to OpenBao (same API, MPL 2.0) | +| Team unfamiliar with Vault ops | Medium | Medium | Start small; document runbooks; CSC-1346 topology decision | +| KMS auto-unseal key deletion | Very Low | Critical | KMS key deletion protection enabled; backup unseal keys in SSM | + +--- + +## 10. 
Recommended Path Forward + +### Phase 1 — Proof of Concept (2 weeks) +- [ ] **CSC-1346** — Decide cluster topology (recommendation: single-node Fargate for PoC) +- [ ] Deploy Vault OSS in `csvd-dev` dev environment +- [ ] Configure AWS IAM auth + one AWS backend role for `csvd-dev` account +- [ ] Wire `buildspec-executor.yml` to use `vault read` instead of `sts:AssumeRole` +- [ ] Demo to CSVD stakeholders + +### Phase 2 — ISSO Engagement + ATO Review (parallel) +- [ ] Work with Census ISSO to add Vault as a component in the ATO boundary +- [ ] Assess FIPS 140-2 requirement — Vault Enterprise vs OpenBao FIPS build +- [ ] Legal review of BSL 1.1 for internal government use + +### Phase 3 — Production Hardening (post-ATO) +- [ ] HA cluster (3-node Fargate) +- [ ] KMS auto-unseal in production +- [ ] CloudWatch audit log forwarding +- [ ] Vault policies for all target accounts +- [ ] **CSC-1344** unblocked → E2E test (**CSC-1343**) + +### Phase 4 — Expansion (post buy-in from Manny) +- [ ] Onboard other teams (adsd, ditd, ent) — one Vault role per account +- [ ] Standardize in account bootstrapping runbook + +--- + +## 11. Call to Action + +| Who | Ask | +|---|---| +| **Manny** | Executive buy-in to invest in Vault as org-wide credential platform | +| **CSVD team** (`badra001`, `dwara001`, `pubba001`, `kalep001`, `alade001`) | Review this proposal; join CSC-1345/CSC-1346 discussion | +| **Census ISSO** | Early engagement on ATO boundary inclusion | +| **Census Legal** | BSL 1.1 license review (or OpenBao as fallback) | +| **`arnol377`** | CSC-1346 topology decision → PoC deployment | + +--- + +## Appendix A — Quick Reference + +```bash +# How CodeBuild authenticates (IAM auth) +vault write auth/aws/login \ + role="sc-automation-executor" \ + iam_http_request_method=POST \ + iam_request_url=... \ + iam_request_body=... \ + iam_request_headers=... 
+ +# How executor gets creds for a target account +vault read aws/creds/adsd-dev +# Returns: access_key, secret_key, security_token (TTL: 15m) + +# Admin: add a new account +vault write aws/roles/new-account \ + credential_type=assumed_role \ + role_arns="arn:aws-us-gov:iam::ACCOUNT_ID:role/r-inf-terraform" + +# Admin: revoke all active leases for an account +vault lease revoke -prefix aws/creds/adsd-dev +``` + +## Appendix B — Links + +- HashiCorp Vault docs: https://developer.hashicorp.com/vault/docs +- Vault AWS Secrets Engine: https://developer.hashicorp.com/vault/docs/secrets/aws +- Vault AWS Auth Method: https://developer.hashicorp.com/vault/docs/auth/aws +- OpenBao project: https://openbao.org +- BSL 1.1 full text: https://www.hashicorp.com/bsl +- Jira CSC-1345: https://jira.it.census.gov/browse/CSC-1345 +- Jira CSC-1346: https://jira.it.census.gov/browse/CSC-1346 diff --git a/docs/workflow-flowcharts.md b/docs/workflow-flowcharts.md new file mode 100644 index 0000000..ef860d4 --- /dev/null +++ b/docs/workflow-flowcharts.md @@ -0,0 +1,135 @@ +# Service Catalog Automation — Workflow Flowcharts + +**Ported and updated from:** `lambda-template-repo-generator/docs/DEMO_FLOWCHART.md` +**Updated for:** sc-lambda-ghactions (CodeBuild-based initial rollout; GHA planned for later) + +Generic overview of all end-to-end flows for any Service Catalog product built +on the sc-lambda-ghactions pattern. Intended for stakeholder demos and onboarding +conversations. 
+ +--- + +## Flow 1 — Provisioning (SC Form → New Account Repo + PR) + +```mermaid +flowchart TD + A([👤 Engineer]) -->|Fills out form & clicks Launch| B[AWS Service Catalog] + + B -->|Creates CloudFormation Stack| C[CloudFormation\nCustom Resource] + + C -->|Cross-account invocation\nvia ServiceToken| D[Lambda Function\ncsvd-dev] + + D -->|Fetches GHE token| E[(Secrets Manager\ncsvd-dev)] + + D -->|Starts CodeBuild build\nproduct_type + inputs as env vars| F[CodeBuild\nProposer — csvd-dev] + + F -->|Clones template repo| G["SCT-Engineering/template-{product_type}"] + F -->|Renders Jinja2 templates\nCommits rendered HCL| H[New Branch\nproposal/timestamp] + F -->|Opens| I[Pull Request\nproposal → main] + F -->|Commits entry to| K["terraform-sc-fleet\nworkloads/{type}/{name}/main.tf"] + + D -->|Polls CodeBuild build\nevery 20s until complete| F + D -->|Returns repo URL + PR URL| C + + C -->|Stack outputs| B + B -->|Status: AVAILABLE\n+ repo & PR links| A + + style A fill:#4a90d9,color:#fff + style B fill:#f5a623,color:#fff + style C fill:#f5a623,color:#fff + style D fill:#7ed321,color:#fff + style E fill:#9b59b6,color:#fff + style F fill:#27ae60,color:#fff + style G fill:#2c3e50,color:#fff + style H fill:#2c3e50,color:#fff + style I fill:#e74c3c,color:#fff + style K fill:#8e44ad,color:#fff + +``` + +--- + +## Flow 2 — Apply on Merge (Webhook → Auto-Executor) + +After a platform engineer reviews and merges the Proposer PR, the webhook handler +automatically starts the executor build — no manual SC provisioning step required. 
+ +```mermaid +flowchart TD + A([👤 Platform Engineer]) -->|Reviews & merges PR| B[GitHub Enterprise\nmain branch] + + B -->|Push event| C[Lambda Function URL\nPOST /webhook] + + C -->|Verifies HMAC signature| C + C -->|Reads .sc-automation.yml\nfrom merged commit| D{layer / region_dir\nconfigured?} + + D -->|Yes| E[Starts CodeBuild build\nexecutor — csvd-dev] + D -->|No| Z([Skip — no automation config]) + + E -->|Reads .sc-automation.yml\nvia buildspec env var| G{dry_run: true?} + G -->|Yes| H[terraform plan only] + G -->|No| I[terraform apply] + + E -->|POST commit status via GitHub API| B + B -->|✅ or ❌ on merge commit| A + + style A fill:#4a90d9,color:#fff + style B fill:#2c3e50,color:#fff + style C fill:#7ed321,color:#fff + style D fill:#f5a623,color:#fff + style E fill:#27ae60,color:#fff + style G fill:#f5a623,color:#fff + style H fill:#9b59b6,color:#fff + style I fill:#e74c3c,color:#fff + style Z fill:#95a5a6,color:#fff +``` + +--- + +## Flow 3 — Fleet-Wide Update (CSVD Operations) + +CSVD-initiated update applied across all managed workloads — e.g. a version bump. +No Service Catalog involvement; runs directly from `terraform-sc-fleet`. 
+ +```mermaid +flowchart TD + A([👤 CSVD Engineer]) -->|python update_fleet.py\n--product-type eks_cluster --lifecycle dev| B[terraform-sc-fleet\nscripts/update_fleet.py] + + B -->|Walks workloads/eks_cluster/dev/**| C{maintenance\nwindow open?} + + C -->|Yes| D[tf apply\nper workload folder] + C -->|No| E([Skip workload\nlog window info]) + + D -->|Starts CodeBuild build\nexecutor — csvd-dev| F[CodeBuild\nExecutor — csvd-dev] + + F -->|Renders + commits\nupdated HCL| G[Account Repo\nNew branch] + G -->|Opens| H[Pull Request\nfor CSVD or customer review] + + F -->|POST commit status via GitHub API| H + + B -->|Summary: N applied\nM skipped| A + + style A fill:#4a90d9,color:#fff + style B fill:#8e44ad,color:#fff + style C fill:#f5a623,color:#fff + style D fill:#7ed321,color:#fff + style E fill:#95a5a6,color:#fff + style F fill:#27ae60,color:#fff + style G fill:#2c3e50,color:#fff + style H fill:#e74c3c,color:#fff +``` + +--- + +## Key Design Points + +| # | Point | +|---|-------| +| 1 | **Self-service provisioning** — engineer fills a form; no CSVD involvement for the create path | +| 2 | **Centralized compute** — Lambda, CodeBuild projects, and GHE tokens all live in csvd-dev; the provisioner's account only sees a CFN stack with output URLs | +| 3 | **Lambda as thin orchestrator** — validates inputs, starts CodeBuild build, polls for completion, returns URLs to CFN | +| 4 | **CodeBuild runs the Terraform** — actual repo creation and HCL rendering logic lives in CodeBuild buildspecs, not bespoke Lambda Python. GHA workflows are planned for a later rollout phase. 
| +| 5 | **Auto-apply on merge** — webhook handler eliminates the manual executor step; merge = apply | +| 6 | **Fleet operations separate from provisioning** — `terraform-sc-fleet` + `update_fleet.py` give CSVD a single command for fleet-wide changes | +| 7 | **Works for any product type** — swap `product_type` in the SC form and the entire chain routes to a different template repo, Pydantic model, and Jinja2 templates, with no Lambda plumbing changes | +| 8 | **Governance via GHE** — branch protection and CODEOWNERS are baked into every provisioned repo at creation time; customers can propose changes but cannot merge without CSVD approval | diff --git a/lambda/app.py b/lambda/app.py index 79c674f..be15e59 100644 --- a/lambda/app.py +++ b/lambda/app.py @@ -43,27 +43,38 @@ class TfRunRequest(BaseModel): """Validated input for a tf-run-executor CodeBuild invocation.""" + action: Literal["propose", "apply"] = Field(..., description="propose = render templates + open PR; apply = tf-run apply on main after PR is merged") account_repo: str = Field(..., description="Account repo name, e.g. 
229685449397-csvd-dev-platform-dev-gov") layer: Literal["common", "infrastructure", "vpc"] = Field(..., description="Terraform layer") - region_dir: Literal["east", "west"] = Field(..., description="Region directory") - tf_run_start_tag: str = Field(default="", description="tf-run.data TAG label to start from; empty = from beginning") - extra_files: dict = Field(default_factory=dict, description='JSON map {"relative/path": "content"} written before tf-run') - git_branch: str = Field(default="repo-init", description="Branch to commit and open PR from") - dry_run: bool = Field(default=False, description="true = tf plan only, no apply") + region_dir: Literal["east", "west", "global"] = Field(..., description="Region directory (east, west, or global for non-regional resources like SSO/IAM)") - @field_validator("extra_files", mode="before") + # --- Proposer fields (action=propose only) --- + template_repo: str = Field(default="", description="GHE repo name containing Jinja2/raw template files to render into the account repo") + template_vars: dict = Field(default_factory=dict, description='JSON map of variables passed to Jinja2 when rendering template_repo files') + extra_files: dict = Field(default_factory=dict, description='JSON map {"relative/path": "content"} written into account repo (after template rendering)') + git_branch: str = Field(default="propose/sc-automation", description="Branch to commit and open PR from (propose only)") + + # --- Executor fields (action=apply only) --- + target_account_id: str = Field(default="", description="AWS account ID to assume cross_account_role in before running tf-run; empty = run with CodeBuild role (csvd-dev)") + cross_account_role: str = Field(default="r-inf-terraform", description="IAM role name to assume in target_account_id (default: r-inf-terraform)") + tf_run_start_tag: str = Field(default="", description="tf-run.data TAG label to start from; empty = from beginning (apply only)") + dry_run: bool = Field(default=False, 
description="true = tf-run plan only, no apply (apply action only)") + + @field_validator("extra_files", "template_vars", mode="before") @classmethod - def parse_extra_files(cls, v: Any) -> Any: - """Accept a JSON string or a dict for extra_files. + def parse_json_dict_fields(cls, v: Any) -> Any: + """Accept a JSON string or a dict for dict-typed fields. - CFN parameters are always strings, so '{}' or '{"path": "content"}' + CFN parameters are always strings, so '{}' or '{"key": "val"}' must be parsed before Pydantic validates the dict type. """ if isinstance(v, str): + if v.strip() == "": + return {} try: return json.loads(v) except json.JSONDecodeError as exc: - raise ValueError(f"extra_files must be a valid JSON object string; got: {v!r}") from exc + raise ValueError(f"Field must be a valid JSON object string; got: {v!r}") from exc return v class Config: @@ -129,34 +140,53 @@ def send_cfn_response( def start_codebuild_build( tf_req: TfRunRequest, - github_token: str, request_id: str, ) -> str: - """Start the tf-run-executor CodeBuild project with per-build env-var overrides. + """Start the proposer or executor CodeBuild project with per-build env-var overrides. + + GITHUB_TOKEN is intentionally omitted here — both CodeBuild projects define it + as type=SECRETS_MANAGER at the project level. The CodeBuild service role has + secretsmanager:GetSecretValue for that secret, so CodeBuild fetches the current + value fresh at each build start without the token ever appearing in CloudTrail + (StartBuild) or BatchGetBuilds API responses. Passing it as PLAINTEXT here would + override that project-level definition and expose the token in both. Returns the CodeBuild build ID. 
""" - project_name = os.environ.get("CODEBUILD_PROJECT_NAME", "tf-run-executor") + if tf_req.action == "propose": + project_name = os.environ.get("PROPOSER_PROJECT_NAME", "tf-run-proposer") + env_overrides = [ + {"name": "ACCOUNT_REPO", "value": tf_req.account_repo, "type": "PLAINTEXT"}, + {"name": "LAYER", "value": tf_req.layer, "type": "PLAINTEXT"}, + {"name": "REGION_DIR", "value": tf_req.region_dir, "type": "PLAINTEXT"}, + {"name": "GIT_BRANCH", "value": tf_req.git_branch, "type": "PLAINTEXT"}, + {"name": "TEMPLATE_REPO", "value": tf_req.template_repo, "type": "PLAINTEXT"}, + {"name": "TEMPLATE_VARS", "value": json.dumps(tf_req.template_vars), "type": "PLAINTEXT"}, + {"name": "EXTRA_FILES", "value": json.dumps(tf_req.extra_files), "type": "PLAINTEXT"}, + ] + else: # apply + project_name = os.environ.get("EXECUTOR_PROJECT_NAME", "tf-run-executor") + env_overrides = [ + {"name": "ACCOUNT_REPO", "value": tf_req.account_repo, "type": "PLAINTEXT"}, + {"name": "LAYER", "value": tf_req.layer, "type": "PLAINTEXT"}, + {"name": "REGION_DIR", "value": tf_req.region_dir, "type": "PLAINTEXT"}, + {"name": "TARGET_ACCOUNT_ID", "value": tf_req.target_account_id, "type": "PLAINTEXT"}, + {"name": "CROSS_ACCOUNT_ROLE", "value": tf_req.cross_account_role, "type": "PLAINTEXT"}, + {"name": "TF_RUN_START_TAG", "value": tf_req.tf_run_start_tag, "type": "PLAINTEXT"}, + {"name": "DRY_RUN", "value": str(tf_req.dry_run).lower(), "type": "PLAINTEXT"}, + ] + region = os.environ.get("AWS_REGION", os.environ.get("AWS_DEFAULT_REGION", "us-gov-west-1")) cb = boto3.client("codebuild", region_name=region) logger.info( - f"[{request_id}] Starting CodeBuild '{project_name}' for " + f"[{request_id}] Starting CodeBuild '{project_name}' (action={tf_req.action}) for " f"repo={tf_req.account_repo} layer={tf_req.layer}/{tf_req.region_dir}" ) response = cb.start_build( projectName=project_name, - environmentVariablesOverride=[ - {"name": "ACCOUNT_REPO", "value": tf_req.account_repo, "type": "PLAINTEXT"}, 
- {"name": "LAYER", "value": tf_req.layer, "type": "PLAINTEXT"}, - {"name": "REGION_DIR", "value": tf_req.region_dir, "type": "PLAINTEXT"}, - {"name": "GIT_BRANCH", "value": tf_req.git_branch, "type": "PLAINTEXT"}, - {"name": "TF_RUN_START_TAG", "value": tf_req.tf_run_start_tag, "type": "PLAINTEXT"}, - {"name": "EXTRA_FILES", "value": json.dumps(tf_req.extra_files), "type": "PLAINTEXT"}, - {"name": "DRY_RUN", "value": str(tf_req.dry_run).lower(), "type": "PLAINTEXT"}, - {"name": "GITHUB_TOKEN", "value": github_token, "type": "PLAINTEXT"}, - ], + environmentVariablesOverride=env_overrides, ) build_id = response["build"]["id"] logger.info(f"[{request_id}] CodeBuild build started: {build_id}") @@ -287,7 +317,7 @@ def lambda_handler(event: dict, context) -> dict: tf_req = TfRunRequest(**normalized) logger.info( - f"[{request_id}] repo={tf_req.account_repo} " + f"[{request_id}] action={tf_req.action} repo={tf_req.account_repo} " f"layer={tf_req.layer}/{tf_req.region_dir} " f"branch={tf_req.git_branch} dry_run={tf_req.dry_run}" ) @@ -296,7 +326,7 @@ def lambda_handler(event: dict, context) -> dict: logger.info(f"[{request_id}] Fetching GitHub token from secret: {github_token_secret}") github_token = get_secret(github_token_secret) - build_id = start_codebuild_build(tf_req, github_token, request_id) + build_id = start_codebuild_build(tf_req, request_id) # Poll — leave 60s buffer before Lambda timeout for cfn-response PUT lambda_timeout_s = context.get_remaining_time_in_millis() / 1000 @@ -304,23 +334,31 @@ def lambda_handler(event: dict, context) -> dict: build_status, logs_url = poll_codebuild_build(build_id, request_id, poll_budget_min) if build_status == "SUCCEEDED": - pr_url = fetch_pr_url(github_token, tf_req.account_repo, tf_req.git_branch, request_id) github_base = os.environ.get("GITHUB_API", "https://github.e.it.census.gov/api/v3").rstrip("/").removesuffix("/api/v3") github_org = os.environ.get("GITHUB_ORG_NAME", "SCT-Engineering") repo_url = 
f"{github_base}/{github_org}/{tf_req.account_repo}" - response_data = { - "PullRequestUrl": pr_url, - "pull_request_url": pr_url, - "RepositoryUrl": repo_url, - "repository_url": repo_url, - "BranchName": tf_req.git_branch, - "branch_name": tf_req.git_branch, - "CodeBuildBuildId": build_id, - } + if tf_req.action == "propose": + pr_url = fetch_pr_url(github_token, tf_req.account_repo, tf_req.git_branch, request_id) + response_data = { + "PullRequestUrl": pr_url, + "pull_request_url": pr_url, + "RepositoryUrl": repo_url, + "repository_url": repo_url, + "BranchName": tf_req.git_branch, + "branch_name": tf_req.git_branch, + "CodeBuildBuildId": build_id, + } + else: # apply + response_data = { + "ApplyStatus": "SUCCEEDED", + "RepositoryUrl": repo_url, + "repository_url": repo_url, + "CodeBuildBuildId": build_id, + } send_cfn_response( event, context, "SUCCESS", response_data, - physical_resource_id=f"{tf_req.account_repo}-{tf_req.layer}-{tf_req.region_dir}", + physical_resource_id=f"{tf_req.action}-{tf_req.account_repo}-{tf_req.layer}-{tf_req.region_dir}", ) return {"statusCode": 200, "body": json.dumps(response_data)} diff --git a/scripts/tf-control.sh b/scripts/tf-control.sh deleted file mode 100755 index ef1b36f..0000000 --- a/scripts/tf-control.sh +++ /dev/null @@ -1,392 +0,0 @@ -#!/bin/bash - -get_git_root() -{ - TOP=$(git rev-parse --show-toplevel 2>/dev/null) - if [ -z "$TOP" ] - then - TOP=$HOME - fi -} - -do_help() -{ - local ACTIONS=$@ - echo "* help: $THIS $VERSION" - echo " tf-{action}: runs command 'terraform {action}' with specific arguments" - echo " tf-{action} less [arguments]: shows the latest log file for the specific {action}" - echo " TFLESS=1 tf-{action} [arguments]: shows the latest log file for the specific {action}" - echo " tf-log {string} [list|{filename}: tails the last log matching the patern logs/{string}*, or provides a list of file, or uses a specific file" - echo " tf-cli {string}: runs the approriate terraform binary with whaterver 
{string} as arguments" - echo "" - echo "* environment variables" - echo " TFCONTROL: point to alternate .tf-control; default looks lin git-root, then \$HOME" - echo " TF_CLI_CONFIG_FILE: point to alternate .tf-control.tfrc; default looks lin git-root, then \$HOME, and default \$HOME/.terraformrc" - echo " TFARGS: extra args to pass to terraform command. Only applies to actions apply and destroy" - echo " TFNOCOLOR: color disabled by default -no-color, to enable, set this any value" - echo " TFNOLOG: setting this at all will disable logging through 'tee' to a file. Needed to pull tf-state properly" - echo " TFNOPROXY: do not auto-set proxy. With changes to the firewalls, proxy is needed, so we set it for init options" - echo "" - echo "* Available Actions:" - for a in $ACTIONS - do - echo " tf-${a}" - done - echo "" - echo "* Special Actions:" - echo " tf-plan summary: produces a list of items to create, destroy, replace, or update. Requires having run 'tf-plan' first" - echo " tf-apply summary: produces a list of items like plan but under an apply action. Requires having run 'tf-apply' first and then answering 'no'" - echo " tf-destroy summary: produces a list of items to destroy. 
Requires having run 'tf-destroy' first and then answering 'no'" - return 0 -} - -# pass things like -target= -# make aliases -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-init -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-plan -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-apply -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-destroy -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-refresh -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-output -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-validate -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-import -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-state -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-fmt -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-taint -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-console -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-log -# ln -s $BINDIR/tf-control.sh $BINDIR/tf-cli - -THIS=$(basename $0) -VERSION="1.11.0" -ACTION=$(basename $THIS .sh | sed -e 's/^tf-//') -LOGDIR="logs" - -umask 002 - -# path or name of terraform binary -# get from top of git repo or $HOME/.tf-control -CURRENTDIR=$(pwd) -get_git_root -if [ -z "$TFCONTROL" ] -then - if [ -r $TOP/.tf-control ] - then - TFCONTROL=$TOP/.tf-control - elif [ -r $CURRENTDIR/.tf-control ] - then - TFCONTROL=$CURRENTDIR/.tf-control - elif [ -r $HOME/.tf-control ] - then - TFCONTROL=$HOME/.tf-control - fi -fi -# if .tf-control in git-root, override it with a local .tf-control.override -if [ -r $CURRENTDIR/.tf-control.override ] -then - TFCONTROL=$CURRENTDIR/.tf-control.override -fi -if [ ! 
-z "$TFCONTROL" ] -then - source $TFCONTROL -fi - -if [ -z $TFCOMMAND ] -then - TFCOMMAND="terraform" -fi - -# look for config file -if [ -z "$TF_CLI_CONFIG_FILE" ] -then - if [ -r $TOP/.tf-control.tfrc ] - then - export TF_CLI_CONFIG_FILE=$TOP/.tf-control.tfrc - elif [ -r $CURRENTDIR/.tf-control.tfrc ] - then - export TF_CLI_CONFIG_FILE=$CURRENTDIR/.tf-control.tfrc - elif [ -r $HOME/.tf-control.tfrc ] - then - export TF_CLI_CONFIG_FILE=$HOME/.tf-control.tfrc - else - unset TF_CLI_CONFIG_FILE - fi -fi - -# based on issue https://github.com/hashicorp/terraform/issues/32901 -# where shared provider cache doesn't work right for multiple users in 1.4. This is needed to get beyond 1.3. -# it really does belong in the .tf-control file but that's a lot of files to change -export TF_PLUGIN_CACHE_MAY_BREAK_DEPENDENCY_LOCK_FILE=1 - -ACTIONS="init plan apply destroy refresh output validate import state fmt taint console log cli" -declare -A actions -for action in $ACTIONS -do - actions["$action"]=$action -done - -if [[ ! -z "$1" ]] && [[ "$1" == "help" ]] -then - do_help $ACTIONS - exit 0 -fi - -# set soem TF_VAR variables ifnot already set -if [ -z "$TF_VAR_os_environment" ] -then - export TF_VAR_os_environment="{\"pwd\":\"$(pwd)\"}" -fi -if [ -z "$TF_VAR_os_username" ] -then - export TF_VAR_os_username=$USER -fi - -# calling the script directly installs into $BINDIR | /apps/terraform/bin -if [ $ACTION == "control" ] -then - if [ -z $BINDIR ] - then - location=$(which $THIS 2> /dev/null) - if [ -z $location ] - then - BINDIR="/apps/terraform/bin" - else - BINDIR=$(dirname $location) - fi - fi - umask 022 - echo "* installing $THIS v$VERSION in $BINDIR" - cp $THIS $BINDIR/ - chmod 755 $BINDIR/$THIS - for action in $ACTIONS - do - echo "+ enabling tf-$action to $BINDIR/$THIS" - ln -sf $BINDIR/$THIS $BINDIR/tf-$action - done - exit 0 -fi - -# pass TFCOLOR=-color if wanting colorized output -if [ -z $TFCOLOR ] -then - if [ ! 
-z "$TFNOCOLOR" ] - then - TFCOLOR="" - else - TFCOLOR="-no-color" - fi -fi - -if [ ! -d $LOGDIR ] -then - mkdir -p $LOGDIR -fi -YMDSTAMP=$(date +%Y%m%d) -start=$(date +%s) -STAMP="$YMDSTAMP.$start" -LOGFILE="$LOGDIR/$ACTION.$STAMP.log" - -if [[ ! -z $1 ]] && [[ $1 == "less" ]] || [[ ! -z "$TFLESS" ]] -then - file=$(ls $LOGDIR/$ACTION*.log 2> /dev/null | tail -n 1) - if [ -z "$file" ] - then - echo "* No log file for action=$ACTION. Please run 'tf-$ACTION' first" - exit 1 - else - echo "# results from file $file" - less $file - exit 0 - fi -fi -if ( [[ $ACTION == "plan" ]] || [[ $ACTION == "destroy" ]] || [[ $ACTION == "apply" ]] ) && [[ ! -z $1 ]] && [[ $1 == "summary" ]] -then - file=$(ls $LOGDIR/$ACTION*.log 2> /dev/null | tail -n 1) - if [ -z "$file" ] - then - echo "* Previous $ACTION file does not exist. Please run 'tf-$ACTION'" - exit 1 - fi - echo "* tf-$ACTION summary from log $file" - for op in created updated replaced destroyed - do - cc=$(grep -c " $op" $file) - echo "> to-be $op ($cc)" - grep " $op" $file - echo "" - done - for op in changed moved - do - cc=$(grep -cE "has \b$op\b" $file) - echo "> has $op ($cc)" - grep -E " has \b$op\b" $file - echo "" - done - grep ^Plan $file - exit 0 -fi -if [ $ACTION == "log" ] -then - logtype="$1" - logaction="$2" - if [[ ! -z "$logaction" ]] && [[ $logaction == "list" ]] - then - echo "* available files for pattern '${logtype}*':" - ls logs/${logtype}* -C1 | sed -e 's/^/ /' - echo "" - exit 0 - elif [[ ! -z "$logaction" ]] - then - TFLOGFILE=$logaction - else - TFLOGFILE=$(ls logs/${logtype}* 2> /dev/null | tail -n 1) - fi - if [ ! -z "$TFLOGFILE" ] - then - if [[ -z "$logaction" ]] - then - echo "* available files for pattern '${logtype}*':" - ls logs/${logtype}* -C1 | sed -e 's/^/ /' - echo "" - fi - echo "* showing logfile=$TFLOGFILE" - less $TFLOGFILE - exit $? 
- else - echo "* No log files exist matching pattern '${logtype}*'" - exit 1 - fi -fi - -if [ ${actions[$ACTION]} == $ACTION ] -then - ( echo "# starting v$VERSION action $ACTION file $LOGFILE stamp $STAMP time $start"; \ - echo "# current_directory=$(pwd)"; \ - echo "# git_repository=$(git remote -v show | grep fetch | awk '{print $2}')"; \ - echo "# git_current_branch=$(git branch | grep ^* | awk '{print $2}')"; \ - echo "# terraform_version=$($TFCOMMAND -v|grep ^Terraform)"; \ - echo "# TFCONTROL=$TFCONTROL"; \ - echo "# TF_CLI_CONFIG_FILE=$TF_CLI_CONFIG_FILE"; \ - echo "# TFARGS=\"$TFARGS\" TFNOCLOR=$TFNOCOLOR TFNOLOG=$TFNOLOG TFNOPROXY=$TFNOPROXY"; \ - echo "# env TF_VAR_ variables"; \ - printenv | grep TF_VAR_ | sed -e 's/^/# /'; \ - echo "" ) |& tee $LOGFILE -else - echo "* action:${actions[$ACTION]}" - echo "* invalid action $ACTION, exiting" - exit 1 -fi - -# TFARGS="" -r=0 -if [ $ACTION == "init" ] -then -# $TFCOMMAND init $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - if [[ -z "$TFNOPROXY" ]] && [[ -z "$HTTPS_PROXY" ]] && [[ -r "/apps/terraform/etc/set-proxy.sh" ]] - then - source /apps/terraform/etc/set-proxy.sh - fi - $TFCOMMAND init $TFCOLOR $@ |& tee -a $LOGFILE - r=$? -fi - -if [ $ACTION == "plan" ] -then -# $TFCOMMAND plan $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - $TFCOMMAND plan $TFCOLOR $@ |& tee -a $LOGFILE - r=$? -fi - -if [ $ACTION == "apply" ] -then - $TFCOMMAND apply $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - r=$? -fi - -if [ $ACTION == "destroy" ] -then - $TFCOMMAND destroy $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - r=$? -fi - -if [ $ACTION == "refresh" ] -then -# $TFCOMMAND refresh $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - $TFCOMMAND refresh $TFCOLOR $@ |& tee -a $LOGFILE - r=$? -fi - -if [ $ACTION == "validate" ] -then -# $TFCOMMAND validate $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - $TFCOMMAND validate $TFCOLOR $@ |& tee -a $LOGFILE - r=$? 
-fi - -if [ $ACTION == "output" ] -then -# $TFCOMMAND output $TFARGS $@ -# $TFCOMMAND output $TFARGS $TFCOLOR $@ |& tee -a $LOGFILE - $TFCOMMAND output $TFCOLOR $@ |& tee -a $LOGFILE - r=$? - exit $r -fi - -if [ $ACTION == "import" ] -then -# $TFCOMMAND import $TFARGS $@ |& tee -a $LOGFILE - $TFCOMMAND import $@ |& tee -a $LOGFILE - r=$? -fi - -# not recommended for pull -if [ $ACTION == "state" ] -then -# $TFCOMMAND state $TFARGS $@ |& tee -a $LOGFILE - if [ ! -z "$TFNOLOG" ] - then - $TFCOMMAND state $@ - else - $TFCOMMAND state $@ |& tee -a $LOGFILE - fi - r=$? -fi - -if [ $ACTION == "fmt" ] -then -# $TFCOMMAND fmt $TFARGS $@ |& tee -a $LOGFILE - $TFCOMMAND fmt $@ |& tee -a $LOGFILE - r=$? -fi - -if [ $ACTION == "taint" ] -then -# $TFCOMMAND taint $TFARGS $@ |& tee -a $LOGFILE - $TFCOMMAND taint $@ |& tee -a $LOGFILE - r=$? -fi - -# This doesnt work to 'tee' because we can't leave stdin on the terminal. -if [ $ACTION == "console" ] -then -# $TFCOMMAND console $TFARGS $@ |& tee -a $LOGFILE -# $TFCOMMAND console $@ |& tee -a $LOGFILE - echo "* TFNOLOG in effect for $ACTION" - $TFCOMMAND console $@ - r=$? -fi - -if [ $ACTION == "cli" ] -then -# $TFCOMMAND $TFARGS $@ |& tee -a $LOGFILE - $TFCOMMAND $@ |& tee -a $LOGFILE - r=$? 
-fi - -end=$(date +%s) -elapsed=$(( $end - $start )) -(echo "# ending v$VERSION action $ACTION file $LOGFILE stamp $STAMP start $start end $end elapsed $elapsed"; echo "") |& tee -a $LOGFILE - -echo "" -echo "# results in file $LOGFILE stamp $STAMP status=$r" -# echo $r diff --git a/scripts/tf-directory-setup.py b/scripts/tf-directory-setup.py deleted file mode 100755 index 7c7b5e1..0000000 --- a/scripts/tf-directory-setup.py +++ /dev/null @@ -1,225 +0,0 @@ -#!/apps/terraform/python/bin/python -# /bin/env python - -from jinja2 import Environment,FileSystemLoader -import os -#import csv -#import re -import sys -from pprint import pprint -from datetime import datetime,date,time -from dateutil import tz -from dateutil.parser import parse as date_parse -import yaml -import hashlib -import argparse -from pathlib import Path - -def parse_arguments(version): - parser = argparse.ArgumentParser(description="Setup directory for Terraform (remote state, links)",add_help=True) - parser.add_argument('filename', action='store', help="Configuration filename to read (remote_state.yml)", default='remote_state.yml', nargs='?') - parser.add_argument('--version', action='version', version='%(prog)s '+version) - parser.add_argument("-n","--dry-run", action="store_true", dest="dry_run", help="Dry run, do not create links or remote state configuration", default=False) - parser.add_argument("-d","--debug", action="store_true", dest="debug", help="debugging", default=False) - parser.add_argument("-v","--verbose", action="store_true", dest="verbose", help="verbose output", default=False) - parser.add_argument("-f","--force", action="store_true", dest="force", help="Force", default=False) - parser.add_argument("-l","--link", action="store", dest="link", help="Make link to .tf", choices=['none', 'local', 's3']) - args = parser.parse_args() - return args - -def touch_file(file): - if os.path.exists(file): - os.utime(file,None) - else: - open(file,'a').close() - -def read_yaml(file): - data={} 
- with open(file, 'r') as stream: - try: - data=yaml.full_load(stream) - except yaml.YAMLError as e: - print(e) - return None - return data - -def create_backend(args,version): - data=read_yaml(args.filename) -# initialize missing fields - data['make_links']=data.get('make_links',True) - - if args.debug: - print('* data =') - pprint(data) - print('* args =',args) - print("") - - dry_s="[dry-run] " if args.dry_run else "" - this_dir=os.getcwd() - -# print('args',args) -# sys.exit(0) - - file_loader=FileSystemLoader('/apps/terraform/template') - env=Environment( - loader=file_loader, - trim_blocks=True, - lstrip_blocks=True - ) - - data['directory']=data.get('directory','') - if data['directory'] == "": - print("* error, 'directory' cannot be empty") - sys.exit(1) - - tf_backend=env.get_template('remote_state.backend.tf.j2') - tf_backend_data_local=env.get_template('remote_state.data.tf.local.j2') - tf_backend_data_s3=env.get_template('remote_state.data.tf.s3.j2') - - tf_output=tf_backend.render(data=data) - tf_filename='remote_state.backend.tf' - if os.path.exists(tf_filename): - do_create=args.force - else: - do_create=True - if do_create: - print("* {}creating file {}".format(dry_s,tf_filename)) - if not args.dry_run: - with open(tf_filename, 'w') as tf_file: - tf_file.write(tf_output) - else: - if args.debug or args.verbose: - print("* {}not creating file {}".format(dry_s,tf_filename)) - - d=data['directory'].replace('/','_').replace('.','_') - data['directory_replaced']=d - - base_dir=this_dir.replace(data['directory'],'') - dir_paths=data['directory'].split(os.path.sep) - rp=['..'] * len(dir_paths) - rel_path=os.path.join(*rp) - if args.debug: - print("* this_dir={}\n base_dir={}\n directory={}".format(this_dir,base_dir,data['directory'])) - print(" path_length={}\n relative_path_to_top={}/".format(len(dir_paths),rel_path)) - - - tf_output=tf_backend_data_s3.render(data=data) - tf_filename='remote_state.%s.tf.s3' % d - if do_create: - print("* {}creating file 
{}".format(dry_s,tf_filename)) - if not args.dry_run: - with open(tf_filename, 'w') as tf_file: - tf_file.write(tf_output) - else: - if args.debug or args.verbose: - print("* {}not creating file {}".format(dry_s,tf_filename)) - - tf_output=tf_backend_data_local.render(data=data) - tf_filename='remote_state.%s.tf.local' % d - if do_create: - print("* {}creating file {}".format(dry_s,tf_filename)) - if not args.dry_run: - with open(tf_filename, 'w') as tf_file: - tf_file.write(tf_output) - else: - if args.debug or args.verbose: - print("* {}not creating file {}".format(dry_s,tf_filename)) - - tf_filename='remote_state.%s.tf.none' % d - if do_create: - print("* {}touching file {}".format(dry_s,tf_filename)) - if not args.dry_run: - touch_file(tf_filename) - else: - if args.debug or args.verbose: - print("* {}not touching file {}".format(dry_s,tf_filename)) - - tf_filename='remote_state.%s.tf' % d - if args.link is not None: - source_file='{}.{}'.format(tf_filename,args.link) - if os.path.exists(tf_filename): - if not os.path.islink(tf_filename): - print("* {}target file {} is not a link, fixing".format(dry_s,tf_filename)) - if not args.dry_run: - os.remove(tf_filename) - if args.verbose: - print("* {}removing file {}".format(dry_s,tf_filename)) - if not args.dry_run: - os.symlink(source_file, tf_filename) - print("* {}link {} to {}".format(dry_s,source_file, tf_filename)) - else: - if do_create: - print("* sample ln commands to run\n") - print("# ln -sf {}.none {}".format(tf_filename,tf_filename)) - print("# ln -sf {}.local {}".format(tf_filename,tf_filename)) - print("# ln -sf {}.s3 {}".format(tf_filename,tf_filename)) - - return - -#--- -# main -#--- -def main(): - version='2.2.2' - args=parse_arguments(version) - - create_backend(args,version) - return - -#--- -# main -#--- -if __name__ == '__main__': - main() - -# if make_links, then read all links in the parent directory and make a link to them -# if link_files [] exists, then make only links to the parent 
directory for those files -# first time through (or --init, --step1), remote state doesn't exist), make the link to -> none -# after remote state has stuff (--step2), or remote state link does exist, change the link -> s3 -# optional use remote_state as local (local: true | false) -# bring template files into this script (like with rotate-keys.py) -# add logging - -## directory: "common" -## profile: "123456789012-mycloud" -## bucket: "inf-tfstate-123456789012" -## bucket_region: "us-gov-west-1" -## region: "us-gov-west-1" -## regions: ["us-gov-west-1"] -## account_id: "123456789012" -## account_alias: "mycloud" -## aws_environment: "govcloud" -# -# provider_configs: -# - provider.ldap # gets link to ../provider_configs.d/provider.ldap.* ./ -# - provider.dns # gets link to ../provider_configs.d/provider.dns.* ./ -# -# parent_links: -# - (file) # makes link files in the parent directory. Expect this not to be needed with this new setup -# - random_parent_file.tf -# - random_parent_link.tf -# -# consider making TOP/remote_state.d and then link all to that, and link remote_state.d to each directory -# remote_state: -# - (component-directory) # finds and links remote state with the appropriate component. 
Examples: -# - infrastructure -# - infrastructure/west-1 -# - common/apps/myapp1 - - - -## cwd=Path.cwd() -## top=None -## for p in cwd.parents: -## if (p / "TOP").exists() or ( (p / "init").exists() and (p / "init").is_dir() ): -## top=p -## -## if top: -## rel=cwd.relative_to(top) -## rel_top=['..'] * len(rel.parts) -## rel_top_s=os.path.join(*rel_top) -## else: -## rel=None -## rel_top_s='' -## -## print('cwd={}\ntop={}\nrel={}\nrel_to_top={}'.format(cwd,top,rel,rel_top_s)) diff --git a/scripts/tf-run b/scripts/tf-run deleted file mode 100755 index 3c0bc01..0000000 --- a/scripts/tf-run +++ /dev/null @@ -1,921 +0,0 @@ -#!/bin/bash - -get_git_root() -{ - TOP=$(git rev-parse --show-toplevel 2>/dev/null) -} - -get_relative_to_git_root() -{ -# TOP=$(git rev-parse --show-toplevel 2>/dev/null) - get_git_root - TOPBASE=$(basename $TOP) - CWD=$(pwd) - RELATIVE_PATH=$(echo $CWD | sed -e "s/^.*$TOPBASE//" | tr -cd / | sed -e 's#/#../#g') -# return 0 -} - -get_relative_directory() -{ - local FILE=$1 - if [ -z "$FILE" ] - then - echo "* get_relative_up(): error, missing FILE" - return 1 - fi - CWD=$(pwd) - RELATIVE_PATH="" - local c=0 - while [[ ! -r "$CWD/$FILE" ]] || [[ -L "$CWD/$FILE" ]] - do -## echo "[debug] checking $CWD/$FILE" -# stop if at top of repo (.git directory) or at / - if [[ ! 
-r "$CWD/.git" ]] && [[ "$CWD" != "/" ]] - then - CWD=$(dirname $CWD) - RELATIVE_PATH+="../" - c=$(( c + 1 )) -## echo "[debug] going up cwd=$CWD c=$c path=$RELATIVE_PATH" - else -## echo "[debug] hit git root or / cwd=$CWD" - RELATIVE_PATH="" - c=-1 - break - fi - done - if [[ $c -ge 0 ]] && [[ -z "$RELATIVE_PATH" ]] - then -## echo "[debug] found in current directory" - RELATIVE_PATH="./" - fi -} - -get_profile() -{ - if [ -z $ERROR_GET_PROFILE ] - then - ERROR_GET_PROFILE=0 - fi - local FILES=$(ls *tfvars 2>/dev/null) - if [ -z "$AWS_PROFILE" ] - then - if [ -z "$FILES" ] - then - [ $ERROR_GET_PROFILE -gt 0 ] && echo "* [WARNING] cannot determine profile from *.tfvars" - ERROR_GET_PROFILE=$(( $ERROR_GET_PROFILE + 1 )) - return 1 - else - PROFILE=$(grep -E '^\bprofile\b *' $FILES | sed -e 's/^.*profile.* =//' -e 's/\"//g' -e 's/#.*$//' -e 's/^ *//' | head -n 1) - fi - else - PROFILE=$AWS_PROFILE - fi -# echo "* using profile=$PROFILE" - ERROR_GET_PROFILE=0 - return 0 -} - -get_region() -{ - if [ -z $ERROR_GET_REGION ] - then - ERROR_GET_REGION=0 - fi - local FILES=$(ls *tfvars 2>/dev/null) - if [ -z "$AWS_REGION" ] - then - if [ -z "$FILES" ] - then - [ $ERROR_GET_REGION -gt 0 ] && echo "* [WARNING] cannot determine region from *.tfvars" - ERROR_GET_REGION=$(( $ERROR_GET_REGION + 1 )) - return 1 - else - REGION=$(grep -E '^\bregion\b *' $FILES | sed -e 's/^.*region.* =//' -e 's/\"//g' -e 's/#.*$//' -e 's/^ *//' | head -n 1) - fi - else - REGION=$AWS_REGION - fi - if [ ! -z $REGION ] - then - if [[ $REGION =~ gov ]] - then - SHORT_REGION=$(echo $REGION | sed -e 's/^us-gov-//' -e 's/-[0-9]$//') - else - SHORT_REGION=$(echo $REGION | sed -e 's/^us-//') - fi - fi -# echo "* using region=$REGION short_region=$SHORT_REGION" - ERROR_GET_REGION=0 - return 0 -} - -# returns value in stdout -replace_placeholders() -{ - local item="$1" - - if [ ! -z $next ] - then - item=$(echo $item | sed -e "s/%%NEXT%%/$next/g") - fi - if [ ! 
-z $previous ] - then - item=$(echo $item | sed -e "s/%%PREVIOUS%%/$previous/g") - fi - if [ ! -z $PROFILE ] - then - item=$(echo $item | sed -e "s/%%PROFILE%%/$PROFILE/g") - fi - if [ ! -z $REGION ] - then - item=$(echo $item | sed -e "s/%%REGION%%/$REGION/g") - fi - if [ ! -z $SHORT_REGION ] - then - item=$(echo $item | sed -e "s/%%SHORT_REGION%%/${SHORT_REGION}/g") - fi - echo "$item" -} - -get_file_from_git() -{ - local FILE=$1 - local URL=$2 - local status=0 - if [[ -z "$FILE" ]] || [[ -z "$URL" ]] - then - echo "* missing FILE or URL argument" - status=1 - else - if [ -r "$FILE" ] - then - echo "* file $FILE exists, not overwriting" - else - echo "* getting init file $FILE" - curl -q -s -k -o "$FILE" "$URL" - status=$? - fi - fi - return $status -} - -do_clean() -{ - local WHAT=$1 - if [ -z "$WHAT" ] - then - WHAT="clean" - fi - - echo "* executing $WHAT, removing remote_state.*" - echo -n "> " - for f in $(ls remote_state.* -d) - do - rm $f && echo -n " $f" - done - echo "" - - echo "* executing $WHAT, removing links" - echo -n "> " - for f in $(find . 
-maxdepth 1 -type l -print) - do - rm $f && echo -n " $f" - done - echo "" - return 0 -} - -do_superclean() -{ - do_clean superclean - echo "* executing superclean, removing logs, .terraform files, terraform.tfstate files" - echo -n "> " - for f in $(ls -d logs .terraform .terraform*hcl terraform.tfstate* 2> /dev/null) - do - rm -rf $f && echo -n " $f" - done - echo "" - return 0 -} - -do_help() -{ - local ACTIONS=$@ - echo "* help: $THIS $VERSION" - echo " tf-run: ACTION [list | start_number | tag:start_tag] [end_number | tag:end_tag | only | +N]" - echo " ACTION = plan | apply | destroy | list | init | init-upgrade | clean | superclean | tags" - echo "" - echo " init: get a base tf-run.data if none exists, creates region.tf if not exist, creates locals.tf if not exists; also gets .tf-control* files" - echo " init-upgrade: get the .tf-control* files needed for using TF 1.x" - echo " check: looks over module calls for proper use, proper versions, other things" - echo " list: same as '$THIS plan list'" - echo " plan: run through the contents of tf-run.data and do a plan for each" - echo " apply: run through the contents of tf-run.data and do a apply for each" - echo " destroy: looks for a tf-run.destroy.data, and removes in that specific order. If missing, it attempts all at once." - echo " clean: removes remote_state.*, links." - echo " superclean: removes remote_state.*, links, logs/, .terraform files, terraform.tfstate files" - echo " tags: get a list of tags and its respective step number" - echo "" - echo " arguments:" - echo " * list: list out all of the steps, but don't execute anything" - echo " * start_number: start executing ACTION at step number start_number" - echo " * tag:start_tag: start executing ACTION at tag labelled start_tag. The 'tag:' must be present for this option" - echo " * end_number: stop executing ACTION after step number end_number" - echo " * tag:end_tag: stop executing ACTION on the step before the tag labelled end_tag. 
The 'tag:' must be present for this option" - echo " * only: execute ACTION for only the one step indicated by start" - echo " * +N: execute ACTION starting at start and ending at start + N" - echo "" - return 0 -} - -ask_continue() { - local continue_status=0 - local PREFIX=$1 - local DURATION=$2 - local DEFAULT=$3 - if [ -z $DURATON ] - then - DURATION=10 - fi - if [ -z $DEFAULT ] - then - DEFAULT="y" - fi -# echo "" -# read -n 1 -p "${PREFIX}continue [y|n: default=$DEFAULT]? " -t $DURATION CONTINUE < /dev/tty - read -n 1 -p "${PREFIX}continue [y|n: default=$DEFAULT]? " -t $DURATION CONTINUE - continue_status=$? - echo "" - - if [ -z $CONTINUE ] - then - CONTINUE=$DEFAULT - fi - if [[ $continue_status != 0 ]] - then - CONTINUE=$DEFAULT - else - if [[ $CONTINUE != "y" ]] && [[ $CONTINUE != "n" ]] - then - CONTINUE="n" - fi - fi -# echo "value=$CONTINUE status=$continue_status" -} - -umask 002 - -THIS=$(basename $0) -VERSION="1.13.13" -LOGDIR="logs" -if [ ! -d $LOGDIR ] -then - mkdir -p $LOGDIR -fi -#GITSYSTEM="gitlab" -GITSYSTEM="github" - -ACTION=$1 -if [ -z "$ACTION" ] -then - echo "* missing ACTION (plan | apply | destroy)" - exit 1 -fi - -if [[ ! 
-z "$ACTION" ]] && [[ "$ACTION" == "help" ]] -then - do_help - exit 0 -fi - -if [ $ACTION == "list" ] -then - ACTION="plan" - START="list" -fi - -# if [[ $ACTION == "plan" ]] || [[ $ACTION == "apply" ]] || [[ $ACTION == "init" ]] || [[ $ACTION == "destroy" ]] || [[ $ACTION == "clean" ]] || [[ $ACTION = "superclean" ]] -ALL_ACTIONS=(plan apply init init-upgrade check destroy clean superclean tags) -action_valid=0 -for a in "${ALL_ACTIONS[@]}" -do - if [ "$ACTION" == $a ] - then - action_valid=1 - break - fi -done - -if [ $action_valid == 1 ] -then - echo "* running action=$ACTION" -else - echo "* invalid action=$ACTION" - echo "" - do_help - exit 1 -fi - -YMDSTAMP=$(date +%Y%m%d) -stime=$(date +%s) -STAMP="$YMDSTAMP.$stime" -LOGFILE="$LOGDIR/run.$ACTION.$STAMP.log" - -# from: https://stackoverflow.com/questions/25833676/redirect-echo-output-in-shell-script-to-logfile -if [[ "$START" != "list" ]] && [[ "$ACTION" != "init" ]] -then - exec > >(tee -i $LOGFILE) - exec 2>&1 -else - LOGFILE="$LOGFILE (not-created)" -fi - -echo "* START: $THIS $VERSION start=$stime end=$etime logfile=$LOGFILE" -[[ ! -z $PROFILE ]] || get_profile -[[ ! 
-z $REGION ]] || get_region - -if [ $ACTION == "init" ] -then - get_git_root - - if [ $GITSYSTEM == "github" ] - then - get_file_from_git tf-run.data "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/tf-run/applications/base/tf-run.data" - get_file_from_git region.tf "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/tf-run/applications/base/region.tf" - get_file_from_git locals.tf "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/tf-run/applications/base/locals.tf" - get_file_from_git versions.tf "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/tf-run/applications/base/versions.tf" - elif [ $GITSYSTEM == "gitlab" ] - then - get_file_from_git tf-run.data "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/tf-run/applications/base/tf-run.data" - get_file_from_git region.tf "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/tf-run/applications/base/region.tf" - get_file_from_git locals.tf "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/tf-run/applications/base/locals.tf" - get_file_from_git versions.tf "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/tf-run/applications/base/versions.tf" - fi - - if [ ! -r "$TOP/.tf-control" ] - then - if [ $GITSYSTEM == "github" ] - then - get_file_from_git .tf-control "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control" - elif [ $GITSYSTEM == "gitlab" ] - then - get_file_from_git .tf-control "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control" - fi - fi - if [ ! 
-r "$TOP/.tf-control.tfrc" ] - then - if [ $GITSYSTEM == "github" ] - then - get_file_from_git .tf-control.tfrc "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control.tfrc" - elif [ $GITSYSTEM == "gitlab" ] - then - get_file_from_git .tf-control.tfrc "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control.tfrc" - fi - fi - exit 0 -fi - -if [ $ACTION == "init-upgrade" ] -then - get_git_root - if [ ! -r "$TOP/.tf-control" ] - then - if [ $GITSYSTEM == "github" ] - then - get_file_from_git .tf-control "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control" - elif [ $GITSYSTEM == "gitlab" ] - then - get_file_from_git .tf-control "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control" - fi - fi - if [ ! 
-r "$TOP/.tf-control.tfrc" ] - then - if [ $GITSYSTEM == "github" ] - then - get_file_from_git .tf-control.tfrc "https://${GITSYSTEM}.e.it.census.gov/raw/terraform/support/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control.tfrc" - elif [ $GITSYSTEM == "gitlab" ] - then - get_file_from_git .tf-control.tfrc "https://${GITSYSTEM}.e.it.census.gov/terraform/support/-/raw/master/local-app/aws-account-setup/ansible/roles/setup-git-repo/files/.tf-control.tfrc" - fi - fi - exit 0 -fi - -if [ $ACTION == "check" ] -then - TF_UPGRADE_MODULES="" - TF_UPGRADE_MODULES+=" aws-common-security-groups" - TF_UPGRADE_MODULES+=" aws-edl-launch-instance" - TF_UPGRADE_MODULES+=" aws-iam-role" - TF_UPGRADE_MODULES+=" aws-iam-user" - TF_UPGRADE_MODULES+=" aws-inf-setup" - TF_UPGRADE_MODULES+=" aws-s3" - TF_UPGRADE_MODULES+=" aws-setup-s3-object-logging" - TF_UPGRADE_MODULES+=" aws-tls-certificate" - TF_UPGRADE_MODULES+=" aws-vpc-setup" - TF_UPGRADE_MODULES+=" dns-lookup" - TF_UPGRADE_MODULES+=" aws-ecr-copy-images" - _TFSTRING="" - for f in $TF_UPGRADE_MODULES - do - if [ ! -z $_TFSTRING ] - then - _TFSTRING+="|" - fi - _TFSTRING+="$f" - done - - echo -n "* [check] outdated .tf with git::https format: " - c=$(cat *.tf 2>/dev/null| grep -cE '^[^#]*source.*git::https') - if [ $c == 0 ] - then - echo "OK" - else - echo "NOT-OK" - echo "* found $c source statements, please fix to git@ format." 
- grep -E '^[^#]*source.*git::https' *.tf - fi - echo "" - - echo -n "* [check] for .tf for eligible modules not using ?ref=tf-upgrade: " - c=$(cat *.tf | grep -E "^[^#]*source.*git.*($_TFSTRING)" | grep -c -v ref=tf-upgrade) - if [ $c == 0 ] - then - echo "OK" - else - echo "NOT-OK" - echo "* found $c source statements not referencing ref=tf-upgrade, please verify with the list at" - echo " https://${GITSYSTEM}.e.it.census.gov/terraform/support/blob/master/docs/how-to/terraform-upgrade/upgrade-code.md#modules" - echo " and change accordingly if the module here is on the list." - grep -E "^[^#]*source.*git.*($_TFSTRING)" *.tf | grep -v ref=tf-upgrade - fi - echo "" - exit 0 -fi - -if [[ $ACTION == "clean" ]] || [[ $ACTION == "superclean" ]] -then - echo ""; echo "About to execute $ACTION. This is destructive and will remove files." - ask_continue "Continue (y|n)? " "" "n" - if [ $CONTINUE == "y" ] - then - if [ $ACTION == "clean" ] - then - do_clean - else - do_superclean - fi - else - echo "* action $ACTION declined" - fi - exit 0 -fi - -TFRUNFILE_VERSION="" -if [ $ACTION == "destroy" ] -then - if [ -r "tf-run.destroy.data" ] - then - RUNFILE="tf-run.destroy.data" - else - TFRUNFILE_VERSION="generated.$STAMP" - RUNFILE="ALL" - fi -else - RUNFILE="tf-run.data" -fi - -# read file tf-run.data -declare -a targets=() -declare -a targets_status=() -declare -A tags -# TFRUNFILE_VERSION="" -# RUNFILE="tf-run.data" -if [ -r $RUNFILE ] -then - c=1 - pos=1 - echo "* reading from $RUNFILE" - while IFS="" read line - do - nline=$(echo $line | sed -e 's/^#.*$//') - if [ ! 
-z "$nline" ] - then - targets+=( "$line" ) - targets_status+=0 - words=( $line ) - pos=$(( pos + 1 )) - if [ ${words[0]} == "VERSION" ] - then - TFRUNFILE_VERSION=${words[1]} - pos=$(( pos - 1 )) - fi - if [ ${words[0]} == "TAG" ] - then - pos=$(( pos - 1 )) - tags[${words[1]}]=$pos - fi - fi - c=$(( $c + 1 )) - done < $RUNFILE - echo "* read ${#targets[@]} entries from $RUNFILE" -elif [ "$RUNFILE" == "ALL" ] - then - targets+=( "ALL" ) - targets_status+=0 - c=1 -else - echo "* unable to open tf-run.data, exiting" - exit 1 -fi -TOTAL_TARGETS=${#targets[@]} - -if [ $ACTION == "tags" ] -then - echo "* available TAGS and step numbers" - for t in "${!tags[@]}" - do - echo "TAG $t = ${tags[$t]}" - done - echo "" - exit 0 -fi - -if [ -z "$START" ] -then - START=$2 -fi -if [[ ! -z "$START" ]] && [[ "$START" == "list" ]] -then - LIST=1 - echo "> list" -elif [ ! -z "$START" ] -then - LIST=0 - - if [ $(echo "$START" | grep -c "^tag:" ) -gt 0 ] - then - START_TAG=$(echo "$START" | sed -e 's/^tag://') - if [ -z "$START_TAG" ] - then - echo "* start tag:NAME used but NAME is missing" - fi - START=${tags[$START_TAG]} - if [ -z "$START" ] - then - echo "* start tag:$START_TAG used but tag is not found" - fi - fi -else - LIST=0 -fi - -START=$(( START * 1 )) - -END=$3 -if [ ! 
-z "$END" ] -then - if [ "$END" == "only" ] - then - END=$START - elif [ $(echo "$END" | grep -c "^+") -gt 0 ] - then - NEND=$(echo "$END" | sed -e 's/^+//') - END=$(( START + NEND )) - elif [ $(echo "$END" | grep -c "^tag:" ) -gt 0 ] - then - END_TAG=$(echo "$END" | sed -e 's/^tag://') - if [ -z "$END_TAG" ] - then - echo "* end tag:NAME used but NAME is missing" - fi - END=$(( ${tags[$END_TAG]} - 1 )) - if [ -z "$END" ] - then - echo "* end tag:$END_TAG used but tag is not found" - fi - fi -fi -END=$(( END * 1 )) - -## c=1 -## for t in "${targets[@]}" -## do -## tfargs="" -## if [ "$t" != "ALL" ] -## then -## for tt in $t -## do -## tfargs+="-target=$tt " -## done -## fi -## echo "* $c tf-$ACTION $tfargs" -## c=$(( $c + 1 )) -## done -## exit 0 -## fi - -# add to history: .tf-run.history -TFR_HISTORY=".tf-run.history" - -#if [ ! -r $TFR_HISTORY ] -#then -# echo "timestamp,username,action,start,end,status" >> $TFR_HISTORY -#fi -#echo "$(date +%s),$USER,$ACTION,$START,$END,start" >> $TFR_HISTORY - -status=0 -echo ">> START: start_time=$stime version=$VERSION data.version=$TFRUNFILE_VERSION start=$START end=$END start_tag=$START_TAG" -echo "- profile=$PROFILE region=$REGION short_region=$SHORT_REGION" -c=0 -for t in "${targets[@]}" -do - c=$(( $c + 1 )) - next=$(( $c + 1 )) - - target=$t - words=( $t ) - w=${words[0]} - rest=$(echo "${words[@]:1}" | sed -e "s/%%NEXT%%/$next/g") - rest=$(replace_placeholders "$rest") - - if [[ "$w" == "VERSION" ]] || [[ "$w" == "TAG" ]] - then - TOTAL_TARGETS=$(( $TOTAL_TARGETS - 1 )) - fi - if [[ ! -z $START ]] && [[ $START -gt $c ]] - then - if [[ $w == "VERSION" ]] || [[ $w == "TAG" ]] - then - c=$(( $c - 1 )) - fi - continue - fi - if [[ ! 
-z $END ]] && [[ $END != 0 ]] && [[ $c -gt $END ]] - then - break -# echo "break c=$c end=$END" -# else -# echo "not break c=$c end=$END" - fi - - case $w in - REMOTE-STATE) - echo "> [$c] $w> generate-remote-state" - if [ $LIST == 0 ] - then - if [ -r ../remote_state.yml ] - then - cat ../remote_state.yml | sed -E s#\(^directory.*\)\"#\\1/$(basename $(pwd))\"# > remote_state.yml - status=$? - echo -n "* generated line: "; grep ^directory remote_state.yml - echo ""; echo "= Complete: $c $w> $rest | status=$status" - else - echo "* missing parent remote_state.yml, exiting" - status=1 - exit $status - fi - targets_status[$c]=$status - fi - continue - ;; - BACKUP-STATE) - echo "> [$c] $w> backup-state to $LOGDIR/backup.$STAMP.tfstate" - if [ $LIST == 0 ] - then - TFNOLOG=1 tf-state pull > $LOGDIR/backup.$STAMP.tftate - status=$? - echo ""; echo "= Complete: $c $w> backup-state | status=$status" - targets_status[$c]=$status - fi - c=$(( $c - 1 )) - continue - ;; - COMMAND) - echo "> [$c] $w> $rest" - if [ $LIST == 0 ] - then - $rest - status=$? - echo ""; echo "= Complete: $c $w> $rest | status=$status" - targets_status[$c]=$status - fi -# should check to see if we are running tf-directory-setup.py, so grab this stuff after each command if missing - [[ ! -z $PROFILE ]] || get_profile - [[ ! -z $REGION ]] || get_region - continue - ;; - LINKTOP) - LINKARG=$(replace_placeholders ${words[@]:1:1}) - get_relative_to_git_root - echo "> [$c] $w> ln -sf ${RELATIVE_PATH}${LINKARG} ./" - if [ $LIST == 0 ] - then - if [ -e "${RELATIVE_PATH}${LINKARG}" ] - then - ln -sf ${RELATIVE_PATH}${LINKARG} ./ - status=$? 
- else - echo "* linked-to file ${RELATIVE_PATH}${LINKARG} does not exist, skipping" - status=0 - fi - echo ""; echo "= Complete: $c $w> relative-link-to-top $LINKARG | status=$status" - targets_status[$c]=$status - fi - continue - ;; - LINK) - LINKARG=$(replace_placeholders ${words[@]:1:1}) - get_relative_directory $LINKARG - echo "> [$c] $w> ln -sf ${RELATIVE_PATH}${LINKARG} ./" - if [ $LIST == 0 ] - then - if [[ -e "${RELATIVE_PATH}${LINKARG}" ]] && [[ ! -z "${RELATIVE_PATH}" ]] - then -# don't make link if the file is found in the current directory as we'll make a looping link -# the get_relative_directory is looking for the real file in the hierarchy - if [ "${RELATIVE_PATH}" != "./" ] - then - ln -sf ${RELATIVE_PATH}${LINKARG} ./ - status=$? - fi - else - echo "* linked-to file ${RELATIVE_PATH}${LINKARG} does not exist in current or parent dirs, skipping" - status=0 - fi - echo ""; echo "= Complete: $c $w> relative-link $LINKARG | status=$status" - targets_status[$c]=$status - fi - continue - ;; - VERSION) - c=$(( $c - 1 )) - continue - ;; - TAG) -# this is a placeholder, almost like a go-to, but it does not increment the number - echo "" - echo "# [$w] $rest" - c=$(( $c - 1 )) - targets_status[$c]=$status - continue - ;; - COMMENT) - echo "> [$c] $w> $rest" - status=$? -## echo ""; echo "= Complete: $c $w> $rest | status=$status" - targets_status[$c]=$status - continue - ;; - CHECK) - echo "> [$c] $w> $rest" - status=$? - targets_status[$c]=$status - if [ $LIST == 0 ] - then - echo ""; echo "= Complete: $c $w> $rest | status=$status" - ask_continue "} Next: $next, " - if [ $CONTINUE == "n" ] - then - break - fi - fi - continue - ;; - PAUSE) - if [ -z $rest ] - then - SLEEP=15 - else - SLEEP=$rest - fi - if [ $LIST == 0 ] - then - echo "> [$c] $w> sleeping for $SLEEP" - sleep $SLEEP - else - echo "> [$c] $w> would sleep for $SLEEP (actual sleep 2)" - sleep 2 - fi - status=$? 
- targets_status[$c]=$status - if [ $LIST == 0 ] - then - echo ""; echo "= Complete: $c $w> $rest | status=$status" - ask_continue "} Next: $next, " - if [ $CONTINUE == "n" ] - then - break - fi - fi - continue - ;; - STOP) - echo "> [$c] $w> $rest" - status=$? - if [ $LIST == 0 ] - then - echo ""; echo "= Complete: $c $w> $rest | status=$status" - echo "- Continue $next: $THIS $ACTION $next" - break - else - continue - fi - ;; - CLEAN) - ;; - POLICY) - PFILES="${words[@]:1}" - if [ -z $PFILES ] - then - PFILES="*.tf" - fi - PTARGETS=( $(grep -iE "^resource\b.*aws_iam_policy\b" $PFILES | awk '{print $2 "." $3}' |sed -e 's/"//g') ) - status=$? - echo "> [$c] $w> ($PFILES) ${PTARGETS[@]}" - target="${PTARGETS[@]}" - targets_status[$c]=$status - if [ -z "$target" ] - then - if [ $LIST == 0 ] - then - echo "= No policy targets found, skipping this step | status=$status" - echo "- Continue $next: $THIS $ACTION $next" - fi - continue - fi - ;; - *) - ;; - esac - - if [ $status != 0 ] - then - echo "* error encountered, status=$status; exiting" - exit $status - fi - - tfargs="" - if [ "$t" != "ALL" ] - then - for tt in $target - do - tfargs+="-target=$tt " - done - fi - if [ $LIST == 1 ] - then - echo "> [$c] tf-$ACTION $TFOPTIONS $tfargs" - continue - fi - - echo "> [$c] tf-$ACTION $TFOPTIONS $tfargs" - if [ -z $DRY_RUN ] - then - tf-$ACTION $TFOPTIONS $tfargs - else - echo " (dry-run)" - fi - status=$? 
- targets_status[$c]=$status - - if [ $status != 0 ] - then - echo "> [$c] exiting status=$status" - break - fi - - if [ -z $DRY_RUN ] - then - echo ""; echo "= Complete: $c $w> $rest | status=$status" - ask_continue "} Next: $next, " -# ask_continue - fi - - if [ $CONTINUE == "n" ] - then - break - fi -done - -etime=$(date +%s) -xtime=$(( $etime - $stime )) -if [ $c -ge $TOTAL_TARGETS ] -then - if [ $LIST == 0 ] - then - echo "<< COMPLETE $c/$TOTAL_TARGETS targets" - fi -else - if [ $LIST == 0 ] - then - echo "<< INCOMPLETE $c/$TOTAL_TARGETS last_item=$c" - fi -fi -echo "<< END: start_time=$stime end_time=$etime elapsed=$xtime logfile=$LOGFILE status=$status" -exit $status - -## TO DO -# add LINK to make a link to a file in successive parent directories (variables.vpc.tf, for example) -# make regionshort -# use get_region and get_profile
diff --git a/service-catalog/executor-template.yaml b/service-catalog/executor-template.yaml
new file mode 100644
index 0000000..06d55c2
--- /dev/null
+++ b/service-catalog/executor-template.yaml
@@ -0,0 +1,142 @@
+# Service Catalog product template: the "Apply" half of the propose/apply pair.
+# Its single custom resource forwards the form values below to the Lambda
+# named in ServiceToken (tf-run-executor-trigger).
+AWSTemplateFormatVersion: '2010-09-09'
+Description: >-
+  Service Catalog Product: Apply Terraform changes for an account repo layer.
+  Clones the account repo at main, optionally assumes a cross-account role,
+  and runs tf-run apply. Intended for use AFTER the corresponding Propose
+  product's pull request has been reviewed and merged.
+
+# Console presentation only: grouping, ordering, and labels of the form fields.
+Metadata:
+  AWS::CloudFormation::Interface:
+    ParameterGroups:
+      - Label:
+          default: "Target Repository"
+        Parameters:
+          - AccountRepo
+          - Layer
+          - RegionDir
+      - Label:
+          default: "Execution Options"
+        Parameters:
+          - TargetAccountId
+          - TfRunStartTag
+          - DryRun
+
+    ParameterLabels:
+      AccountRepo:
+        default: "Account Repo Name"
+      Layer:
+        default: "Terraform Layer"
+      RegionDir:
+        default: "Region Directory"
+      TargetAccountId:
+        default: "Target AWS Account ID (optional)"
+      TfRunStartTag:
+        default: "tf-run Start Tag (optional)"
+      DryRun:
+        default: "Dry Run (plan only)"
+
+Parameters:
+  AccountRepo:
+    Type: String
+    Description: >-
+      Account repo name, e.g. 229685449397-csvd-dev-platform-dev-gov.
+      Must already exist in the SCT-Engineering GitHub org.
+    AllowedPattern: '^[a-z0-9][a-z0-9-]*[a-z0-9]$'
+    ConstraintDescription: Lowercase letters, numbers, and hyphens only
+    MinLength: 3
+    MaxLength: 100
+
+  Layer:
+    Type: String
+    Description: Terraform layer to run tf-run apply in
+    AllowedValues:
+      - common
+      - infrastructure
+      - vpc
+
+  RegionDir:
+    Type: String
+    Description: Region directory within the layer
+    AllowedValues:
+      - east
+      - west
+      - global
+
+  TargetAccountId:
+    Type: String
+    Description: >-
+      AWS account ID that CodeBuild should run terraform apply against.
+      When set, CodeBuild assumes arn:{partition}:iam::{TargetAccountId}:role/sc-automation-codebuild-role
+      before executing tf-run. That role must exist in the target account with a
+      trust policy allowing the CodeBuild IAM role from csvd-dev (229685449397).
+      Leave blank to run with the CodeBuild role's own credentials (csvd-dev only).
+    Default: ""
+    MaxLength: 12
+    AllowedPattern: '^[0-9]{12}$|^$'
+    ConstraintDescription: Must be a 12-digit AWS account ID or empty
+
+  # Free-text value forwarded to the custom resource as tf_run_start_tag, so it
+  # is restricted to a conservative set (no whitespace or shell metacharacters).
+  # NOTE(review): widen the class if tf-run.data uses other characters in TAGs.
+  TfRunStartTag:
+    Type: String
+    Description: >-
+      tf-run.data TAG label to start execution from.
+      Leave blank to run all steps from the beginning.
+    Default: ""
+    MaxLength: 100
+    AllowedPattern: '^[A-Za-z0-9][A-Za-z0-9_.-]*$|^$'
+    ConstraintDescription: Letters, numbers, dots, underscores, and hyphens only, or empty
+
+  # Declared as a String with two allowed values: CloudFormation has no
+  # boolean parameter type.
+  DryRun:
+    Type: String
+    Description: >-
+      Set to 'true' to run tf plan only (no apply).
+      Useful for validating before committing to an apply.
+    AllowedValues:
+      - "true"
+      - "false"
+    Default: "false"
+
+Resources:
+  ApplyResource:
+    Type: Custom::TerraformApply
+    Properties:
+      ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:tf-run-executor-trigger"
+      # action is fixed for this product and is not a form field; a plan-only
+      # run is requested through dry_run below, not by changing action.
+      action: apply
+      account_repo: !Ref AccountRepo
+      layer: !Ref Layer
+      region_dir: !Ref RegionDir
+      target_account_id: !Ref TargetAccountId
+      tf_run_start_tag: !Ref TfRunStartTag
+      dry_run: !Ref DryRun
+
+# Each !GetAtt name must match a key the custom resource returns in its
+# response data. NOTE(review): the casing is mixed (ApplyStatus vs
+# repository_url) - confirm against the Lambda's response.
+Outputs:
+  ApplyStatus:
+    Description: Result of the tf-run apply ("SUCCEEDED" or "FAILED")
+    Value: !GetAtt ApplyResource.ApplyStatus
+    Export:
+      Name: !Sub '${AWS::StackName}-ApplyStatus'
+
+  RepositoryUrl:
+    Description: URL of the account repository
+    Value: !GetAtt ApplyResource.repository_url
+    Export:
+      Name: !Sub '${AWS::StackName}-RepositoryUrl'
+
+  CodeBuildBuildId:
+    Description: ID of the CodeBuild executor build
+    Value: !GetAtt ApplyResource.CodeBuildBuildId
+    Export:
+      Name: !Sub '${AWS::StackName}-CodeBuildBuildId'
diff --git a/service-catalog/product-template.yaml b/service-catalog/product-template.yaml index a137a5f..92c014d 100644 --- a/service-catalog/product-template.yaml +++ b/service-catalog/product-template.yaml @@ -11,11 +11,17 @@ Metadata: - Layer - RegionDir - Label: - default: "Execution Options" + default: "Execution Environment" Parameters: - GitBranch - TfRunStartTag - DryRun + - TargetAccountId + - Label: + default: "Template Repository (optional)" + Parameters: + - TemplateRepo + - TemplateVars - Label: default: "Extra Files (optional)" Parameters: @@ -34,6 +40,12 @@ Metadata: default: "tf-run Start Tag (optional)" DryRun: default: "Dry Run (plan only)" + TargetAccountId: + default: "Target AWS Account ID (optional)" + TemplateRepo: + default: "Template
Repository Name (optional)" + TemplateVars: + default: "Template Variables (JSON)" ExtraFiles: default: "Extra Config Files (JSON)" @@ -58,10 +70,11 @@ Parameters: RegionDir: Type: String - Description: Region directory within the layer + Description: Region directory within the layer (use 'global' for non-regional resources like SSO/IAM) AllowedValues: - east - west + - global GitBranch: Type: String @@ -88,11 +101,42 @@ Parameters: - "false" Default: "false" + TargetAccountId: + Type: String + Description: >- + AWS account ID that CodeBuild should run terraform apply against. + When set, CodeBuild assumes arn:{partition}:iam::{TargetAccountId}:role/sc-automation-codebuild-role + before executing tf-run. That role must exist in the target account with a + trust policy allowing the CodeBuild IAM role from csvd-dev (229685449397). + Leave blank to run with the CodeBuild role's own credentials (csvd-dev only). + Default: "" + MaxLength: 12 + AllowedPattern: '^[0-9]{12}$|^$' + ConstraintDescription: Must be a 12-digit AWS account ID or empty + + TemplateRepo: + Type: String + Description: >- + Name of a GHE repo (in the same org) containing template files. + Files ending in .j2 are rendered as Jinja2 templates using TemplateVars. + All other files are copied as-is. Results land in the account repo at + the same relative paths before tf-run executes. + Leave blank to skip template rendering. + Default: "" + MaxLength: 100 + + TemplateVars: + Type: String + Description: >- + JSON object of variables passed to Jinja2 when rendering .j2 files + from the TemplateRepo. Example: {"cluster_name": "my-eks", "env": "dev"} + Default: "{}" + ExtraFiles: Type: String Description: >- - JSON object mapping relative repo paths to file contents. - These files are written into the account repo before tf-run executes. + JSON object mapping relative repo paths to raw file contents. + Applied after TemplateRepo rendering; keys here override template output. 
Example: {"vpc/west/my-config.tf": "# placeholder"} Default: "{}" @@ -109,6 +153,9 @@ Resources: region_dir: !Ref RegionDir git_branch: !Ref GitBranch tf_run_start_tag: !Ref TfRunStartTag + template_repo: !Ref TemplateRepo + template_vars: !Ref TemplateVars + target_account_id: !Ref TargetAccountId dry_run: !Ref DryRun extra_files: !Ref ExtraFiles diff --git a/service-catalog/proposer-template.yaml b/service-catalog/proposer-template.yaml new file mode 100644 index 0000000..bc3dd50 --- /dev/null +++ b/service-catalog/proposer-template.yaml @@ -0,0 +1,145 @@ +AWSTemplateFormatVersion: '2010-09-09' +Description: >- + Service Catalog Product: Propose Terraform changes for an account repo layer. + Clones the account repo, renders template files, writes extra config files, + commits to a branch, and opens a pull request for human review. + A separate "Apply" product executes the changes after the PR is merged. + +Metadata: + AWS::CloudFormation::Interface: + ParameterGroups: + - Label: + default: "Target Repository" + Parameters: + - AccountRepo + - Layer + - RegionDir + - Label: + default: "Proposal Branch" + Parameters: + - GitBranch + - Label: + default: "Template Repository (optional)" + Parameters: + - TemplateRepo + - TemplateVars + - Label: + default: "Extra Files (optional)" + Parameters: + - ExtraFiles + + ParameterLabels: + AccountRepo: + default: "Account Repo Name" + Layer: + default: "Terraform Layer" + RegionDir: + default: "Region Directory" + GitBranch: + default: "Proposal Branch Name" + TemplateRepo: + default: "Template Repository Name (optional)" + TemplateVars: + default: "Template Variables (JSON)" + ExtraFiles: + default: "Extra Config Files (JSON)" + +Parameters: + AccountRepo: + Type: String + Description: >- + Account repo name, e.g. 229685449397-csvd-dev-platform-dev-gov. + Must already exist in the SCT-Engineering GitHub org. 
+ AllowedPattern: '^[a-z0-9][a-z0-9-]*[a-z0-9]$' + ConstraintDescription: Lowercase letters, numbers, and hyphens only + MinLength: 3 + MaxLength: 100 + + Layer: + Type: String + Description: Terraform layer to write files into + AllowedValues: + - common + - infrastructure + - vpc + + RegionDir: + Type: String + Description: Region directory within the layer + AllowedValues: + - east + - west + - global + + GitBranch: + Type: String + Description: Branch to commit proposed changes to and open the PR from + Default: propose/sc-automation + MinLength: 1 + MaxLength: 100 + + TemplateRepo: + Type: String + Description: >- + Name of a GHE repo (in the same org) containing template files. + Files ending in .j2 are rendered as Jinja2 templates using TemplateVars. + All other files are copied as-is. Results land in the account repo at + the same relative paths. + Leave blank to skip template rendering. + Default: "" + MaxLength: 100 + + TemplateVars: + Type: String + Description: >- + JSON object of variables passed to Jinja2 when rendering .j2 files + from the TemplateRepo. Example: {"cluster_name": "my-eks", "env": "dev"} + Default: "{}" + + ExtraFiles: + Type: String + Description: >- + JSON object mapping relative repo paths to raw file contents. + Applied after TemplateRepo rendering; keys here override template output. 
+ Example: {"vpc/west/my-config.tf": "# placeholder"} + Default: "{}" + +Resources: + ProposeResource: + Type: Custom::TerraformPropose + Properties: + ServiceToken: !Sub "arn:${AWS::Partition}:lambda:${AWS::Region}:${AWS::AccountId}:function:tf-run-executor-trigger" + # action is hardcoded — this product always proposes (open PR only) + action: propose + account_repo: !Ref AccountRepo + layer: !Ref Layer + region_dir: !Ref RegionDir + git_branch: !Ref GitBranch + template_repo: !Ref TemplateRepo + template_vars: !Ref TemplateVars + extra_files: !Ref ExtraFiles + +Outputs: + PullRequestUrl: + Description: URL of the pull request opened by the proposer + Value: !GetAtt ProposeResource.pull_request_url + Export: + Name: !Sub '${AWS::StackName}-PullRequestUrl' + + RepositoryUrl: + Description: URL of the account repository + Value: !GetAtt ProposeResource.repository_url + Export: + Name: !Sub '${AWS::StackName}-RepositoryUrl' + + BranchName: + Description: Branch that was committed to + Value: !GetAtt ProposeResource.branch_name + Export: + Name: !Sub '${AWS::StackName}-BranchName' + + CodeBuildBuildId: + Description: ID of the CodeBuild proposer build + Value: !GetAtt ProposeResource.CodeBuildBuildId + Export: + Name: !Sub '${AWS::StackName}-CodeBuildBuildId'