diff --git a/.checkov.yml b/.checkov.yml new file mode 100644 index 00000000..cc000299 --- /dev/null +++ b/.checkov.yml @@ -0,0 +1,24 @@ +branch: master +download-external-modules: true +evaluate-variables: true +external-checks-dir: + - security/custom_checks +framework: + - terraform + - kubernetes +output: + - cli + - json + - junitxml +skip-check: + - CKV_AWS_79 # Instance Metadata Service Version 1 + - CKV_AWS_130 # Ensure VPC subnets are not assigned public IP by default +quiet: true +compact: true +directory: + - . + - modules/* +secrets-scan-file-type: + - tf + - yaml + - json diff --git a/.github/platform-tg-infra.code-workspace b/.github/platform-tg-infra.code-workspace index 5047434c..e7bd7b97 100644 --- a/.github/platform-tg-infra.code-workspace +++ b/.github/platform-tg-infra.code-workspace @@ -2,7 +2,7 @@ "folders": [ { "name": "platform-tg-infra", - "path": "../" + "path": ".." }, { "name": "tfmod-cert-mgr", @@ -28,6 +28,10 @@ "name": "tfmod-eks-dns", "path": "../../tfmod-eks-dns" }, + { + "name": "tfmod-gogatekeeper", + "path": "../../tfmod-gogatekeeper" + }, { "name": "tfmod-grafana", "path": "../../tfmod-grafana" @@ -48,6 +52,10 @@ "name": "tfmod-karpenter", "path": "../../tfmod-karpenter" }, + { + "name": "tfmod-keycloak", + "path": "../../tfmod-keycloak" + }, { "name": "tfmod-kiali", "path": "../../tfmod-kiali" @@ -69,13 +77,25 @@ "path": "../../tfmod-tempo" }, { + "name": "terraform-aws-eks", "path": "../../terraform-aws-eks" }, { + "name": "karpenter-provider-aws", "path": "../../karpenter-provider-aws" }, { + "name": "terragrunt", "path": "../../terragrunt" + }, + { + "path": "../../terraform-aws-rds" + }, + { + "path": "../../aws-rds" + }, + { + "path": "../../morpheus-terraform-dev" } ] } diff --git a/configs/node-groups.yaml b/configs/node-groups.yaml new file mode 100644 index 00000000..11e09cad --- /dev/null +++ b/configs/node-groups.yaml @@ -0,0 +1,48 @@ +nodeGroups: + - name: general-purpose + instanceTypes: + - m6i.xlarge + - 
m6a.xlarge + - m5.xlarge + minSize: 2 + maxSize: 10 + desiredSize: 2 + labels: + node-type: general + taints: [] + updateConfig: + maxUnavailable: 1 + + - name: compute-optimized + instanceTypes: + - c6i.2xlarge + - c6a.2xlarge + - c5.2xlarge + minSize: 1 + maxSize: 20 + desiredSize: 2 + labels: + node-type: compute + taints: + - key: workload + value: batch + effect: NoSchedule + updateConfig: + maxUnavailable: 2 + + - name: memory-optimized + instanceTypes: + - r6i.2xlarge + - r6a.2xlarge + - r5.2xlarge + minSize: 1 + maxSize: 10 + desiredSize: 2 + labels: + node-type: memory + taints: + - key: workload + value: memory-intensive + effect: NoSchedule + updateConfig: + maxUnavailable: 1 diff --git a/configs/resource-quotas.yml b/configs/resource-quotas.yml new file mode 100644 index 00000000..655595d0 --- /dev/null +++ b/configs/resource-quotas.yml @@ -0,0 +1,36 @@ +apiVersion: v1 +kind: ResourceQuota +metadata: + name: default-quota +spec: + hard: + requests.cpu: "20" + requests.memory: 40Gi + limits.cpu: "40" + limits.memory: 80Gi + pods: "100" + services: "50" + secrets: "100" + configmaps: "100" + persistentvolumeclaims: "50" + +--- +apiVersion: v1 +kind: LimitRange +metadata: + name: default-limits +spec: + limits: + - type: Container + default: + cpu: 500m + memory: 512Mi + defaultRequest: + cpu: 100m + memory: 256Mi + max: + cpu: "4" + memory: 8Gi + min: + cpu: 50m + memory: 64Mi diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 00000000..8ea6c671 --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,88 @@ +# Platform Infrastructure Architecture + +## Complete Platform Architecture + +```mermaid +graph TD + %% Core Network Infrastructure + VPC[VPC Module] --> DNS[DNS Module] + VPC --> SUBNETS[Subnet Configuration] + SUBNETS --> PRIVATE[Private Subnets] + SUBNETS --> PUBLIC[Public Subnets] + + %% EKS Cluster and Core Components + VPC --> EKS[EKS Cluster] + EKS --> IAM[IAM Roles Module] + EKS --> EKS_CONFIG[EKS Configuration] + 
EKS --> KARPENTER[Karpenter] + + %% Security and Access Management + EKS --> CERT_MGR[Cert Manager] + EKS --> GATEKEEPER[GoGatekeeper] + + %% Service Mesh + EKS_CONFIG --> ISTIO[Istio Service Mesh] + ISTIO --> KIALI[Kiali Dashboard] + ISTIO --> INGRESS[Service Ingress] + + %% Monitoring and Observability + EKS --> MONITORING[Monitoring Stack] + MONITORING --> PROMETHEUS[Prometheus] + MONITORING --> GRAFANA[Grafana] + MONITORING --> LOKI[Loki Log Aggregation] + MONITORING --> TEMPO[Tempo Tracing] + + %% Additional Services + EKS --> DASHBOARD[Kubernetes Dashboard] + EKS --> METRICS[Metrics Server] + EKS --> KEYCLOAK[Keycloak SSO] + + %% Infrastructure Management + TERRAGRUNT[Terragrunt] --> VPC + TERRAGRUNT --> EKS + + %% Database Layer + VPC --> RDS[RDS Database] + + %% Styling + classDef core fill:#f9f,stroke:#333,stroke-width:2px + classDef security fill:#bbf,stroke:#333,stroke-width:2px + classDef monitoring fill:#bfb,stroke:#333,stroke-width:2px + + class VPC,EKS,EKS_CONFIG core + class CERT_MGR,GATEKEEPER,IAM security + class PROMETHEUS,GRAFANA,LOKI,TEMPO monitoring +``` + +## Component Descriptions + +### Core Infrastructure +- **VPC Module**: Network foundation with public/private subnets +- **EKS Cluster**: Managed Kubernetes service +- **Karpenter**: Autoscaling node management +- **DNS Module**: Route53 DNS management + +### Security Layer +- **Cert Manager**: Certificate lifecycle management +- **GoGatekeeper**: OIDC authentication proxy for protecting service endpoints +- **IAM Roles**: AWS IAM integration + +### Service Mesh +- **Istio**: Service mesh implementation +- **Kiali**: Service mesh visualization +- **Service Ingress**: External traffic management + +### Monitoring Stack +- **Prometheus**: Metrics collection +- **Grafana**: Metrics visualization +- **Loki**: Log aggregation +- **Tempo**: Distributed tracing + +### Additional Services +- **Kubernetes Dashboard**: Cluster management UI +- **Metrics Server**: Resource metrics +- **Keycloak**: Identity management + +### Infrastructure 
Management +- **Terragrunt**: Infrastructure deployment orchestration +- **RDS**: Managed database services diff --git a/docs/DOCUMENTATION_STANDARDS.md b/docs/DOCUMENTATION_STANDARDS.md new file mode 100644 index 00000000..b00374bc --- /dev/null +++ b/docs/DOCUMENTATION_STANDARDS.md @@ -0,0 +1,56 @@ +# Documentation Standards Guide + +## README Structure +Each module must include a README.md with the following sections: + +1. Overview + - Purpose + - Key features + - Architecture diagram + +2. Prerequisites + - Required tooling + - Required permissions + - Dependencies + +3. Usage + - Basic example + - Advanced examples + - Input variables table + - Output variables table + +4. Architecture + - Component diagram + - Network flow + - Security considerations + +5. Operations + - Deployment guide + - Monitoring + - Troubleshooting + - Maintenance + +## Changelog Format +Use Commitizen convention: + +``` +feat: New feature +fix: Bug fix +docs: Documentation changes +style: Formatting changes +refactor: Code restructure without behavior change +test: Test updates +chore: Maintenance tasks +``` + +## Diagrams +- Use PlantUML for architecture diagrams +- Include source files in `docs/diagrams` +- Export PNG/SVG to `docs/images` +- Keep diagrams up to date with code changes + +## Usage Examples +- Provide basic and advanced examples +- Include realistic variable values +- Document required permissions +- Include expected outputs diff --git a/docs/INFRASTRUCTURE_STANDARDS.md b/docs/INFRASTRUCTURE_STANDARDS.md new file mode 100644 index 00000000..bdcdda6c --- /dev/null +++ b/docs/INFRASTRUCTURE_STANDARDS.md @@ -0,0 +1,75 @@ +# Infrastructure Standards + +## Node Group Configuration + +### Instance Types +```hcl +locals { + instance_types = { + general_purpose = ["m6i.xlarge", "m6a.xlarge", "m5.xlarge"] + compute_optimized = ["c6i.2xlarge", "c6a.2xlarge", "c5.2xlarge"] + memory_optimized = ["r6i.2xlarge", "r6a.2xlarge", "r5.2xlarge"] + } +} +``` + +### Node Labels +```yaml 
+labels: + node-type: [general|compute|memory] + environment: [dev|stage|prod] + workload-type: [service|batch|system] +``` + +## Auto-scaling Configuration + +### Cluster Autoscaler +```yaml +cluster-autoscaler: + scaleDownUnneededTime: 10m + scaleDownDelayAfterAdd: 10m + maxNodeProvisionTime: 15m + maxGracefulTermination: 10m +``` + +### Karpenter Settings +```yaml +provisioner: + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["spot", "on-demand"] + limits: + resources: + cpu: 1000 + memory: 1000Gi +``` + +## Storage Classes + +### Standard Classes +```yaml +storage-classes: + standard: + type: gp3 + encrypted: true + reclaimPolicy: Delete + premium: + type: io2 + iops: 5000 + encrypted: true + reclaimPolicy: Retain +``` + +## Resource Quotas + +### Default Quotas +```yaml +quotas: + default: + requests.cpu: "20" + requests.memory: 40Gi + limits.cpu: "40" + limits.memory: 80Gi + pods: "100" +``` diff --git a/docs/MODULE_DEPENDENCIES.md b/docs/MODULE_DEPENDENCIES.md new file mode 100644 index 00000000..34372650 --- /dev/null +++ b/docs/MODULE_DEPENDENCIES.md @@ -0,0 +1,45 @@ +# Module Dependencies + +## Core Infrastructure Dependencies + +```mermaid +graph TD + VPC[VPC Module] --> EKS[EKS Module] + EKS --> EKS_CONFIG[EKS Config Module] + EKS --> KARPENTER[Karpenter Module] + EKS_CONFIG --> ISTIO[Istio Module] + ISTIO --> INGRESS[Service Ingress Module] + EKS --> MONITORING[Monitoring Stack] + MONITORING --> PROMETHEUS[Prometheus Module] + MONITORING --> GRAFANA[Grafana Module] +``` + +## Module Initialization Order + +1. Network Infrastructure + - VPC Module + - DNS Module + +2. Cluster Infrastructure + - EKS Module + - IAM Roles Module + - EKS Configuration + +3. Cluster Add-ons + - Metrics Server + - Cert Manager + - Karpenter + +4. 
Observability Stack + - Prometheus + - Grafana + - Loki + - Tempo + +## Version Compatibility Matrix + +| Module | Version | Dependencies | Breaking Changes | +|--------|---------|--------------|------------------| +| EKS | v1.0.0 | AWS Provider >= 4.0 | None | +| Karpenter | v0.5.0 | EKS >= 1.0.0 | Node group naming | +| Istio | v1.2.0 | EKS >= 1.0.0 | Service mesh config | diff --git a/docs/MODULE_STANDARDS.md b/docs/MODULE_STANDARDS.md new file mode 100644 index 00000000..88699ced --- /dev/null +++ b/docs/MODULE_STANDARDS.md @@ -0,0 +1,69 @@ +# Module Standards + +## Directory Structure +``` +module/ +├── README.md +├── main.tf +├── variables.tf +├── outputs.tf +├── versions.tf +├── examples/ +│ ├── basic/ +│ └── complete/ +└── tests/ + ├── defaults/ + └── complete/ +``` + +## Naming Conventions + +### Resource Naming +```hcl +resource "aws_iam_role" "example" { + name = format("%s-%s-%s", var.prefix, var.environment, var.name) + # ... +} +``` + +### Variable Structure +```hcl +variable "cluster_config" { + type = object({ + name = string + version = string + environment = string + vpc_id = string + }) + description = "EKS cluster configuration" +} +``` + +## Version Constraints + +### Provider Versions +```hcl +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 4.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.0" + } + } + required_version = ">= 1.0" +} +``` + +## Documentation Requirements + +### README Structure +1. Overview +2. Usage +3. Inputs/Outputs +4. Examples +5. Requirements +6. 
Dependencies diff --git a/docs/OBSERVABILITY_STANDARDS.md b/docs/OBSERVABILITY_STANDARDS.md new file mode 100644 index 00000000..a6d95bbb --- /dev/null +++ b/docs/OBSERVABILITY_STANDARDS.md @@ -0,0 +1,67 @@ +# Observability Standards + +## Metrics Collection + +### Golden Signals +- Latency +- Traffic +- Errors +- Saturation + +### Standard Labels +```yaml +labels: + environment: [dev|stage|prod] + service: + team: + cost_center: +``` + +### SLO Definitions +```yaml +slos: + availability: + target: 99.9% + window: 30d + latency: + target: 95% + threshold: 500ms + window: 30d +``` + +## Logging Standards + +### Log Format +```json +{ + "timestamp": "ISO8601", + "level": "INFO|WARN|ERROR", + "service": "service_name", + "trace_id": "uuid", + "message": "log message", + "metadata": {} +} +``` + +### Retention Policy +- Hot storage: 7 days +- Warm storage: 30 days +- Cold storage: 365 days + +## Alerting Standards + +### Alert Severity Levels +- P1: Critical - Immediate action required +- P2: High - Action required within 1 hour +- P3: Medium - Action required within 24 hours +- P4: Low - Action required within 1 week + +### Alert Format +```yaml +alert: + name: AlertName + severity: P1|P2|P3|P4 + description: "Clear description of the alert" + runbook_url: "Link to runbook" + notification_channels: ["slack", "email"] +``` diff --git a/docs/SECURITY_AUDIT_CHECKLIST.md b/docs/SECURITY_AUDIT_CHECKLIST.md new file mode 100644 index 00000000..f0b1bc09 --- /dev/null +++ b/docs/SECURITY_AUDIT_CHECKLIST.md @@ -0,0 +1,43 @@ +# EKS Security Audit Checklist + +## Cluster Configuration +- [ ] EKS Control Plane Logging enabled +- [ ] Kubernetes API server endpoint private +- [ ] Secrets encryption enabled +- [ ] Latest EKS version deployed +- [ ] IRSA (IAM Roles for Service Accounts) enabled + +## Network Security +- [ ] Security groups follow least privilege +- [ ] Network policies implemented +- [ ] All ports documented and justified +- [ ] No public endpoints exposed +- [ ] VPC 
flow logs enabled + +## Authentication & Authorization +- [ ] IAM policies follow least privilege +- [ ] RBAC policies implemented +- [ ] Service account tokens auto-rotated +- [ ] AWS IAM authenticator configured +- [ ] Regular access review process + +## Data Protection +- [ ] EBS encryption enabled +- [ ] Secrets managed by AWS Secrets Manager +- [ ] etcd encryption enabled +- [ ] S3 bucket encryption enabled +- [ ] Regular key rotation configured + +## Compliance +- [ ] FIPS endpoints enabled +- [ ] Compliance tags implemented +- [ ] Regular security scans configured +- [ ] Audit logging enabled +- [ ] Compliance reports automated + +## Monitoring & Alerts +- [ ] Security event logging enabled +- [ ] Alert thresholds configured +- [ ] Incident response plan documented +- [ ] Regular security testing scheduled +- [ ] Compliance monitoring automated diff --git a/docs/SECURITY_BASELINE.md b/docs/SECURITY_BASELINE.md new file mode 100644 index 00000000..ffd32a28 --- /dev/null +++ b/docs/SECURITY_BASELINE.md @@ -0,0 +1,76 @@ +# EKS Security Baseline + +## Security Group Configuration + +### Node Group Security +```hcl +# Example security group configuration +resource "aws_security_group" "node_group" { + name_prefix = "eks-node-group" + vpc_id = var.vpc_id + + ingress { + from_port = 443 + to_port = 443 + protocol = "tcp" + security_groups = [var.cluster_security_group_id] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} +``` + +## Encryption Standards + +### Data at Rest +- EBS Volumes: AWS KMS encryption required +- Secrets: Envelope encryption with automatic key rotation +- etcd: AWS KMS encryption enabled + +### Data in Transit +- TLS 1.2+ required for all API communications +- mTLS required for service-to-service communication +- Certificate rotation every 90 days + +## Network Policies + +### Default Deny Policy +```yaml +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: default-deny 
+spec: + podSelector: {} + policyTypes: + - Ingress + - Egress +``` + +## Pod Security Standards + +### Baseline Pod Security +```yaml +apiVersion: pod-security.admission.config.k8s.io/v1 +kind: PodSecurityConfiguration +defaults: + enforce: "baseline" + enforce-version: "latest" + audit: "restricted" + audit-version: "latest" + warn: "restricted" + warn-version: "latest" +``` + +## Compliance Requirements + +### GovCloud Specific +- FIPS 140-2 endpoints enabled +- NIST 800-53 controls implemented +- Regular security assessments +- Continuous monitoring enabled diff --git a/docs/TESTING_STANDARDS.md b/docs/TESTING_STANDARDS.md new file mode 100644 index 00000000..c731eaf3 --- /dev/null +++ b/docs/TESTING_STANDARDS.md @@ -0,0 +1,107 @@ +# Testing Standards + +## Validation Testing + +### Pre-commit Hooks +```yaml +repos: +- repo: https://github.com/antonbabenko/pre-commit-terraform + rev: v1.64.0 + hooks: + - id: terraform_fmt + - id: terraform_docs + - id: terraform_tflint + - id: terraform_validate +``` + +### Static Analysis +```hcl +provider "aws" { + region = var.region + + default_tags { + tags = { + Environment = var.environment + Terraform = "true" + Project = var.project + } + } +} + +# Required variable validation +variable "environment" { + type = string + validation { + condition = contains(["dev", "stage", "prod"], var.environment) + error_message = "Environment must be dev, stage, or prod." 
+ } +} +``` + +## Integration Testing + +### Test Structure +``` +tests/ +├── integration/ +│ ├── eks_cluster/ +│ │ ├── test_cluster.tf +│ │ └── variables.tf +│ └── monitoring/ +│ ├── test_prometheus.tf +│ └── variables.tf +└── e2e/ + └── complete_setup/ + ├── main.tf + └── outputs.tf +``` + +### Example Test Case +```hcl +module "test_eks" { + source = "../../" + + cluster_name = "test-cluster" + cluster_version = "1.24" + + vpc_id = module.vpc.vpc_id + subnet_ids = module.vpc.private_subnets + + enable_logging = true +} + +output "test_cluster_status" { + value = module.test_eks.cluster_status +} +``` + +## Security Testing + +### Checkov Configuration +```yaml +checkov: + skip-check: + - CKV_AWS_79 # Ensure Instance Metadata Service Version 1 is not enabled + external-checks-dir: + - security/custom_checks +``` + +### Custom Security Checks +```python +from checkov.common.models.enums import CheckResult, CheckCategories +from checkov.terraform.checks.resource.base_resource_check import BaseResourceCheck + +class EnsureEncryption(BaseResourceCheck): + def __init__(self): + name = "Ensure encryption is enabled" + id = "CKV_CUSTOM_1" + supported_resources = ['aws_ebs_volume'] + categories = [CheckCategories.ENCRYPTION] + super().__init__(name=name, id=id, categories=categories, supported_resources=supported_resources) + + def scan_resource_conf(self, conf): + if 'encrypted' in conf.keys(): + if conf['encrypted'][0]: + return CheckResult.PASSED + return CheckResult.FAILED +``` diff --git a/docs/VERSION_CONTROL.md b/docs/VERSION_CONTROL.md new file mode 100644 index 00000000..bc433f6a --- /dev/null +++ b/docs/VERSION_CONTROL.md @@ -0,0 +1,52 @@ +# Version Control Standards + +## Semantic Versioning + +### Version Format +- MAJOR.MINOR.PATCH +- Example: 1.2.3 + +### Version Rules +1. MAJOR version - Incompatible API changes +2. MINOR version - Backwards-compatible features +3. 
PATCH version - Bug fixes + +## Release Process + +### Release Branches +``` +main +├── release/1.0.x +├── release/1.1.x +└── release/2.0.x +``` + +### Version Tags +```bash +# Release tags +v1.0.0 +v1.0.1 +v1.1.0 +v2.0.0 +``` + +## Breaking Changes + +### Documentation Format +```markdown +# Breaking Changes + +## Version 2.0.0 +- Changed: Resource naming convention +- Removed: Deprecated variables +- Required: AWS Provider >= 4.0 +``` + +## Upgrade Guidelines + +### Module Updates +1. Review breaking changes +2. Update dependencies +3. Test in non-production +4. Update documentation +5. Create migration guide diff --git a/docs/templates/MODULE_README.md b/docs/templates/MODULE_README.md new file mode 100644 index 00000000..99123315 --- /dev/null +++ b/docs/templates/MODULE_README.md @@ -0,0 +1,71 @@ +# Module Name + +## Overview +Brief description of the module's purpose and functionality. + +## Prerequisites +* Required tools and versions +* Required permissions +* Dependencies + +## Usage + +### Basic Example +```hcl +module "example" { + source = "path/to/module" + + // Required variables + environment = "production" + region = "us-west-2" +} +``` + +### Advanced Example +```hcl +module "example" { + source = "path/to/module" + + // Detailed configuration + environment = "production" + region = "us-west-2" + high_availability = true + backup_retention = 30 +} +``` + +## Architecture +[Insert architecture diagram] + +### Components +* Component 1 - Description +* Component 2 - Description + +### Network Flow +[Insert network flow diagram] + +## Operations + +### Deployment +Step-by-step deployment instructions + +### Monitoring +Key metrics and monitoring guidelines + +### Troubleshooting +Common issues and solutions + +## Input Variables +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| var1 | Description | type | default | yes/no | + +## Outputs +| Name | Description | +|------|-------------| +| out1 | 
Description | + +## Security Considerations +* Security group configurations +* IAM permissions +* Encryption settings diff --git a/lab/_envcommon/default-versions.hcl b/lab/_envcommon/default-versions.hcl index c2e4f946..9584c945 100644 --- a/lab/_envcommon/default-versions.hcl +++ b/lab/_envcommon/default-versions.hcl @@ -46,13 +46,6 @@ locals { telemetry_namespace = "telemetry" # kubectl_image_tag = "1.30.4" - ################ - # k8s-dashboard - ################ - dashboard_hostname = "k8s-dashboard" - # k8s_dashboard_metrics_scraper = "1.0.8" - k8s_dashboard_version = "6.0.6" - ################ # Cert-Manager ################ @@ -80,12 +73,26 @@ locals { grafana_tag = "11.4.0" init_chown_data_image_tag = "1.31.1" + ################ + # k8s-dashboard + ################ + dashboard_hostname = "k8s-dashboard" + k8s_dashboard_version = "6.0.6" + ################ # Karpenter ################ karpenter_helm_chart = "1.1.1" karpenter_tag = "1.1.1" + ################ + # keycloak + ################ + keycloak_app_version = "v26.1.2" + keycloak_chart_version = "24.4.10" + keycloak_hostname = "keycloak" + keycloak_namespace = "keycloak" + ################ # Kiali ################ diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-k8s-dashboard/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-k8s-dashboard/terragrunt.hcl index 05fdb934..6b553503 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-k8s-dashboard/terragrunt.hcl +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-k8s-dashboard/terragrunt.hcl @@ -41,5 +41,5 @@ inputs = { # Dashboard Configuration k8s_dashboard_version = include.root.inputs.k8s_dashboard_version - namespace = include.root.inputs.dashboard_hostname + namespace = include.root.inputs.dashboard_hostname } diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-keycloak/terragrunt.hcl 
b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-keycloak/terragrunt.hcl new file mode 100644 index 00000000..fbc810b8 --- /dev/null +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-keycloak/terragrunt.hcl @@ -0,0 +1,46 @@ +include "root" { + path = find_in_parent_folders("root.hcl") + merge_strategy = "deep" + expose = true +} + +terraform { + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-keycloak.git?ref=standards" + extra_arguments "retry_lock" { + commands = get_terraform_commands_that_need_locking() + arguments = ["-lock-timeout=20m"] + } +} + +dependency "eks" { + config_path = "../eks" + mock_outputs = { + cluster_name = "a-cluster-name" + vpc_id = "vpc-12345678" + database_subnet_ids = ["subnet-1", "subnet-2"] + } +} + +inputs = { + profile = include.root.inputs.aws_profile + region = include.root.inputs.aws_region + environment = include.root.inputs.environment + cluster_name = dependency.eks.outputs.cluster_name + cluster_domain = include.root.inputs.vpc_domain_name + keycloak_version = include.root.inputs.keycloak_chart_version + keycloak_tag = include.root.inputs.keycloak_app_version + keycloak_hostname = include.root.inputs.keycloak_hostname + namespace = include.root.inputs.keycloak_namespace + admin_email = include.root.inputs.creator + database_subnet_ids = dependency.eks.outputs.database_subnet_ids + + # Database configuration + db_engine = "aurora-postgresql" + db_instance_type = "db.t4g.medium" + db_name = "keycloak" + db_user = "keycloak" + + # Project information + project_name = include.root.inputs.project_name + tags = include.root.inputs.tags +} diff --git a/monitoring/grafana-dashboards.json b/monitoring/grafana-dashboards.json new file mode 100644 index 00000000..01d36852 --- /dev/null +++ b/monitoring/grafana-dashboards.json @@ -0,0 +1,44 @@ +{ + "dashboards": [ + { + "name": "Cluster Overview", + "panels": [ + { + "title": "Node CPU Usage", + "type": "graph", + "targets": [ + { + "expr": 
"cluster:node_cpu:ratio_rate5m", + "legendFormat": "{{node}}" + } + ] + }, + { + "title": "Pod Resource Usage", + "type": "graph", + "targets": [ + { + "expr": "cluster:pod_cpu:usage_rate5m", + "legendFormat": "{{pod}}" + } + ] + } + ] + }, + { + "name": "Service SLOs", + "panels": [ + { + "title": "Request Latency", + "type": "graph", + "targets": [ + { + "expr": "http_request_duration_seconds:99percentile", + "legendFormat": "{{service}}" + } + ] + } + ] + } + ] +} diff --git a/monitoring/prometheus-rules.yaml b/monitoring/prometheus-rules.yaml new file mode 100644 index 00000000..fa63c5ee --- /dev/null +++ b/monitoring/prometheus-rules.yaml @@ -0,0 +1,39 @@ +groups: +- name: kubernetes.rules + rules: + - record: cluster:node_cpu:ratio_rate5m + expr: sum(rate(node_cpu_seconds_total{mode!="idle"}[5m])) by (node) / count(node_cpu_seconds_total{mode="idle"}) by (node) + + - alert: NodeCPUUsage + expr: cluster:node_cpu:ratio_rate5m > 0.8 + for: 10m + labels: + severity: warning + annotations: + description: "CPU usage on {{ $labels.node }} is above 80%" + +- name: kubernetes.pod.rules + rules: + - record: cluster:pod_cpu:usage_rate5m + expr: sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, namespace) + + - alert: PodCPUThrottling + expr: rate(container_cpu_cfs_throttled_seconds_total[5m]) > 0 + for: 15m + labels: + severity: warning + annotations: + description: "Pod {{ $labels.pod }} in {{ $labels.namespace }} is being throttled" + +- name: application.slos + rules: + - record: http_request_duration_seconds:99percentile + expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service)) + + - alert: HighLatency + expr: http_request_duration_seconds:99percentile > 0.5 + for: 5m + labels: + severity: critical + annotations: + description: "Service {{ $labels.service }} is experiencing high latency" diff --git a/plan.md b/plan.md new file mode 100644 index 00000000..bd058fd3 --- /dev/null +++ b/plan.md @@ -0,0 
+1,271 @@ +Project Plan: EKS Infrastructure Codebase Improvements +1. Documentation Standardization + - Create centralized documentation standards guide + - Implement standardized README structure across all modules: + * Overview and purpose + * Prerequisites and dependencies + * Usage examples with variables + * Architecture diagrams + * Operations guide + - Establish changelog format using Commitizen convention + - Create architecture diagrams: + * High-level system architecture + * Module relationships + * Network flow diagrams + * Security group configurations + - Develop consistent module examples: + * Basic usage patterns + * Advanced configurations + * Migration guides + * Troubleshooting guides + - Implementation timeline: + * Week 1: Standards guide creation + * Week 2-3: README updates + * Week 4: Diagram creation + * Week 5: Example development + * Week 6: Review and refinement + +2. Security Enhancements + - EKS Security Group Configurations: + * Implement least-privilege access rules + * Restrict node group communication + * Define approved ingress/egress patterns + * Document security group dependencies + + - AWS GovCloud Security Implementation: + * Enable FIPS 140-2 compliant endpoints + * Implement NIST 800-53 controls + * Configure AWS KMS for all sensitive data + * Enable AWS Organizations SCPs + + - Encryption Configurations: + * Enable envelope encryption for secrets + * Implement at-rest encryption for EBS volumes + * Configure TLS for all service communications + * Rotate encryption keys automatically + + - Network Security Policies: + * Define default deny policies + * Create application-specific network policies + * Implement pod security policies + * Configure service mesh security + + - Implementation Timeline: + * Week 1: Security audit and gap analysis + * Week 2: Security group updates + * Week 3: Encryption improvements + * Week 4: Network policy implementation + * Week 5: Testing and validation + * Week 6: Documentation and training 
+ +3. Observability Improvements + - Prometheus Configuration Standardization: + * Define standard metric collection rules + * Implement consistent recording rules + * Set up unified alerting rules + * Configure HA architecture + + - Metrics Collection Strategy: + * Define golden signals metrics + * Implement custom metric collectors + * Set up SLO/SLI tracking + * Configure cost metrics collection + + - Logging Framework: + * Implement structured logging + * Configure log aggregation + * Set up log retention policies + * Enable audit logging + + - Grafana Dashboards: + * Create cluster health dashboards + * Implement cost monitoring views + * Set up performance dashboards + * Configure security monitoring panels + + - Implementation Timeline: + * Week 1: Metrics standardization + * Week 2: Logging implementation + * Week 3: Dashboard creation + * Week 4: Alert configuration + * Week 5: Testing and validation + * Week 6: Documentation and training + +4. Infrastructure Optimization + - Node Group Configuration: + * Implement right-sized instance types + * Configure optimal scaling thresholds + * Set up mixed-instance policies + * Define node taints and labels + + - Auto-scaling Strategy: + * Configure Cluster Autoscaler settings + * Implement Karpenter provisioners + * Set up pod disruption budgets + * Define scaling policies + + - Storage Optimization: + * Define storage class specifications + * Implement volume encryption + * Configure backup policies + * Set up snapshot schedules + + - Resource Management: + * Implement namespace quotas + * Define limit ranges + * Configure resource requests/limits + * Set up cost allocation tags + + - Implementation Timeline: + * Week 1: Node group optimization + * Week 2: Auto-scaling implementation + * Week 3: Storage configuration + * Week 4: Resource quotas setup + * Week 5: Testing and validation + * Week 6: Documentation and training + +5. 
Module Organization + - Module Standardization: + * Create consistent module structure + * Implement standard naming conventions + * Define input/output patterns + * Establish version constraints + + - Variable Management: + * Create shared variable definitions + * Implement variable validation rules + * Define default value standards + * Document variable dependencies + + - Version Control: + * Implement semantic versioning + * Create version compatibility matrix + * Define upgrade paths + * Document breaking changes + + - Dependencies: + * Map module relationships + * Document cross-module dependencies + * Define initialization order + * Create dependency graphs + + - Implementation Timeline: + * Week 1: Module structure standardization + * Week 2: Variable management + * Week 3: Version control implementation + * Week 4: Dependency documentation + * Week 5: Testing and validation + * Week 6: Documentation and training + +6. Testing Framework + - Terraform Validation: + * Implement pre-commit hooks + * Configure format checking + * Add variable validation + * Set up static analysis + + - Integration Testing: + * Create test environments + * Implement end-to-end tests + * Configure smoke tests + * Set up regression testing + + - Security Testing: + * Implement security scanners + * Configure compliance checks + * Add vulnerability scanning + * Set up secret detection + + - Test Automation: + * Configure CI/CD pipelines + * Implement test reporting + * Set up coverage tracking + * Create automated rollbacks + + - Implementation Timeline: + * Week 1: Validation framework setup + * Week 2: Integration test development + * Week 3: Security scanning implementation + * Week 4: Automation configuration + * Week 5: Testing and validation + * Week 6: Documentation and training + +Implementation Priority: + - Security Enhancements (Critical) + - Observability Improvements (High) + - Infrastructure Optimization (High) + - Documentation Standardization (Medium) + - Module 
Organization (Medium) + - Testing Framework (Medium) + +Key Metrics: + - Security compliance score + - Resource utilization efficiency + - Documentation coverage + - Test coverage + - Code duplication reduction + - Deployment success rate + +Next Steps: + +1. Security Audit (Week 1-2) + - Perform comprehensive security assessment + * Review IAM roles and permissions + * Audit security group configurations + * Analyze network policies + * Review encryption settings + - Generate security findings report + - Prioritize security improvements + - Create remediation timeline + +2. Implementation Planning (Week 2-3) + - Create detailed project timeline + * Break down tasks by module + * Identify dependencies + * Assign ownership + * Set milestones + - Establish success criteria + - Define rollback procedures + - Create risk mitigation strategies + +3. Testing Pipeline Setup (Week 3-4) + - Configure CI/CD infrastructure + * Set up test environments + * Implement automated testing + * Configure quality gates + * Enable security scanning + - Create test data sets + - Develop test scenarios + - Implement monitoring for test environments + +4. Documentation Enhancement (Week 4-5) + - Audit existing documentation + - Create documentation templates + - Update README files + - Generate architecture diagrams + - Create operational runbooks + - Document emergency procedures + +5. Module Consolidation (Week 5-6) + - Analyze current module structure + - Identify consolidation opportunities + - Create module dependency map + - Plan refactoring phases + - Document migration steps + - Create validation checklist + +6. Validation and Review (Week 6-7) + - Conduct peer reviews + - Perform security validation + - Test documentation accuracy + - Validate monitoring setup + - Review automation effectiveness + - Gather stakeholder feedback + +7. 
Training and Handover (Week 7-8) + - Prepare training materials + - Schedule training sessions + - Document operational procedures + - Create troubleshooting guides + - Set up support channels + - Plan knowledge transfer sessions diff --git a/tests/terraform.tftest.hcl b/tests/terraform.tftest.hcl new file mode 100644 index 00000000..7dfcc8e8 --- /dev/null +++ b/tests/terraform.tftest.hcl @@ -0,0 +1,40 @@ +variables { + cluster_name = "test-cluster" + cluster_version = "1.24" + vpc_id = "vpc-12345678" + subnet_ids = ["subnet-1", "subnet-2"] + region = "us-gov-east-1" + environment = "test" +} + +run "cluster_creation" { + command = plan + + assert { + condition = length(aws_eks_cluster.main) > 0 + error_message = "EKS cluster was not created" + } + + assert { + condition = aws_eks_cluster.main.encryption_config[0].provider[0].key_arn != null + error_message = "EKS cluster encryption is not configured" + } +} + +run "node_groups" { + command = plan + + assert { + condition = length(aws_eks_node_group.main) > 0 + error_message = "Node groups were not created" + } +} + +run "security_groups" { + command = plan + + assert { + condition = length(aws_security_group_rule.cluster) > 0 + error_message = "Security group rules were not created" + } +}