From 8c0c742f088e42acd7d35545ab97fc6fc41792ba Mon Sep 17 00:00:00 2001 From: "Matthew C. Morgan" Date: Wed, 12 Feb 2025 15:54:55 -0500 Subject: [PATCH] revert changes from testing and add notes --- lab/_envcommon/default-versions.hcl | 2 +- .../eks-cert-manager/terragrunt.hcl | 2 +- .../eks-config/terragrunt.hcl | 2 +- .../eks-dns/terragrunt.hcl | 2 +- .../eks-grafana/terragrunt.hcl | 2 +- .../eks-istio/terragrunt.hcl | 2 +- .../eks-k8s-dashboard/terragrunt.hcl | 2 +- .../eks-karpenter/terragrunt.hcl | 2 +- .../eks-loki/terragrunt.hcl | 2 +- .../eks-metrics-server/terragrunt.hcl | 2 +- .../eks-prometheus/terragrunt.hcl | 2 +- .../eks-tempo/terragrunt.hcl | 2 +- .../platform-eng-eks-mcm/eks/terragrunt.hcl | 2 +- notes.md | 78 +++++++++++++++++++ 14 files changed, 91 insertions(+), 13 deletions(-) create mode 100644 notes.md diff --git a/lab/_envcommon/default-versions.hcl b/lab/_envcommon/default-versions.hcl index 15dbfbc..cad4a0f 100644 --- a/lab/_envcommon/default-versions.hcl +++ b/lab/_envcommon/default-versions.hcl @@ -8,7 +8,7 @@ locals { custom_service_eks_account = "${local.release_version}" eks_module_version = "20.33.1" istio_ingress_version = "${local.release_version}" - release_version = "main" + release_version = "0.1.1" # change to main when testing updated modules ##################### # TF Providers diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-cert-manager/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-cert-manager/terragrunt.hcl index 1005f35..3636c4a 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-cert-manager/terragrunt.hcl +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-cert-manager/terragrunt.hcl @@ -5,7 +5,7 @@ include "root" { } terraform { - source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-cert-mgr.git?ref=no-kubectl" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-cert-mgr.git?ref=${include.root.inputs.release_version}" 
extra_arguments "retry_lock" { commands = get_terraform_commands_that_need_locking() arguments = ["-lock-timeout=20m"] diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-config/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-config/terragrunt.hcl index 82f9b11..0b3f63c 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-config/terragrunt.hcl +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-config/terragrunt.hcl @@ -6,7 +6,7 @@ include "root" { } terraform { - source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-eks-configuration.git?ref=main" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-eks-configuration.git?ref=${include.root.inputs.release_version}" extra_arguments "retry_lock" { commands = get_terraform_commands_that_need_locking() arguments = ["-lock-timeout=20m"] diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-dns/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-dns/terragrunt.hcl index be3addc..1caebed 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-dns/terragrunt.hcl +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-dns/terragrunt.hcl @@ -5,7 +5,7 @@ include "root" { } terraform { - source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-eks-dns.git?ref=main" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-eks-dns.git?ref=${include.root.inputs.release_version}" extra_arguments "retry_lock" { commands = get_terraform_commands_that_need_locking() arguments = ["-lock-timeout=20m"] diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-grafana/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-grafana/terragrunt.hcl index d945f2e..278fffa 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-grafana/terragrunt.hcl +++ 
b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-grafana/terragrunt.hcl @@ -5,7 +5,7 @@ include "root" { } terraform { - source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-grafana.git?ref=main" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-grafana.git?ref=${include.root.inputs.release_version}" extra_arguments "retry_lock" { commands = get_terraform_commands_that_need_locking() arguments = ["-lock-timeout=20m"] diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-istio/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-istio/terragrunt.hcl index eb7b9ca..d21dc4b 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-istio/terragrunt.hcl +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-istio/terragrunt.hcl @@ -5,7 +5,7 @@ include "root" { } terraform { - source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-istio.git?ref=main" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-istio.git?ref=${include.root.inputs.release_version}" extra_arguments "retry_lock" { commands = get_terraform_commands_that_need_locking() arguments = ["-lock-timeout=20m"] diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-k8s-dashboard/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-k8s-dashboard/terragrunt.hcl index be47974..9e66315 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-k8s-dashboard/terragrunt.hcl +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-k8s-dashboard/terragrunt.hcl @@ -5,7 +5,7 @@ include "root" { } terraform { - source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-k8s-dashboard.git?ref=main" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-k8s-dashboard.git?ref=${include.root.inputs.release_version}" extra_arguments "retry_lock" { commands = get_terraform_commands_that_need_locking() arguments = ["-lock-timeout=20m"] diff --git 
a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-karpenter/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-karpenter/terragrunt.hcl index 8938179..c157bb7 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-karpenter/terragrunt.hcl +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-karpenter/terragrunt.hcl @@ -5,7 +5,7 @@ include "root" { } terraform { - source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-karpenter.git?ref=no-kubectl" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-karpenter.git?ref=${include.root.inputs.release_version}" extra_arguments "retry_lock" { commands = get_terraform_commands_that_need_locking() arguments = ["-lock-timeout=20m"] diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-loki/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-loki/terragrunt.hcl index b351429..9091966 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-loki/terragrunt.hcl +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-loki/terragrunt.hcl @@ -5,7 +5,7 @@ include "root" { } terraform { - source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-loki.git?ref=main" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-loki.git?ref=${include.root.inputs.release_version}" extra_arguments "retry_lock" { commands = get_terraform_commands_that_need_locking() arguments = ["-lock-timeout=20m"] diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-metrics-server/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-metrics-server/terragrunt.hcl index 08d7fc4..bfa29d4 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-metrics-server/terragrunt.hcl +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-metrics-server/terragrunt.hcl @@ -5,7 +5,7 @@ include "root" { } terraform { - source = 
"git@github.e.it.census.gov:SCT-Engineering/tfmod-metrics-server.git?ref=main" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-metrics-server.git?ref=${include.root.inputs.release_version}" extra_arguments "retry_lock" { commands = get_terraform_commands_that_need_locking() arguments = ["-lock-timeout=20m"] diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-prometheus/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-prometheus/terragrunt.hcl index 4c734f8..0975121 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-prometheus/terragrunt.hcl +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-prometheus/terragrunt.hcl @@ -5,7 +5,7 @@ include "root" { } terraform { - source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-prometheus.git?ref=main" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-prometheus.git?ref=${include.root.inputs.release_version}" extra_arguments "retry_lock" { commands = get_terraform_commands_that_need_locking() arguments = ["-lock-timeout=20m"] diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-tempo/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-tempo/terragrunt.hcl index f44e5c0..e05746a 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-tempo/terragrunt.hcl +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks-tempo/terragrunt.hcl @@ -5,7 +5,7 @@ include "root" { } terraform { - source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-tempo.git?ref=main" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-tempo.git?ref=${include.root.inputs.release_version}" extra_arguments "retry_lock" { commands = get_terraform_commands_that_need_locking() arguments = ["-lock-timeout=20m"] diff --git a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks/terragrunt.hcl b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks/terragrunt.hcl index 
59e9a75..5088910 100644 --- a/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks/terragrunt.hcl +++ b/lab/development/us-gov-east-1/vpc/platform-eng-eks-mcm/eks/terragrunt.hcl @@ -6,7 +6,7 @@ include "root" { } terraform { - source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-eks.git?ref=taints" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-eks.git?ref=${include.root.inputs.release_version}" extra_arguments "retry_lock" { commands = get_terraform_commands_that_need_locking() arguments = ["-lock-timeout=20m"] diff --git a/notes.md b/notes.md new file mode 100644 index 0000000..55a5ffc --- /dev/null +++ b/notes.md @@ -0,0 +1,78 @@ +I really like these suggestions, but I want to help shape your suggestions with some prime directives for these tasks: +1. security is paramount. we operate in govcloud and handle titled data. security is the most important consideration. +2. cost control - this is a base cluster for a customer to build on top of for their apps. It is anticipated there will be significant time between initial provisioning and first use. The cheapest possible configuration for secure operations in govcloud. +3. simplicity. ideally, I want to be able to add a single file to an existing git repository (which represents an aws account), and have it spawn this entire cluster definition. +4. maintainability. As in, a minimum amount of effort to maintain, prioritizing future-proofing in decisions. +5. extensibility. try to keep things modular and able to be glued together as easily as possible. +6. best practices. should probably be higher in this list, but at all times, we should endeavour to follow/encourage best practices. +7. testability. we are dealing with eks clusters in aws here. by nature, these are expensive resources. anything we can do to test without creation of resources, or rapid creation and destruction, is encouraged. +8. 
documentation - including the 5 W's (who, what, when, where, why, and how) + +Given those guidelines, does that change your suggestions? Should we start the code review over with those in mind? + +Improvement: Consider adding validation blocks for required variables +Improvement: Add more detailed comments explaining configuration choices +Improvement: Consider tagging strategy for cost allocation +Improvement: Add lifecycle policies for node groups +Warning: Public endpoint access enabled - consider restricting CIDR ranges +Improvement: Add explicit IAM role configurations +Improvement: Implement network policies +# Add to cluster configuration +cluster_security_group_additional_rules = { + ingress_nodes_ephemeral_ports = { + description = "Node to node ephemeral ports" + protocol = "tcp" + from_port = 1025 + to_port = 65535 + type = "ingress" + source_node_security_group = true + } +} +Add CloudWatch logging configuration +Implement proper metrics collection +Set up alerts for cluster health +Improvement: Add more detailed documentation +Improvement: Consider adding test environments +Add README files in each major directory +Document deployment procedures +Add troubleshooting guides +Document network architecture + +resource "aws_eks_cluster" "main" { + # ...existing code... 
+ vpc_config { + endpoint_private_access = true + endpoint_public_access = false # Force private endpoint only + security_group_ids = [aws_security_group.cluster.id] + subnet_ids = var.private_subnet_ids + } + + encryption_config { + provider { + key_arn = aws_kms_key.eks.arn + } + resources = ["secrets"] + } +} + +24m Warning FailedGetResourceMetric horizontalpodautoscaler/loki-write failed to get cpu utilization: unable to get metrics for resource cpu: no metrics returned from resource metrics API +24m Warning FailedComputeMetricsReplicas horizontalpodautoscaler/loki-write invalid metrics (1 invalid out of 1), first error is: failed to get cpu resource metric value: failed to get cpu utilization: unable to get metrics for resource cpu: no metrics returned from resource metrics API +22m Warning FailedGetResourceMetric horizontalpodautoscaler/loki-write failed to get cpu utilization: did not receive metrics for targeted pods (pods might be unready) +2 +29m Warning FailedGetResourceMetric horizontalpodautoscaler/istiod failed to get cpu utilization: unable to get metrics for resource cpu: unable to fetch metrics from resource metrics API: the server could not find the requested resource (get pods.metrics.k8s.io) +29m Warning FailedComputeMetricsReplicas horizontalpodautoscaler/istiod invalid metrics (1 invalid out of 1), first error is: failed to get cpu resource metric value: failed to get cpu utilization: unable to get metrics for resource cpu: unable to fetch metrics from resource metrics API: the server could not find the requested resource (get pods.metrics.k8s.io) +29m Warning FailedGetResourceMetric horizontalpodautoscaler/istiod failed to get cpu utilization: unable to get metrics for resource cpu: unable to fetch metrics from resource metrics API: the server is currently unable to handle the request (get pods.metrics.k8s.io) +29m Warning FailedComputeMetricsReplicas horizontalpodautoscaler/istiod invalid metrics (1 invalid out of 1), first error is: failed 
to get cpu resource metric value: failed to get cpu utilization: unable to get metrics for resource cpu: unable to fetch metrics from resource metrics API: the server is currently unable to handle the request (get pods.metrics.k8s.io) +2 +* Failed to execute "terraform_current apply -lock-timeout=20m -auto-approve -input=false -auto-approve" in ./.terragrunt-cache/jrM5TqaHxjlphT8vQ1DicmFp6eM/1NbRS_ankC8AcxKegXNWAnjyQEg + ╷ + │ Error: Unable to continue with install: Certificate "platform-eng-eks-mcm" in namespace "istio-system" exists and cannot be imported into the current release: invalid ownership metadata; annotation validation error: key "meta.helm.sh/release-name" must equal "grafana-grafana-ingress": current value is "k8s-dashboard-k8s-dashboard-ingress"; annotation validation error: key "meta.helm.sh/release-namespace" must equal "grafana": current value is "k8s-dashboard" + │ + │ with module.ingress_resources.helm_release.ingress, + │ on .terraform/modules/ingress_resources/main.tf line 6, in resource "helm_release" "ingress": + │ 6: resource "helm_release" "ingress" { + │ + ╵ + + exit status 1 + \ No newline at end of file