Skip to content

Commit

Permalink
revert changes from testing and add notes
Browse files Browse the repository at this point in the history
  • Loading branch information
morga471 committed Feb 12, 2025
1 parent c59fbcc commit 788748d
Show file tree
Hide file tree
Showing 14 changed files with 91 additions and 13 deletions.
2 changes: 1 addition & 1 deletion lab/_envcommon/default-versions.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ locals {
custom_service_eks_account = "${local.release_version}"
eks_module_version = "20.33.1"
istio_ingress_version = "${local.release_version}"
release_version = "main"
release_version = "0.1.1" # change to main when testing updated modules

#####################
# TF Providers
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include "root" {
}

terraform {
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-cert-mgr.git?ref=no-kubectl"
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-cert-mgr.git?ref=${include.root.inputs.release_version}"
extra_arguments "retry_lock" {
commands = get_terraform_commands_that_need_locking()
arguments = ["-lock-timeout=20m"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ include "root" {
}

terraform {
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-eks-configuration.git?ref=main"
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-eks-configuration.git?ref=${include.root.inputs.release_version}"
extra_arguments "retry_lock" {
commands = get_terraform_commands_that_need_locking()
arguments = ["-lock-timeout=20m"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include "root" {
}

terraform {
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-eks-dns.git?ref=main"
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-eks-dns.git?ref=${include.root.inputs.release_version}"
extra_arguments "retry_lock" {
commands = get_terraform_commands_that_need_locking()
arguments = ["-lock-timeout=20m"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include "root" {
}

terraform {
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-grafana.git?ref=main"
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-grafana.git?ref=${include.root.inputs.release_version}"
extra_arguments "retry_lock" {
commands = get_terraform_commands_that_need_locking()
arguments = ["-lock-timeout=20m"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include "root" {
}

terraform {
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-istio.git?ref=main"
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-istio.git?ref=${include.root.inputs.release_version}"
extra_arguments "retry_lock" {
commands = get_terraform_commands_that_need_locking()
arguments = ["-lock-timeout=20m"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include "root" {
}

terraform {
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-k8s-dashboard.git?ref=main"
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-k8s-dashboard.git?ref=${include.root.inputs.release_version}"
extra_arguments "retry_lock" {
commands = get_terraform_commands_that_need_locking()
arguments = ["-lock-timeout=20m"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include "root" {
}

terraform {
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-karpenter.git?ref=no-kubectl"
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-karpenter.git?ref=${include.root.inputs.release_version}"
extra_arguments "retry_lock" {
commands = get_terraform_commands_that_need_locking()
arguments = ["-lock-timeout=20m"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include "root" {
}

terraform {
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-loki.git?ref=main"
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-loki.git?ref=${include.root.inputs.release_version}"
extra_arguments "retry_lock" {
commands = get_terraform_commands_that_need_locking()
arguments = ["-lock-timeout=20m"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include "root" {
}

terraform {
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-metrics-server.git?ref=main"
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-metrics-server.git?ref=${include.root.inputs.release_version}"
extra_arguments "retry_lock" {
commands = get_terraform_commands_that_need_locking()
arguments = ["-lock-timeout=20m"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include "root" {
}

terraform {
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-prometheus.git?ref=main"
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-prometheus.git?ref=${include.root.inputs.release_version}"
extra_arguments "retry_lock" {
commands = get_terraform_commands_that_need_locking()
arguments = ["-lock-timeout=20m"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ include "root" {
}

terraform {
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-tempo.git?ref=main"
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-tempo.git?ref=${include.root.inputs.release_version}"
extra_arguments "retry_lock" {
commands = get_terraform_commands_that_need_locking()
arguments = ["-lock-timeout=20m"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ include "root" {
}

terraform {
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-eks.git?ref=taints"
source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-eks.git?ref=${include.root.inputs.release_version}"
extra_arguments "retry_lock" {
commands = get_terraform_commands_that_need_locking()
arguments = ["-lock-timeout=20m"]
Expand Down
78 changes: 78 additions & 0 deletions notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
I really like these suggestions, but I want to help shape your suggestions with some prime directives for these tasks:
1. security is paramount. we operate in govcloud and handle titled data. security is the most important consideration.
2. cost control - this is a base cluster for a customer to build on top of for their apps. It is anticipated there will be significant time between initial provisioning and first use, so we want the cheapest possible configuration that still supports secure operations in GovCloud.
3. simplicity. ideally, I want to be able to add a single file to an existing git repository (which represents an AWS account), and have it spawn this entire cluster definition.
4. maintainability. As in, a minimum amount of effort to maintain, prioritizing future-proofing in decisions.
5. extensibility. try to keep things modular and able to be glued together as easy as possible.
6. best practices. should probably be higher in this list, but at all times, we should endeavor to follow/encourage best practices.
7. testability. we are dealing with eks clusters in aws here. by nature, these are expensive resources. anything we can do to test without creation of resources, or rapid creation and destruction, is encouraged.
8. documentation - including the 5 W's and H (who, what, when, where, why, and how)

Given those guidelines, does that change your suggestions? Should we start the code review over with those in mind?

Improvement: Consider adding validation blocks for required variables
Improvement: Add more detailed comments explaining configuration choices
Improvement: Consider tagging strategy for cost allocation
Improvement: Add lifecycle policies for node groups
Warning: Public endpoint access enabled - consider restricting CIDR ranges
Improvement: Add explicit IAM role configurations
Improvement: Implement network policies
# Add to cluster configuration
# Suggested extra security group rule: allow TCP ingress on ports 1025-65535
# where the traffic source is the node security group.
# NOTE(review): because this map is named cluster_security_group_additional_rules,
# the rule presumably attaches to the cluster (control plane) security group, so the
# "Node to node" description may be inaccurate — confirm against the EKS module docs.
cluster_security_group_additional_rules = {
ingress_nodes_ephemeral_ports = {
description = "Node to node ephemeral ports"
protocol = "tcp"
# Port range 1025-65535 (labelled "ephemeral ports" in the description above).
from_port = 1025
to_port = 65535
type = "ingress"
# Source of the traffic is the node security group rather than a CIDR range.
source_node_security_group = true
}
}
Add CloudWatch logging configuration
Implement proper metrics collection
Set up alerts for cluster health
Improvement: Add more detailed documentation
Improvement: Consider adding test environments
Add README files in each major directory
Document deployment procedures
Add troubleshooting guides
Document network architecture

# Suggested hardening for the EKS cluster resource: private-only API endpoint
# plus an encryption_config block covering the "secrets" resource type.
# This is a partial snippet; the remaining arguments are elided below.
resource "aws_eks_cluster" "main" {
# ...existing code...
vpc_config {
# Private endpoint on, public endpoint off.
endpoint_private_access = true
endpoint_public_access = false # Force private endpoint only
# NOTE(review): aws_security_group.cluster and var.private_subnet_ids are not
# defined in this snippet — confirm they exist in the target module.
security_group_ids = [aws_security_group.cluster.id]
subnet_ids = var.private_subnet_ids
}

# Encrypt the listed resource types ("secrets") using the referenced KMS key.
encryption_config {
provider {
# NOTE(review): aws_kms_key.eks is not defined in this snippet — confirm it exists.
key_arn = aws_kms_key.eks.arn
}
resources = ["secrets"]
}
}

24m Warning FailedGetResourceMetric horizontalpodautoscaler/loki-write failed to get cpu utilization: unable to get metrics for resource cpu: no metrics returned from resource metrics API
24m Warning FailedComputeMetricsReplicas horizontalpodautoscaler/loki-write invalid metrics (1 invalid out of 1), first error is: failed to get cpu resource metric value: failed to get cpu utilization: unable to get metrics for resource cpu: no metrics returned from resource metrics API
22m Warning FailedGetResourceMetric horizontalpodautoscaler/loki-write failed to get cpu utilization: did not receive metrics for targeted pods (pods might be unready)
2
29m Warning FailedGetResourceMetric horizontalpodautoscaler/istiod failed to get cpu utilization: unable to get metrics for resource cpu: unable to fetch metrics from resource metrics API: the server could not find the requested resource (get pods.metrics.k8s.io)
29m Warning FailedComputeMetricsReplicas horizontalpodautoscaler/istiod invalid metrics (1 invalid out of 1), first error is: failed to get cpu resource metric value: failed to get cpu utilization: unable to get metrics for resource cpu: unable to fetch metrics from resource metrics API: the server could not find the requested resource (get pods.metrics.k8s.io)
29m Warning FailedGetResourceMetric horizontalpodautoscaler/istiod failed to get cpu utilization: unable to get metrics for resource cpu: unable to fetch metrics from resource metrics API: the server is currently unable to handle the request (get pods.metrics.k8s.io)
29m Warning FailedComputeMetricsReplicas horizontalpodautoscaler/istiod invalid metrics (1 invalid out of 1), first error is: failed to get cpu resource metric value: failed to get cpu utilization: unable to get metrics for resource cpu: unable to fetch metrics from resource metrics API: the server is currently unable to handle the request (get pods.metrics.k8s.io)
2
* Failed to execute "terraform_current apply -lock-timeout=20m -auto-approve -input=false -auto-approve" in ./.terragrunt-cache/jrM5TqaHxjlphT8vQ1DicmFp6eM/1NbRS_ankC8AcxKegXNWAnjyQEg
│ Error: Unable to continue with install: Certificate "platform-eng-eks-mcm" in namespace "istio-system" exists and cannot be imported into the current release: invalid ownership metadata; annotation validation error: key "meta.helm.sh/release-name" must equal "grafana-grafana-ingress": current value is "k8s-dashboard-k8s-dashboard-ingress"; annotation validation error: key "meta.helm.sh/release-namespace" must equal "grafana": current value is "k8s-dashboard"
│ with module.ingress_resources.helm_release.ingress,
│ on .terraform/modules/ingress_resources/main.tf line 6, in resource "helm_release" "ingress":
│ 6: resource "helm_release" "ingress" {

exit status 1

0 comments on commit 788748d

Please sign in to comment.