From b8a5c256b131d65b79710fcfa499dba382289ede Mon Sep 17 00:00:00 2001 From: Matthew Creal Morgan Date: Mon, 17 Mar 2025 14:05:02 -0700 Subject: [PATCH] Helm Refactor + autoscaling (#11) * autoscaling * use bitnami image: * lower requests * refactor chart values * cleanup * kill the canary * remove canary * unified config * fix values * template error * remove extras * add both storage configs * add some back * add back required images * add cleanup_on_fail * add replace true * add path_prefix * give more mem to backend and write * sort * update requests resources * set timeout to 10m * more values * more values * shorter timeout again * fix config parsing * fix schema * add s3 prefix * no thanos * more testing * delete store fix * fix volumes * update s3 * guess * less is more * increase requests --- .github/workflows/terragrunt-cicd.yml | 101 +++++++++++++++ README.md | 8 +- copy_images.tf | 28 ++-- main.tf | 180 ++++++-------------------- s3.tf | 2 +- values/loki.yaml | 104 --------------- values/loki.yml.tpl | 167 ++++++++++++++++++++++++ variables.tf | 11 -- 8 files changed, 316 insertions(+), 285 deletions(-) create mode 100644 .github/workflows/terragrunt-cicd.yml delete mode 100644 values/loki.yaml create mode 100644 values/loki.yml.tpl diff --git a/.github/workflows/terragrunt-cicd.yml b/.github/workflows/terragrunt-cicd.yml new file mode 100644 index 0000000..a78523e --- /dev/null +++ b/.github/workflows/terragrunt-cicd.yml @@ -0,0 +1,101 @@ +name: 'Terraform Module CI' + +on: + push: + branches: + - main + paths: + - '**/*.hcl' + - '**/*.tf' + pull_request: + branches: + - main + paths: + - '**/*.hcl' + - '**/*.tf' + +permissions: + contents: read + pull-requests: write + +jobs: + validate: + name: 'Validate Module' + runs-on: self-hosted + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Setup Terraform + uses: hashicorp/setup-terraform@v2 + with: + terraform_version: 1.5.0 + + - name: Terraform Init + run: | + terraform init 
-backend=false + + - name: Terraform Format + run: | + terraform fmt -check + + - name: Terraform Validate + run: | + terraform validate + + - name: Setup tflint + uses: terraform-linters/setup-tflint@v3 + if: github.event_name == 'pull_request' + + - name: Lint Terraform + if: github.event_name == 'pull_request' + run: | + tflint --format compact + + release: + name: 'Create Release' + needs: validate + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + runs-on: self-hosted + permissions: + contents: write + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install Commitizen + run: | + pip install commitizen + + - name: Configure Git + run: | + git config --local user.email "action@github.com" + git config --local user.name "GitHub Action" + + - name: Bump Version and Generate Changelog + id: cz + run: | + cz bump --yes + echo "new_version=$(cz version --project)" >> "$GITHUB_OUTPUT" + cz changelog --dry-run > "$RUNNER_TEMP/changelog.md" # multi-line text: hand over via file, not a name=value GITHUB_OUTPUT line + + - name: Create Release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: v${{ steps.cz.outputs.new_version }} + release_name: Release v${{ steps.cz.outputs.new_version }} + draft: false + prerelease: false + body_path: ${{ runner.temp }}/changelog.md diff --git a/README.md b/README.md index e02ee99..efb515c 100644 --- a/README.md +++ b/README.md @@ -25,16 +25,15 @@ to loki. 
| Name | Version | |------|---------| -| [aws](#provider\_aws) | 5.87.0 | +| [aws](#provider\_aws) | 5.89.0 | | [helm](#provider\_helm) | 2.17.0 | -| [kubernetes](#provider\_kubernetes) | 2.35.1 | ## Modules | Name | Source | Version | |------|--------|---------| | [images](#module\_images) | git@github.e.it.census.gov:terraform-modules/aws-ecr-copy-images.git/ | tf-upgrade | -| [loki\_irsa\_role](#module\_loki\_irsa\_role) | git@github.e.it.census.gov:SCT-Engineering/tfmod-custom-iam-role-for-service-account-eks.git | n/a | +| [loki\_irsa\_role](#module\_loki\_irsa\_role) | git@github.e.it.census.gov:SCT-Engineering/tfmod-custom-iam-role-for-service-account-eks.git// | main | | [loki\_s3](#module\_loki\_s3) | git@github.e.it.census.gov:terraform-modules/aws-s3.git//standard | tf-upgrade | ## Resources @@ -42,7 +41,6 @@ to loki. | Name | Type | |------|------| | [helm_release.loki](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource | -| [kubernetes_namespace.ns](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace) | resource | | [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_s3_bucket.s3_server_access_logs](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/s3_bucket) | data source | @@ -50,7 +48,6 @@ to loki. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [canary\_tag](#input\_canary\_tag) | The tag of the grafana/loki-canary image to use. | `string` | `"3.0.0"` | no | | [cluster\_name](#input\_cluster\_name) | EKS cluster name name component used through out the EKS cluster describing its purpose (ex: dice-dev) | `string` | n/a | yes | | [enterprise\_logs\_provisioner\_tag](#input\_enterprise\_logs\_provisioner\_tag) | The version of the grafana/enterprise-logs-provisioner image to use. 
| `string` | `"v1.7.0"` | no | | [exporter\_tag](#input\_exporter\_tag) | The version of prom/memcached-exporter to use for the gateway. | `string` | `"v0.14.4"` | no | @@ -64,7 +61,6 @@ to loki. | [region](#input\_region) | The region holding these resources (for the s3 bucket.) | `string` | n/a | yes | | [rwo\_storage\_class](#input\_rwo\_storage\_class) | Specify the storage class for read/write/once persistent volumes. | `string` | `"gp3-encrypted"` | no | | [sidecar\_tag](#input\_sidecar\_tag) | The version of kiwigrid/k8s-sidecar to use for the gateway. | `string` | `"1.27.4"` | no | -| [tag\_costallocation](#input\_tag\_costallocation) | Tag CostAllocation (default) | `string` | `"csvd:infrastructure"` | no | | [tags](#input\_tags) | Additional tags to add to resources created in AWS (s3 bucket, ...) | `map(string)` | `{}` | no | ## Outputs diff --git a/copy_images.tf b/copy_images.tf index fa07696..b1547be 100644 --- a/copy_images.tf +++ b/copy_images.tf @@ -1,37 +1,27 @@ locals { + exporter_key = format("%v#%v", "prom/memcached-exporter", var.exporter_tag) + gateway_key = format("%v#%v", "grafana/nginx-unprivileged", var.gateway_tag) loki_key = format("%v#%v", "grafana/loki", var.loki_tag) - canary_key = format("%v#%v", "grafana/loki-canary", var.canary_tag) + memcached_key = format("%v#%v", "memcached", var.memcached_tag) provisioner_key = format("%v#%v", "grafana/enterprise-logs-provisioner", var.enterprise_logs_provisioner_tag) - gateway_key = format("%v#%v", "grafana/nginx-unprivileged", var.gateway_tag) sidecar_key = format("%v#%v", "kiwigrid/k8s-sidecar", var.sidecar_tag) - memcached_key = format("%v#%v", "memcached", var.memcached_tag) - exporter_key = format("%v#%v", "prom/memcached-exporter", var.exporter_tag) image_config = [ { enabled = true dest_path = null name = "grafana/loki" - source_image = "grafana/loki" - source_registry = "docker.io" + source_image = "bitnami/grafana-loki" + source_registry = "public.ecr.aws" source_tag = var.loki_tag 
tag = var.loki_tag }, - { - enabled = true - dest_path = null - name = "grafana/loki-canary" - source_image = "grafana/loki-canary" - source_registry = "docker.io" - source_tag = var.canary_tag - tag = var.canary_tag - }, { enabled = true dest_path = null name = "memcached" - source_image = "memcached" - source_registry = "docker.io" + source_image = "bitnami/memcached" + source_registry = "public.ecr.aws" source_tag = var.memcached_tag tag = var.memcached_tag }, @@ -66,8 +56,8 @@ locals { enabled = true dest_path = null name = "grafana/nginx-unprivileged" - source_image = "nginxinc/nginx-unprivileged" - source_registry = "docker.io" + source_image = "nginx/nginx-unprivileged" + source_registry = "public.ecr.aws" source_tag = var.gateway_tag tag = var.gateway_tag }, diff --git a/main.tf b/main.tf index 368566e..5be6fda 100644 --- a/main.tf +++ b/main.tf @@ -4,22 +4,9 @@ locals { gateway_internal_url = format("http://%v:%v", local.gateway_internal_hostname, local.gateway_internal_port_number) } -locals { - tags = merge({ - "boc:eks-cluster-name" = var.cluster_name - "boc:tf_module_name" = local.module_name - "boc:tf_module_version" = local.module_version - "boc:created_by" = "terraform" - CostAllocation = var.tag_costallocation - }, var.tags) - -} - module "loki_irsa_role" { - # source = "git@github.it.census.gov:SOA/tfmod-custom-iam-role-for-service-account-eks.git/?ref=1.0.0" - # tflint-ignore: terraform_module_version # tflint-ignore: terraform_module_pinned_source - source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-custom-iam-role-for-service-account-eks.git" + source = "git@github.e.it.census.gov:SCT-Engineering/tfmod-custom-iam-role-for-service-account-eks.git//?ref=main" role_name = "r-${var.cluster_name}-loki" @@ -40,140 +27,45 @@ module "loki_irsa_role" { namespace_service_accounts = ["${var.namespace}:loki"] } } - tags = local.tags + tags = var.tags } resource "helm_release" "loki" { - chart = "loki" - version = var.loki_chart_version - name = 
"loki" - namespace = var.namespace - repository = "https://grafana.github.io/helm-charts" - wait = true + atomic = true + chart = "loki" + cleanup_on_fail = true + name = "loki" + namespace = var.namespace + replace = true + repository = "https://grafana.github.io/helm-charts" + timeout = 300 + version = var.loki_chart_version + wait = true values = [ - file("${path.module}/values/loki.yaml") + templatefile("${path.module}/values/loki.yml.tpl", { + # Image references + global_image_registry = module.images.images[local.loki_key].dest_registry + loki_image_registry = module.images.images[local.loki_key].dest_registry + loki_image_repository = module.images.images[local.loki_key].dest_repository + loki_image_tag = module.images.images[local.loki_key].tag + provisioner_image_repository = split(":", module.images.images[local.provisioner_key].dest_full_path)[0] + provisioner_image_tag = module.images.images[local.provisioner_key].tag + gateway_image_repository = module.images.images[local.gateway_key].dest_repository + gateway_image_tag = module.images.images[local.gateway_key].tag + sidecar_image_repository = split(":", module.images.images[local.sidecar_key].dest_full_path)[0] + sidecar_image_tag = module.images.images[local.sidecar_key].tag + memcached_image_repository = split(":", module.images.images[local.memcached_key].dest_full_path)[0] + memcached_image_tag = module.images.images[local.memcached_key].tag + exporter_image_repository = split(":", module.images.images[local.exporter_key].dest_full_path)[0] + exporter_image_tag = module.images.images[local.exporter_key].tag + # Storage configuration + s3_bucket_name = module.loki_s3.s3_requested_bucket_name + region = var.region + # Storage classes + rwo_storage_class = var.rwo_storage_class + # IAM role + iam_role_arn = module.loki_irsa_role.iam_role_arn + }) ] - - # Dynamic values that depend on Terraform variables or computed values - set { - name = "global.image.registry" - value = 
module.images.images[local.loki_key].dest_registry - } - - set { - name = "loki.image.repository" - value = module.images.images[local.loki_key].dest_repository - } - set { - name = "loki.image.tag" - value = module.images.images[local.loki_key].tag - } - - set { - name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" - value = module.loki_irsa_role.iam_role_arn - } - - # Storage-related dynamic configurations - set { - name = "loki.storage.bucketNames.chunks" - value = module.loki_s3.s3_requested_bucket_name - } - set { - name = "loki.storage.bucketNames.ruler" - value = module.loki_s3.s3_requested_bucket_name - } - set { - name = "loki.storage.bucketNames.admin" - value = module.loki_s3.s3_requested_bucket_name - } - set { - name = "loki.storage.type" - value = "s3" - } - set { - name = "loki.storage.s3.s3" - value = format("s3://%v", var.region) - } - set { - name = "loki.storage.s3.region" - value = var.region - } - set { - name = "loki.storage_config.aws.s3" - value = format("s3://%v/%v", - var.region, - module.loki_s3.s3_requested_bucket_name - ) - } - - # Storage class configurations - set { - name = "write.persistence.storageClass" - value = var.rwo_storage_class - } - set { - name = "backend.persistence.storageClass" - value = var.rwo_storage_class - } - set { - name = "read.persistence.storageClass" - value = var.rwo_storage_class - } - - # Image configurations for additional components - set { - name = "loki.provisioner.image.repository" - value = split(":", module.images.images[local.provisioner_key].dest_full_path)[0] - } - set { - name = "loki.provisioner.image.tag" - value = module.images.images[local.provisioner_key].tag - } - - set { - name = "gateway.image.repository" - value = module.images.images[local.gateway_key].dest_repository - } - set { - name = "gateway.image.tag" - value = module.images.images[local.gateway_key].tag - } - - set { - name = "lokiCanary.image.repository" - value = 
module.images.images[local.canary_key].dest_repository - } - set { - name = "lokiCanary.image.tag" - value = module.images.images[local.canary_key].tag - } - - set { - name = "sidecar.image.repository" - value = split(":", module.images.images[local.sidecar_key].dest_full_path)[0] - } - set { - name = "sidecar.image.tag" - value = module.images.images[local.sidecar_key].tag - } - - set { - name = "memcached.image.repository" - value = split(":", module.images.images[local.memcached_key].dest_full_path)[0] - } - set { - name = "memcached.image.tag" - value = module.images.images[local.memcached_key].tag - } - - set { - name = "memcachedExporter.image.repository" - value = split(":", module.images.images[local.exporter_key].dest_full_path)[0] - } - set { - name = "memcachedExporter.image.tag" - value = module.images.images[local.exporter_key].tag - } } diff --git a/s3.tf b/s3.tf index cac37c6..dd5a704 100644 --- a/s3.tf +++ b/s3.tf @@ -15,5 +15,5 @@ module "loki_s3" { bucket_name = format("%v-loki", var.cluster_name) access_log_bucket = data.aws_s3_bucket.s3_server_access_logs.id - tags = local.tags + tags = var.tags } diff --git a/values/loki.yaml b/values/loki.yaml deleted file mode 100644 index ab5e963..0000000 --- a/values/loki.yaml +++ /dev/null @@ -1,104 +0,0 @@ ---- -loki: - auth_enabled: false - analytics: - reporting_enabled: true - - schemaConfig: - configs: - - from: 2024-04-01 - index: - period: 24h - prefix: loki_sb_index_ - object_store: s3 - schema: v13 - store: tsdb - - limits_config: - ingestion_rate_strategy: local - max_global_streams_per_user: 5000 - max_query_parallelism: 32 - max_streams_per_user: 10000 - -write: - persistence: - enabled: true - autoscaling: - enabled: true - resources: - requests: - cpu: 100m - memory: 128Mi - extraVolumesMounts: - - name: data - mountPath: /loki - extraVolumes: - - name: loki - -read: - persistence: - enabled: true - autoscaling: - enabled: true - minReplicas: 1 - resources: - requests: - cpu: 100m - memory: 
128Mi - -backend: - autoscaling: - enabled: true - resources: - requests: - cpu: 100m - memory: 128Mi - -gateway: - resources: - requests: - cpu: 50m - memory: 64Mi - -compactor: - working_directory: /loki/compactor - shared_store: s3 - compaction_interval: 10m - retention_enabled: true - retention_delete_delay: 2h - retention_delete_worker_count: 150 - resources: - requests: - cpu: 100m - memory: 128Mi - -sidecar: - resources: - requests: - cpu: 500m - memory: 512Mi - -ruler: - resources: - requests: - cpu: 500m - memory: 512Mi - -monitoring: - dashboards: - enabled: false - rules: - enabled: false - serviceMonitor: - enabled: false - selfMonitoring: - enabled: false - lokiCanary: - enabled: false - -memberlist: - service: - publishNotReadyAddresses: false - -test: - enabled: false diff --git a/values/loki.yml.tpl b/values/loki.yml.tpl new file mode 100644 index 0000000..2a89401 --- /dev/null +++ b/values/loki.yml.tpl @@ -0,0 +1,167 @@ +--- +deploymentMode: SimpleScalable + +# Global settings +global: + image: + registry: ${loki_image_registry} + +# Main Loki configuration +loki: + auth_enabled: false + image: + repository: ${loki_image_repository} + tag: ${loki_image_tag} + ingester: + chunk_encoding: snappy + limits_config: + allow_structured_metadata: true + ingestion_rate_strategy: local + max_query_length: 2160h + max_query_parallelism: 32 + max_streams_per_user: 1000 + query_timeout: 300s + retention_period: 2160h + # Provisioner settings + provisioner: + image: + repository: ${provisioner_image_repository} + tag: ${provisioner_image_tag} + querier: + max_concurrent: 4 + replication_factor: 1 + schemaConfig: + configs: + - from: "2024-04-01" + store: tsdb + object_store: s3 + schema: v13 + index: + prefix: index_ + period: 24h + # Storage configuration + storage: + type: s3 + bucketNames: + admin: ${s3_bucket_name} + chunks: ${s3_bucket_name} + ruler: ${s3_bucket_name} + s3: + s3: s3://${region} + bucketName: ${s3_bucket_name} + region: ${region} + 
s3ForcePathStyle: false + tracing: + enabled: true + + +backend: + autoscaling: + enabled: true + minReplicas: 1 + targetCPUUtilizationPercentage: 80 + persistence: + storageClass: ${rwo_storage_class} + replicas: 3 + resources: + requests: + cpu: 10m + memory: 128Mi + limits: + cpu: 1000m + memory: 512Mi +bloomCompactor: + replicas: 0 +bloomGateway: + replicas: 0 +chunksCache: + enabled: false +compactor: + replicas: 0 +distributor: + replicas: 0 +gateway: + image: + repository: ${gateway_image_repository} + tag: ${gateway_image_tag} + autoscaling: + enabled: true + targetCPUUtilizationPercentage: 80 + replicas: 1 + resources: + requests: + cpu: 10m + memory: 128Mi + limits: + cpu: 100m + memory: 128Mi +indexGateway: + replicas: 0 +ingester: + replicas: 0 +lokiCanary: + enabled: false +memcached: + enabled: false + chunk_cache: + enabled: false + results_cache: + enabled: false +memcachedExporter: + enabled: false +querier: + replicas: 0 +queryFrontend: + replicas: 0 +queryScheduler: + replicas: 0 +read: + autoscaling: + enabled: true + minReplicas: 1 + targetCPUUtilizationPercentage: 80 + replicas: 3 + resources: + requests: + cpu: 10m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi +resultsCache: + enabled: false +ruler: + enabled: false +serviceAccount: + annotations: + eks.amazonaws.com/role-arn: ${iam_role_arn} +sidecar: + image: + repository: ${sidecar_image_repository} + tag: ${sidecar_image_tag} + resources: + requests: + cpu: 10m + memory: 128Mi + limits: + cpu: 200m + memory: 256Mi +singleBinary: + replicas: 0 +test: + enabled: false +write: + persistence: + storageClass: ${rwo_storage_class} + replicas: 3 + autoscaling: + enabled: true + minReplicas: 1 + targetCPUUtilizationPercentage: 80 + resources: + requests: + cpu: 10m + memory: 128Mi + limits: + cpu: 1000m + memory: 512Mi diff --git a/variables.tf b/variables.tf index 650fac4..35009fe 100644 --- a/variables.tf +++ b/variables.tf @@ -1,8 +1,3 @@ -variable "tag_costallocation" { - description 
= "Tag CostAllocation (default)" - type = string - default = "csvd:infrastructure" -} variable "tags" { description = "Additional tags to add to resources created in AWS (s3 bucket, ...)" @@ -58,12 +53,6 @@ variable "loki_tag" { default = "3.1.1" } -variable "canary_tag" { - description = "The tag of the grafana/loki-canary image to use." - type = string - default = "3.0.0" -} - variable "enterprise_logs_provisioner_tag" { description = "The version of the grafana/enterprise-logs-provisioner image to use." type = string