From 6c45644d1a025c83b3528f383d1686d7900944c1 Mon Sep 17 00:00:00 2001 From: badra001 Date: Wed, 23 Aug 2023 15:18:07 -0400 Subject: [PATCH] setup cloudwatch-agent --- .../cloudwatch-agent/.tf-control | 20 ++ .../cloudwatch-agent/.tf-control.tfrc | 24 ++ .../cloudwatch-agent/cloudwatch-agent.tf | 129 +++++----- .../cloudwatch-agent/fluentbit.tf | 186 ++++++++++++++ .../cloudwatch-agent/fluentbit.values.yml | 229 ++++++++++++++++++ .../cloudwatch-agent/region.tf | 3 + .../templates/fluentbit.env.yml.tpl | 13 + .../variables.cloudwatch-agent.auto.tfvars | 14 ++ .../variables.cloudwatch-agent.tf | 30 ++- .../variables.fluentbit.auto.tfvars | 14 ++ .../cloudwatch-agent/variables.fluentbit.tf | 57 +++++ 11 files changed, 662 insertions(+), 57 deletions(-) create mode 100644 examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/.tf-control create mode 100644 examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/.tf-control.tfrc create mode 100644 examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/fluentbit.tf create mode 100644 examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/fluentbit.values.yml create mode 100644 examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/region.tf create mode 100644 examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/templates/fluentbit.env.yml.tpl create mode 100644 examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/variables.fluentbit.auto.tfvars create mode 100644 examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/variables.fluentbit.tf diff --git a/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/.tf-control b/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/.tf-control new file mode 100644 index 0000000..280f449 --- /dev/null +++ b/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/.tf-control @@ -0,0 +1,20 @@ +# 
.tf-control +# allows for setting a specific command to be used for tf-* commands under this git repo +# see tf-control.sh help for more info + +TFCONTROL_VERSION="1.0.5" + +TFCOMMAND="terraform_latest" +# TF_CLI_CONFIG_FILE=PATH-TO-FILE/.tf-control.tfrc +# TFARGS="" +# TFNOLOG="" +# TFNOCOLOR="" + +# use the following to force a specific version. An upgrade of an existing 0.12.31 to 1.x +# needs you to cycle through 0.13.7, 0.14.11, and then latest (0.15.5 not needed). Other +# steps in between. See https://github.e.it.census.gov/terraform/support/tree/master/docs/how-to/terraform-upgrade for details +# +#TFCOMMAND="terraform_0.12.31" +#TFCOMMAND="terraform_0.13.7" +#TFCOMMAND="terraform_0.14.11" +#TFCOMMAND="terraform_0.15.5" diff --git a/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/.tf-control.tfrc b/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/.tf-control.tfrc new file mode 100644 index 0000000..7425488 --- /dev/null +++ b/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/.tf-control.tfrc @@ -0,0 +1,24 @@ +TFCONTROL_VERSION="1.0.5" + +# https://www.terraform.io/docs/cli/config/config-file.html +plugin_cache_dir = "/data/terraform/terraform.d/plugin-cache" +#disable_checkpoint = true + +provider_installation { +# filesystem_mirror { +# path = "/apps/terraform/terraform.d/providers" +# include = [ "*/*/*" ] +# } + filesystem_mirror { + path = "/data/terraform/terraform.d/providers" + include = [ "*/*/*" ] + } +# filesystem_mirror { +# path = "/apps/terraform/terraform.d/providers" +# include = [ "external.terraform.census.gov/*/*" ] +# } + direct { + include = [ "*/*/*" ] + } +} + diff --git a/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/cloudwatch-agent.tf b/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/cloudwatch-agent.tf index ffc6001..8eeacf3 100644 --- 
a/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/cloudwatch-agent.tf +++ b/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/cloudwatch-agent.tf @@ -1,37 +1,4 @@ # https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Container-Insights-prerequisites.html -# https://registry.terraform.io/modules/bailey84j/cloudwatch-agent/kubernetes/latest -# cannot let this create the role, as it tries to attache a policy that does not exist -# we need this policy: arn:aws-us-gov:iam::aws:policy/CloudWatchAgentServerPolicy - -# need to hack the module for now -# main.tf -## container { -## name = "${var.name}-agent" -## # image = "amazon/${var.image_name}:${var.image_version}" -## image = "${var.image_name}:${var.image_version}" -# -# want to replace this with var.image_repository -# -# also want to do the same for the role, to allow a different managed role(s) besides -# the appsync one to be used - -module "cloudwatch-agent" { - source = "bailey84j/cloudwatch-agent/kubernetes" - version = "1.0.1" - - eks_cluster_name = var.cluster_name - create_namespace = false - image_name = split(":", local.cloudwatch_agent_images_output["cloudwatch-agent"].dest_full_path)[0] - image_version = local.cloudwatch_agent_images_output["cloudwatch-agent"].tag - create_iam_role = false - iam_role_arn = module.role_cloudwatch-agent.iam_role_arn - - tags = merge( - local.base_tags, - local.common_tags, - var.application_tags, - ) -} data "aws_iam_policy" "policy_cloudwatch-agent" { name = "CloudWatchAgentServerPolicy" @@ -41,7 +8,7 @@ module "role_cloudwatch-agent" { source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" role_description = "EKS IAM Role for ${var.cluster_name} for service account ${var.cloudwatch_agent_namespace}:${var.cloudwatch_agent_name}" - role_name = format("%v%v-irsa__%v", local._prefixes["eks"], var.cluster_name, var.cloudwatch_agent_name) + role_name = format("%v%v-irsa__%v", 
local._prefixes["eks-role"], var.cluster_name, var.cloudwatch_agent_name) role_policy_arns = { policy = data.aws_iam_policy.policy_cloudwatch-agent.arn @@ -65,28 +32,6 @@ module "role_cloudwatch-agent" { ) } -## module "role_cloudwatch-agent" { -## source = "git@github.e.it.census.gov:terraform-modules/aws-iam-role.git?ref=tf-upgrade" -## -## role_description = "EKS IAM Role for ${var.cluster_name} for service account ${var.cloudwatch_agent_namespace}:${var.cloudwatch_agent_name}" -## role_name = format("%v%v-irsa__%v", local._prefixes["eks"], var.cluster_name, var.cloudwatch_agent_name) -## enable_ldap_creation = false -## assume_policy_document = data.aws_iam_policy_document.assume_role_cloudwatch-agent.json -## attached_policies = [aws_iam_policy.policy_cloudwatch-agent.arn] -## -## tags = merge( -## local.base_tags, -## local.common_tags, -## var.tags, -## var.application_tags, -## { -## "eks:namespace" = var.namespace -## "eks:user" = var.name -## } -## ) -## } - - locals { cloudwatch_agent_images_output = { for k, v in module.images_cloudwatch-agent.images : v.name => v } } @@ -104,3 +49,75 @@ module "images_cloudwatch-agent" { var.application_tags, ) } + +resource "aws_cloudwatch_log_group" "cloudwatch_agent_logs" { + for_each = toset(var.cloudwatch_agent_log_names) + name = format("/aws/containerinsights/%v/%v", var.cluster_name, each.key) + retention_in_days = var.cloudwatch_agent_log_retention_days + tags = merge( + local.base_tags, + local.common_tags, + var.application_tags, + ) +} + +resource "kubernetes_namespace" "cloudwatch-agent" { + metadata { + name = var.cloudwatch_agent_namespace + } +} + +# chart +# https://github.com/aws/eks-charts/tree/master/stable/aws-cloudwatch-metrics +resource "helm_release" "cloudwatch-agent" { + chart = "aws-cloudwatch-metrics" + name = "aws-cloudwatch-metrics" + namespace = var.cloudwatch_agent_namespace + repository = var.cloudwatch_agent_charts["cloudwatch-agent"].use_remote ? 
var.cloudwatch_agent_charts["cloudwatch-agent"].repository : "${path.module}/charts" + version = var.cloudwatch_agent_charts["cloudwatch-agent"].use_remote ? var.cloudwatch_agent_charts["cloudwatch-agent"].version : null + + depends_on = [kubernetes_namespace.cloudwatch-agent,module.images_cloudwatch-agent] + set { + name = "image.repository" + value = split(":", local.cloudwatch_agent_images_output["cloudwatch-agent"].dest_full_path)[0] + } + + set { + name = "image.tag" + value = local.cloudwatch_agent_images_output["cloudwatch-agent"].tag + } + + set { + name = "clusterName" + value = var.cluster_name + } + set { + name = "serviceAccount.name" + value = var.cloudwatch_agent_name + } + set { + name = "serviceAccount.create" + value = "true" + } + set { + name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = module.role_cloudwatch-agent.iam_role_arn + } + timeout = 300 +} + +data "aws_iam_policy_document" "cloudwatch_agent_policy_extra" { + statement { + sid = "DescribeVolumes" + effect = "Allow" + actions = [ "ec2:DescribeVolumes" ] + resources = [ "*" ] + } +} + +resource "aws_iam_role_policy" "cloudwatch_agent_policy_extra" { + name = "extra" + role = module.role_cloudwatch-agent.iam_role_name + + policy = data.aws_iam_policy_document.cloudwatch_agent_policy_extra.json +} diff --git a/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/fluentbit.tf b/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/fluentbit.tf new file mode 100644 index 0000000..26e20f1 --- /dev/null +++ b/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/fluentbit.tf @@ -0,0 +1,186 @@ +# https://github.com/aws/aws-for-fluent-bit + +## % tf-aws ssm get-parameters-by-path --path /aws/service/aws-for-fluent-bit/ --query 'Parameters[*].Name'|grep 2.31.12 +## "/aws/service/aws-for-fluent-bit/2.31.12-windowsservercore", +## "/aws/service/aws-for-fluent-bit/init-2.31.12.20230629", +## 
"/aws/service/aws-for-fluent-bit/2.31.12.20230727", +## "/aws/service/aws-for-fluent-bit/2.31.12.20230629", +## "/aws/service/aws-for-fluent-bit/2.31.12", +## "/aws/service/aws-for-fluent-bit/init-2.31.12", +## "/aws/service/aws-for-fluent-bit/init-2.31.12.20230727" +## +## % tf-aws ssm get-parameter --name /aws/service/aws-for-fluent-bit/2.31.12.20230629 +## { +## "Parameter": { +## "Name": "/aws/service/aws-for-fluent-bit/2.31.12.20230629", +## "Type": "String", +## "Value": "161423150738.dkr.ecr.us-gov-west-1.amazonaws.com/aws-for-fluent-bit:2.31.12.20230629", +## "Version": 1, +## "LastModifiedDate": "2023-06-29T20:54:07.770000-04:00", +## "ARN": "arn:aws-us-gov:ssm:us-gov-west-1::parameter/aws/service/aws-for-fluent-bit/2.31.12.20230629", +## "DataType": "text" +## } +## } + + +data "aws_ssm_parameter" "fluentbit_image" { + name = format("/aws/service/aws-for-fluent-bit/%v", var.fluentbit_tag) + + lifecycle { + precondition { + condition = var.fluentbit_tag != null && var.fluentbit_tag != "" + error_message = "var.fluentbit_tag must be provided and not null or empty." 
+ } + } +} + + +module "role_fluentbit" { + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + + role_description = "EKS IAM Role for ${var.cluster_name} for service account ${var.fluentbit_namespace}:${var.fluentbit_name}" + role_name = format("%v%v-irsa__%v", local._prefixes["eks-role"], var.cluster_name, var.fluentbit_name) + + role_policy_arns = { + policy = aws_iam_policy.policy_fluentbit.arn + } + + oidc_providers = { + main = { + provider_arn = local.oidc_provider_arn + namespace_service_accounts = [format("%v:%v", var.fluentbit_namespace, var.fluentbit_name)] + } + } + + tags = merge( + local.base_tags, + local.common_tags, + var.application_tags, + { + "eks:namespace" = var.fluentbit_namespace + "eks:user" = var.fluentbit_name + } + ) +} + +resource "aws_iam_policy" "policy_fluentbit" { + name = format("%v%v-irsa__%v", local._prefixes["eks-policy"], var.cluster_name, var.fluentbit_name) + description = "EKS IAM Policy for ${var.cluster_name} for service account ${var.fluentbit_namespace}:${var.fluentbit_name}" + path = "/" + policy = data.aws_iam_policy_document.policy_fluentbit.json + + tags = merge( + local.base_tags, + local.common_tags, + var.application_tags, + { + "Name" = format("%v%v-irsa__%v", local._prefixes["eks-policy"], var.cluster_name, var.fluentbit_name) + "eks:namespace" = var.fluentbit_namespace + "eks:user" = var.fluentbit_name + } + ) +} + + +# https://aws.amazon.com/blogs/opensource/centralized-container-logging-fluent-bit/ +data "aws_iam_policy_document" "policy_fluentbit" { + statement { + sid = "AllowFirehose" + effect = "Allow" + actions = [ + "firehose:PutRecordBatch" + ] + resources = ["*"] + } + ## statement { + ## sid = "PutLogEvents" + ## effect = "Allow" + ## actions = [ + ## "logs:PutLogEvents" + ## ] + ## resources = [ format("arn:%v:logs:*:*:log-group:*:*:*",data.aws_arn.current.partition) ] + ## } + statement { + sid = "CreateStreams" + effect = "Allow" + actions = [ + 
"logs:CreateLogStream", + "logs:DescribeLogStreams", + "logs:PutLogEvents" + ] + # resources = [ format("arn:%v:logs:*:*:log-group:*",data.aws_arn.current.partition) ] + resources = [for k, v in aws_cloudwatch_log_group.fluentbit_logs : format("%v:*", v.arn)] + } + ## statement { + ## sid = "CreateLogGroup" + ## effect = "Allow" + ## actions = [ + ## "logs:CreateLogGroup" + ## ] + ## resources = [ "*" ] + ## } +} + +resource "aws_cloudwatch_log_group" "fluentbit_logs" { + for_each = toset(var.fluentbit_log_names) + name = format("/aws/containerinsights/%v/%v", var.cluster_name, each.key) + retention_in_days = var.fluentbit_log_retention_days + tags = merge( + local.base_tags, + local.common_tags, + var.application_tags, + ) +} + +## helm, reference ssm image +# https://github.com/aws/eks-charts/tree/master/stable/aws-for-fluent-bit + +resource "helm_release" "fluentbit" { + chart = "aws-for-fluent-bit" + name = var.fluentbit_name + namespace = var.fluentbit_namespace + repository = var.fluentbit_charts["fluent-bit"].use_remote ? var.fluentbit_charts["fluent-bit"].repository : "${path.module}/charts" + version = var.fluentbit_charts["fluent-bit"].use_remote ? 
var.fluentbit_charts["fluent-bit"].version : null + + values = [ + file("fluentbit.values.yml"), + templatefile("${path.root}/templates/fluentbit.env.yml.tpl",{ + region = local.region + cluster_name = var.cluster_name + }) + ] + + set { + name = "cluster.name" + value = var.cluster_name + } + set { + name = "logs.region" + value = var.region + } + set { + name = "image.repository" + value = split(":", data.aws_ssm_parameter.fluentbit_image.value)[0] + } + set { + name = "image.tag" + value = var.fluentbit_tag + } + set { + name = "cloudWatchLogs.enabled" + value = "false" + } + set { + name = "serviceAccount.name" + value = var.fluentbit_name + } + set { + name = "serviceAccount.create" + value = "true" + } + set { + name = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn" + value = module.role_fluentbit.iam_role_arn + } + timeout = 300 +} diff --git a/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/fluentbit.values.yml b/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/fluentbit.values.yml new file mode 100644 index 0000000..029164e --- /dev/null +++ b/examples/full-cluster-tf-upgrade/1.25/common-services/cloudwatch-agent/fluentbit.values.yml @@ -0,0 +1,229 @@ +# https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/ContainerInsights-use-kubelet.html +# networking needs to be enabled for the kubernetes filter. 
The chart does not enable this and has comments about enabling +hostNetwork: true +dnsPolicy: ClusterFirstWithHostNet +# disable standard input and filter +input: + enabled: false +filter: + enabled: false +# https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Container-Insights-setup-logs-FluentBit.html +# https://raw.githubusercontent.com/aws-samples/amazon-cloudwatch-container-insights/latest/k8s-deployment-manifest-templates/deployment-mode/daemonset/container-insights-monitoring/cloudwatch-namespace.yaml +# https://raw.githubusercontent.com/aws-samples/amazon-cloudwatch-container-insights/latest/k8s-deployment-manifest-templates/deployment-mode/daemonset/container-insights-monitoring/fluent-bit/fluent-bit.yaml +# takes volumes, volumeMounts, and inputs,outputs,filters, and parsers from these samples +# note there seems not to be a way to pass the labels and selector.matchLabels to this chart +# +volumeMounts: +# Please don't change below read-only permissions + - name: fluentbitstate + mountPath: /var/fluent-bit/state + - name: varlog + mountPath: /var/log + readOnly: true + - name: varlibdockercontainers + mountPath: /var/lib/docker/containers + readOnly: true + - name: runlogjournal + mountPath: /run/log/journal + readOnly: true + - name: dmesg + mountPath: /var/log/dmesg + readOnly: true +volumes: + - name: fluentbitstate + hostPath: + path: /var/fluent-bit/state + - name: varlog + hostPath: + path: /var/log + - name: varlibdockercontainers + hostPath: + path: /var/lib/docker/containers + - name: runlogjournal + hostPath: + path: /run/log/journal + - name: dmesg + hostPath: + path: /var/log/dmesg +additionalInputs: |- + [INPUT] + Name tail + Tag application.* + Exclude_Path /var/log/containers/cloudwatch-agent*, /var/log/containers/fluent-bit*, /var/log/containers/aws-node*, /var/log/containers/kube-proxy* + Path /var/log/containers/*.log + multiline.parser docker, cri + DB /var/fluent-bit/state/flb_container.db + Mem_Buf_Limit 50MB + 
Skip_Long_Lines On + Refresh_Interval 10 + Rotate_Wait 30 + storage.type filesystem + Read_from_Head ${READ_FROM_HEAD} + [INPUT] + Name tail + Tag application.* + Path /var/log/containers/fluent-bit* + multiline.parser docker, cri + DB /var/fluent-bit/state/flb_log.db + Mem_Buf_Limit 5MB + Skip_Long_Lines On + Refresh_Interval 10 + Read_from_Head ${READ_FROM_HEAD} + [INPUT] + Name tail + Tag application.* + Path /var/log/containers/cloudwatch-agent* + multiline.parser docker, cri + DB /var/fluent-bit/state/flb_cwagent.db + Mem_Buf_Limit 5MB + Skip_Long_Lines On + Refresh_Interval 10 + Read_from_Head ${READ_FROM_HEAD} + [INPUT] + Name systemd + Tag dataplane.systemd.* + Systemd_Filter _SYSTEMD_UNIT=docker.service + Systemd_Filter _SYSTEMD_UNIT=containerd.service + Systemd_Filter _SYSTEMD_UNIT=kubelet.service + DB /var/fluent-bit/state/systemd.db + Path /var/log/journal + Read_From_Tail ${READ_FROM_TAIL} + [INPUT] + Name tail + Tag dataplane.tail.* + Path /var/log/containers/aws-node*, /var/log/containers/kube-proxy* + multiline.parser docker, cri + DB /var/fluent-bit/state/flb_dataplane_tail.db + Mem_Buf_Limit 50MB + Skip_Long_Lines On + Refresh_Interval 10 + Rotate_Wait 30 + storage.type filesystem + Read_from_Head ${READ_FROM_HEAD} + [INPUT] + Name tail + Tag host.dmesg + Path /var/log/dmesg + Key message + DB /var/fluent-bit/state/flb_dmesg.db + Mem_Buf_Limit 5MB + Skip_Long_Lines On + Refresh_Interval 10 + Read_from_Head ${READ_FROM_HEAD} + [INPUT] + Name tail + Tag host.messages + Path /var/log/messages + Parser syslog + DB /var/fluent-bit/state/flb_messages.db + Mem_Buf_Limit 5MB + Skip_Long_Lines On + Refresh_Interval 10 + Read_from_Head ${READ_FROM_HEAD} + [INPUT] + Name tail + Tag host.secure + Path /var/log/secure + Parser syslog + DB /var/fluent-bit/state/flb_secure.db + Mem_Buf_Limit 5MB + Skip_Long_Lines On + Refresh_Interval 10 + Read_from_Head ${READ_FROM_HEAD} +additionalOutputs: |- + [OUTPUT] + Name cloudwatch_logs + Match application.* + region 
${AWS_REGION} + log_group_name /aws/containerinsights/${CLUSTER_NAME}/application + log_stream_prefix ${HOST_NAME}- + auto_create_group true + extra_user_agent container-insights + [OUTPUT] + Name cloudwatch_logs + Match dataplane.* + region ${AWS_REGION} + log_group_name /aws/containerinsights/${CLUSTER_NAME}/dataplane + log_stream_prefix ${HOST_NAME}- + auto_create_group true + extra_user_agent container-insights + [OUTPUT] + Name cloudwatch_logs + Match host.* + region ${AWS_REGION} + log_group_name /aws/containerinsights/${CLUSTER_NAME}/host + log_stream_prefix ${HOST_NAME}. + auto_create_group true + extra_user_agent container-insights +additionalFilters: |- + [FILTER] + Name kubernetes + Match application.* + Kube_URL https://kubernetes.default.svc:443 + Kube_Tag_Prefix application.var.log.containers. + Merge_Log On + Merge_Log_Key log_processed + K8S-Logging.Parser On + K8S-Logging.Exclude Off + Labels Off + Annotations Off + Use_Kubelet On + Kubelet_Port 10250 + Buffer_Size 0 + Kube_CA_File /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + Kube_Token_File /var/run/secrets/kubernetes.io/serviceaccount/token + [FILTER] + Name modify + Match dataplane.systemd.* + Rename _HOSTNAME hostname + Rename _SYSTEMD_UNIT systemd_unit + Rename MESSAGE message + Remove_regex ^((?!hostname|systemd_unit|message).)*$ + [FILTER] + Name aws + Match dataplane.* + imds_version v1 + [FILTER] + Name aws + Match host.* + imds_version v1 +service: + extraParsers: |- + [PARSER] + Name syslog + Format regex + Regex ^(?