Hi,
I am using EKS Fargate and have a requirement to spread pod placement across different AZs. However, this is not happening for all pods.
➜ terraform-aws-eks git:(master) ✗ kubectl get node -o wide
NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME
fargate-ip-10-124-2-169.ap-northeast-1.compute.internal Ready <none> 4m5s v1.27.7-eks-4f4795d 10.124.2.169 <none> Amazon Linux 2 5.10.201-191.748.amzn2.x86_64 containerd://1.6.6
fargate-ip-10-124-2-172.ap-northeast-1.compute.internal Ready <none> 4m10s v1.27.7-eks-4f4795d 10.124.2.172 <none> Amazon Linux 2 5.10.201-191.748.amzn2.x86_64 containerd://1.6.6
fargate-ip-10-124-2-189.ap-northeast-1.compute.internal Ready <none> 2m40s v1.27.7-eks-4f4795d 10.124.2.189 <none> Amazon Linux 2 5.10.201-191.748.amzn2.x86_64 containerd://1.6.6
fargate-ip-10-124-2-231.ap-northeast-1.compute.internal Ready <none> 2m40s v1.27.7-eks-4f4795d 10.124.2.231 <none> Amazon Linux 2 5.10.201-191.748.amzn2.x86_64 containerd://1.6.6
fargate-ip-10-124-4-16.ap-northeast-1.compute.internal Ready <none> 2m46s v1.27.7-eks-4f4795d 10.124.4.16 <none> Amazon Linux 2 5.10.201-191.748.amzn2.x86_64 containerd://1.6.6
➜ terraform-aws-eks git:(master) ✗ kubectl get po -A -o wide
NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
kube-system aws-load-balancer-controller-84d8758b8d-9gs8b 1/1 Running 0 4m54s 10.124.2.231 fargate-ip-10-124-2-231.ap-northeast-1.compute.internal <none> <none>
kube-system aws-load-balancer-controller-84d8758b8d-b2xm9 1/1 Running 0 4m54s 10.124.2.189 fargate-ip-10-124-2-189.ap-northeast-1.compute.internal <none> <none>
kube-system coredns-695c856f5f-4kp4k 1/1 Running 0 6m22s 10.124.2.172 fargate-ip-10-124-2-172.ap-northeast-1.compute.internal <none> <none>
kube-system coredns-695c856f5f-tjfh2 1/1 Running 0 6m22s 10.124.2.169 fargate-ip-10-124-2-169.ap-northeast-1.compute.internal <none> <none>
observability adot-collector-0 1/1 Running 0 5m1s 10.124.4.16 fargate-ip-10-124-4-16.ap-northeast-1.compute.internal <none> <none>
This is my code; I am using coredns as an example, and hopefully the same fix can be applied to the other add-ons. As the output above shows, both coredns replicas and both aws-load-balancer-controller replicas landed on nodes in 10.124.2.x (the same subnet, and presumably the same AZ); only adot-collector-0 was placed in 10.124.4.x.
provider "aws" {
region = local.region
}
data "aws_availability_zones" "available" {}
locals {
name = "ex-${replace(basename(path.cwd), "_", "-")}"
cluster_version = "1.27"
region = "ap-northeast-1"
vpc_id = "vpc-0980d9fa546fbfxxx"
vpc_cidr = "10.124.0.0/16"
azs = slice(data.aws_availability_zones.available.names, 0, 3)
priv_subnet_ids = ["subnet-00dc6c03ef361a3xxx","subnet-08ce13e3ac1d42xxx","subnet-0cf82687fcb285xxx"]
tags = {
Example = local.name
GithubRepo = "terraform-aws-eks"
GithubOrg = "terraform-aws-modules"
}
app_namespace = [ "abc-application" ]
}
################################################################################
# EKS Module
################################################################################
data "aws_eks_addon_version" "latest" {
for_each = toset(["kube-proxy", "vpc-cni", "coredns"])
addon_name = each.value
#kubernetes_version = data.aws_eks_cluster.eks.version
kubernetes_version = module.eks.cluster_version
most_recent = true
}
data "aws_eks_cluster_auth" "eks" {
name = module.eks.cluster_name
}
provider "kubernetes" {
host = module.eks.cluster_endpoint
cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
exec {
api_version = "client.authentication.k8s.io/v1beta1"
args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
command = "aws"
}
}
provider "helm" {
kubernetes {
host = module.eks.cluster_endpoint
cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
exec {
api_version = "client.authentication.k8s.io/v1beta1"
args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
command = "aws"
}
}
}
resource "kubernetes_namespace" "application_namespace" {
for_each = toset(local.app_namespace)
metadata {
name = each.key
}
}
module "eks" {
source = "./modules/aws-eks"
cluster_name = local.name
cluster_version = local.cluster_version
cluster_endpoint_public_access = true
vpc_id = local.vpc_id
subnet_ids = local.priv_subnet_ids
control_plane_subnet_ids = []
fargate_profile_defaults = {
iam_role_additional_policies = {
additional = aws_iam_policy.additional.arn
}
}
fargate_profiles = merge(
{
application = {
name = local.app_namespace[0]
selectors = [
{
namespace = local.app_namespace[0]
labels = {
tenant = "tcs"
environment = "uat-test"
zone = "abc-iac"
},
},
]
# Using specific subnets instead of the subnets supplied for the cluster itself
# subnet_ids = [local.priv_subnet_ids[0],local.priv_subnet_ids[1]]
subnet_ids = local.priv_subnet_ids
tags = {
Owner = "LDD"
}
additional_tags = {
ExtraTag = "Fargate"
}
timeouts = {
create = "20m"
delete = "20m"
}
}
},
{
default = {
name = "default"
selectors = [
{
namespace = "default"
labels = {
tenant = "tcs"
environment = "uat-test"
zone = "abc-iac"
},
},
]
# Using specific subnets instead of the subnets supplied for the cluster itself
subnet_ids = local.priv_subnet_ids
tags = {
Owner = "LDD"
}
additional_tags = {
ExtraTag = "Fargate"
}
timeouts = {
create = "20m"
delete = "20m"
}
}
},
{
observability = {
name = "observability"
selectors = [
{
namespace = "observability"
labels = {},
},
]
# Using specific subnets instead of the subnets supplied for the cluster itself
subnet_ids = local.priv_subnet_ids
tags = {
Owner = "LDD"
}
additional_tags = {
ExtraTag = "Fargate"
}
timeouts = {
create = "20m"
delete = "20m"
}
}
},
{
argo = {
name = "argo"
selectors = [
{
namespace = "argo-rollouts"
labels = {},
},
]
# Using specific subnets instead of the subnets supplied for the cluster itself
subnet_ids = local.priv_subnet_ids
tags = {
Owner = "LDD"
}
additional_tags = {
ExtraTag = "Fargate"
}
timeouts = {
create = "20m"
delete = "20m"
}
}
},
{
kyverno = {
name = "kyverno"
selectors = [
{
namespace = "kyverno"
labels = {},
},
]
# Using specific subnets instead of the subnets supplied for the cluster itself
subnet_ids = local.priv_subnet_ids
tags = {
Owner = "LDD"
}
additional_tags = {
ExtraTag = "Fargate"
}
timeouts = {
create = "20m"
delete = "20m"
}
}
},
{ for i in range(3) :
"kube-system-${element(split("-", local.azs[i]), 2)}" => {
selectors = [
{ namespace = "kube-system" }
]
# We want to create a profile per AZ for high availability
subnet_ids = [element(local.priv_subnet_ids, i)]
tags = {
Owner = "LDD"
}
additional_tags = {
ExtraTag = "Fargate"
}
timeouts = {
create = "20m"
delete = "20m"
}
}
}
)
tags = local.tags
}
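For clarity, my understanding is that the for expression at the end of fargate_profiles expands into one kube-system profile per AZ, each pinned to a single private subnet, roughly like this (illustrative only; tags and timeouts omitted, and the 1a/1c/1d suffixes depend on what data.aws_availability_zones.available actually returns):
{
  "kube-system-1a" = {
    selectors  = [{ namespace = "kube-system" }]
    subnet_ids = [local.priv_subnet_ids[0]]
  }
  "kube-system-1c" = {
    selectors  = [{ namespace = "kube-system" }]
    subnet_ids = [local.priv_subnet_ids[1]]
  }
  "kube-system-1d" = {
    selectors  = [{ namespace = "kube-system" }]
    subnet_ids = [local.priv_subnet_ids[2]]
  }
}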
module "eks_blueprints_kubernetes_addons" {
depends_on = [module.eks]
source = "./modules/kubernetes-addons"
eks_cluster_id = module.eks.cluster_name
## Fluentbit
app_names = [
"kube-system",
"kyverno"
]
## Application Namespace
app_namespace = "abc-application"
enable_amazon_eks_vpc_cni = true
amazon_eks_vpc_cni_config = {
addon_version = data.aws_eks_addon_version.latest["vpc-cni"].version
resolve_conflicts = "OVERWRITE"
}
enable_amazon_eks_kube_proxy = true
amazon_eks_kube_proxy_config = {
addon_version = data.aws_eks_addon_version.latest["kube-proxy"].version
resolve_conflicts = "OVERWRITE"
}
enable_amazon_eks_coredns = true
amazon_eks_coredns_config = {
addon_version = data.aws_eks_addon_version.latest["coredns"].version
resolve_conflicts = "OVERWRITE"
configuration_values = jsonencode({
replicaCount = 4
tolerations = [
{
key = "dedicated",
operator = "Equal",
effect = "NoSchedule",
value = "orchestration-seb"
}
]
topologySpreadConstraints = [
{
maxSkew = 1
topologyKey = "topology.kubernetes.io/zone"
whenUnsatisfiable = "ScheduleAnyway"
labelSelector = {
matchLabels = {
"k8s-app" = "kube-dns"
}
}
}
]
affinity = {
nodeAffinity = {
requiredDuringSchedulingIgnoredDuringExecution = {
nodeSelectorTerms = [
{
matchExpressions = [
{
key = "kubernetes.io/os"
operator = "In"
values = ["linux"]
},
{
key = "kubernetes.io/arch"
operator = "In"
values = ["amd64"]
}
]
}]
}
}
podAffinity = {
requiredDuringSchedulingIgnoredDuringExecution = [{
labelSelector = {
matchExpressions = [
{
key = "k8s-app"
operator = "NotIn"
values = ["kube-dns"]
}
]
}
topologyKey = "kubernetes.io/hostname"
}
]
}
podAntiAffinity = {
preferredDuringSchedulingIgnoredDuringExecution = [{
podAffinityTerm = {
labelSelector = {
matchExpressions = [
{
key = "k8s-app"
operator = "In"
values = ["kube-dns"]
}
]
}
topologyKey = "kubernetes.io/hostname"
}
weight = 100
}
]
requiredDuringSchedulingIgnoredDuringExecution = [{
labelSelector = {
matchExpressions = [
{
key = "k8s-app"
operator = "In"
values = ["kube-dns"]
}
]
}
topologyKey = "kubernetes.io/hostname"
}
]
}
}
resources = {
limits = {
cpu = "100m"
memory = "150Mi"
}
requests = {
cpu = "100m"
memory = "150Mi"
}
}
computeType = "Fargate"
})
}
enable_fargate_fluentbit = true
enable_aws_load_balancer_controller = true
enable_central_irsa = true
enable_fargate_adot_collector = true
enable_k8s_role-mapping = true
}
I searched around and found this link: [EKS/Fargate] [request]: Schedule evenly pod replicas across AZs · Issue #824 · aws/containers-roadmap · GitHub, but I was not sure how to label each pod from the add-ons so that it gets tied to a different Fargate profile, as proposed in Approach 2 there.
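For reference, this is roughly what I imagine the profile side of Approach 2 would look like. It is an untested sketch: the label key fargate-az is just a placeholder I made up, and the part I am missing is how to make the add-on pods carry a matching label in the first place:
# Untested sketch: one kube-system profile per AZ, each requiring a distinct pod label
fargate_profiles = {
  for i in range(3) :
  "kube-system-${element(split("-", local.azs[i]), 2)}" => {
    selectors = [
      {
        namespace = "kube-system"
        labels = {
          fargate-az = local.azs[i] # placeholder label key, not verified
        }
      }
    ]
    # Each profile only gets the subnet for its own AZ
    subnet_ids = [element(local.priv_subnet_ids, i)]
  }
}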
I also tried the topologySpreadConstraints from Amazon EKS Addons - Amazon EKS Blueprints Addons (as shown in the coredns configuration above), but the scheduler is not respecting that setting either.
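The only variation of that constraint I can think of is the stricter whenUnsatisfiable value; this is an untested sketch and I have not verified that it behaves any differently on Fargate:
topologySpreadConstraints = [
  {
    maxSkew           = 1
    topologyKey       = "topology.kubernetes.io/zone"
    whenUnsatisfiable = "DoNotSchedule" # stricter than ScheduleAnyway
    labelSelector = {
      matchLabels = {
        "k8s-app" = "kube-dns"
      }
    }
  }
]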
I also tried hardcoding a nodeSelector to pin the pods to the second AZ, but it is not taking effect:
nodeSelector = {
  "topology.kubernetes.io/zone" = "ap-northeast-1b"
}
I'd appreciate it if anyone can share how to schedule Fargate pods evenly across multiple AZs via Terraform.
Thanks.