We run Vault in production in HA mode, deployed on an EKS cluster with the Helm chart. Our values file is below:
```yaml
global:
  tlsDisable: false
####################
server:
  enabled: true
  extraLabels:
    vault-active: "true"
  logLevel: "debug"
  resources:
    requests:
      memory: 256Mi
      cpu: 250m
    limits:
      memory: 1Gi
      cpu: 1
  ingress:
    enabled: true
    annotations:
      cert-manager.io/issuer: letsencrypt
      kubernetes.io/tls-acme: "true"
      kubernetes.io/ingress.class: nginx
      nginx.ingress.kubernetes.io/backend-protocol: HTTPS
    paths:
      - /
    pathType: ImplementationSpecific
    hosts:
      - host: x.y.z.com
    tls:
      - secretName: vault-cert
        hosts:
          - x.y.z.com
  auditStorage:
    enabled: true
    size: 10Gi
    mountPath: "/vault/audit"
    accessMode: ReadWriteOnce
  volumes:
    - name: tls
      secret:
        secretName: vault-internal
  volumeMounts:
    - name: tls
      mountPath: "/vault/userconfig/vault-internal"
      readOnly: true
  ha:
    enabled: true
    replicas: 3
    config: |
      ui = true
      listener "tcp" {
        tls_cert_file = "/vault/userconfig/vault-internal/tls.crt"
        tls_key_file = "/vault/userconfig/vault-internal/tls.key"
        tls_client_ca_file = "/vault/userconfig/vault-internal/ca.crt"
        tls_min_version = "tls12"
        address = "[::]:8200"
        cluster_address = "[::]:8201"
      }
      storage "dynamodb" {
        ha_enabled = "true"
        region = "xyz"
        table = "xyz"
      }
      seal "awskms" {
        region = "xyz"
        kms_key_id = "xyz"
      }
      telemetry {
        prometheus_retention_time = "30s"
        disable_hostname = true
      }
  serviceAccount:
    create: true
    name: "vault"
    annotations:
      eks.amazonaws.com/role-arn: arn:aws:iam::xyz
  extraEnvironmentVars:
    VAULT_ADDR: https://127.0.0.1:8200
    VAULT_CACERT: /vault/userconfig/vault-internal/ca.crt
####################
injector:
  enabled: false
```
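One thing we are considering on the ingress side is making ingress-nginx retry the next upstream pod instead of surfacing an error when the pod it first hits is mid-failover. A minimal sketch using standard ingress-nginx annotations (the timeout and retry values are illustrative guesses, not tested recommendations):

```yaml
# Sketch only: extra ingress-nginx annotations that would be merged
# into server.ingress.annotations above. Values are placeholders.
server:
  ingress:
    annotations:
      # Retry against another upstream pod when the first attempt
      # errors out or times out (e.g. during a leadership change).
      nginx.ingress.kubernetes.io/proxy-next-upstream: "error timeout http_502 http_503"
      nginx.ingress.kubernetes.io/proxy-next-upstream-tries: "3"
      # Fail over to a healthy pod quickly instead of hanging.
      nginx.ingress.kubernetes.io/proxy-connect-timeout: "5"
      nginx.ingress.kubernetes.io/proxy-read-timeout: "30"
```

This would only paper over the failover window from the client's point of view, though; it would not explain why leadership changes in the first place.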
We are not sure why the active pod changes. All we see is that TLS errors start right after the active pod switches, and after a while everything self-heals. I would appreciate any help, as we are stuck and unsure whether we need to fine-tune anything to avoid these timeouts.
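For completeness, the only storage-side knobs we have found for the DynamoDB backend are its throughput and parallelism settings. A sketch of the storage stanza inside server.ha.config with those set explicitly (the numbers are placeholders, not recommendations):

```hcl
# Sketch: optional DynamoDB backend tuning for the storage stanza in
# server.ha.config. All numbers are placeholders.
storage "dynamodb" {
  ha_enabled     = "true"
  region         = "xyz"
  table          = "xyz"
  read_capacity  = 10   # provisioned read units, used if Vault creates the table
  write_capacity = 10   # provisioned write units, used if Vault creates the table
  max_parallel   = 128  # maximum concurrent requests to DynamoDB
}
```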