Pipelines retrieving secrets from Vault time out whenever the active pod changes

We run a production Vault in HA mode, deployed on an EKS cluster.

Our values file is below:

global:
  tlsDisable: false

####################

server:
  enabled: true
  


  extraLabels:
    vault-active: "true"

  logLevel: "debug"

  resources:
    requests:
      memory: 256Mi
      cpu: 250m
    limits:
      memory: 1Gi
      cpu: 1
  
  ingress:
    enabled: true
    annotations: 
      cert-manager.io/issuer: letsencrypt
      kubernetes.io/tls-acme: "true"
      kubernetes.io/ingress.class: nginx
      nginx.ingress.kubernetes.io/backend-protocol: HTTPS
    pathType: ImplementationSpecific
    hosts:
      - host: x.y.z.com
        paths:
          - /
    tls:
      - secretName: vault-cert
        hosts:
          - x.y.z.com

  auditStorage:
    enabled: true
    size: 10Gi
    mountPath: "/vault/audit"
    accessMode: ReadWriteOnce

  volumes: 
    - name: tls
      secret:
        secretName: vault-internal

  volumeMounts: 
    - name: tls
      mountPath: "/vault/userconfig/vault-internal"
      readOnly: true
      
  ha:
    enabled: true 
    replicas: 3

    config: |
      ui = true

      listener "tcp" {
        tls_cert_file = "/vault/userconfig/vault-internal/tls.crt"
        tls_key_file = "/vault/userconfig/vault-internal/tls.key"
        tls_client_ca_file = "/vault/userconfig/vault-internal/ca.crt"
        tls_min_version = "tls12"
        address = "[::]:8200"
        cluster_address = "[::]:8201"
      }

      storage "dynamodb" {
        ha_enabled = "true"
        region     = "xyz"
        table      = "xyz"
      }

      seal "awskms" {
      region     = "xyz"
      kms_key_id = "xyz"
      }

      telemetry {
        prometheus_retention_time = "30s"
        disable_hostname = true
      }

  serviceAccount:
    create: true
    name: "vault"
    annotations: 
      eks.amazonaws.com/role-arn: arn:aws:iam::xyz
  
  extraEnvironmentVars: 
    VAULT_ADDR: https://127.0.0.1:8200
    VAULT_CACERT: /vault/userconfig/vault-internal/ca.crt

####################

injector:
  enabled: false

We are not sure why the active pod changes. All we see is that the TLS errors start right after the active pod changes, and after a while everything self-heals. I would appreciate any help, as we are stuck and not sure whether we need to fine-tune anything to avoid the timeouts.
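
To line the timeouts up against failovers, something like the sketch below can be left running while the pipelines execute; it polls Vault's unauthenticated sys/leader endpoint and timestamps leader changes (Python with requests; the address is a placeholder for the real ingress host):

import datetime
import time

import requests

VAULT_ADDR = "https://x.y.z.com"  # placeholder for the real ingress host

# Poll sys/leader (no token required) and log whenever the active node
# changes, so pipeline timeouts can be correlated with failovers.
last_leader = None
while True:
    try:
        resp = requests.get(f"{VAULT_ADDR}/v1/sys/leader", timeout=5)
        leader = resp.json().get("leader_address")
    except requests.RequestException as exc:
        leader = f"<unreachable: {exc}>"
    if leader != last_leader:
        print(f"{datetime.datetime.now().isoformat()} leader -> {leader}")
        last_leader = leader
    time.sleep(5)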

Please help if possible.

It is normal for there to be some downtime when the active node changes.
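
If the pipelines can tolerate that window, a short retry loop around the secret read usually rides it out. A minimal sketch, assuming a KV v2 mount at secret/, a token in VAULT_TOKEN, and a hypothetical secret path ci/deploy (all placeholders):

import os
import time

import requests

VAULT_ADDR = os.environ["VAULT_ADDR"]
VAULT_TOKEN = os.environ["VAULT_TOKEN"]

def read_secret(path, attempts=6, backoff=2.0):
    """Read a KV v2 secret, retrying through a brief failover window."""
    url = f"{VAULT_ADDR}/v1/secret/data/{path}"
    for attempt in range(1, attempts + 1):
        try:
            resp = requests.get(url, headers={"X-Vault-Token": VAULT_TOKEN}, timeout=10)
            if resp.status_code == 200:
                return resp.json()["data"]["data"]
            # 5xx (and occasionally 429) responses show up while the new
            # active node settles; treat them as retryable
            print(f"attempt {attempt}: HTTP {resp.status_code}, retrying")
        except requests.RequestException as exc:
            print(f"attempt {attempt}: {exc}, retrying")
        time.sleep(backoff * attempt)  # linear backoff between attempts
    raise RuntimeError(f"could not read {path} after {attempts} attempts")

creds = read_secret("ci/deploy")  # hypothetical secret path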

We have had our doubts about that, because when we hit high load and a leader change happens, the Vault pods take too much time to become stable again.

We removed the TLS configuration and we still see TLS errors:

global:
  tlsDisable: true

####################

server:
  enabled: true
  
  image:
    repository: "hashicorp/vault"


  # extraLabels:
  #   vault-active: "true"

  logLevel: "debug"

  resources:
    requests:
      memory: 5Gi
      cpu: 1
    limits:
      memory: 12Gi
      cpu: 2
  
  ingress:
    enabled: true
    annotations: 
      cert-manager.io/issuer: letsencrypt
      kubernetes.io/tls-acme: "true"
      kubernetes.io/ingress.class: nginx
      nginx.ingress.kubernetes.io/backend-protocol: HTTP
    pathType: ImplementationSpecific
    hosts:
      - host: <hidden>
        paths:
          - /
    tls:
      - secretName: vault-cert
        hosts:
          - <hidden>

  auditStorage:
    enabled: true
    size: 10Gi
    mountPath: "/vault/audit"
    accessMode: ReadWriteOnce

  volumes: 
    - name: tls
      secret:
        secretName: vault-internal

  volumeMounts: 
    - name: tls
      mountPath: "/vault/userconfig/vault-internal"
      readOnly: true
      
  ha:
    enabled: true 
    replicas: 3

    config: |
      ui = true

      listener "tcp" {
        tls_disable = 1
        # tls_cert_file = "/vault/userconfig/vault-internal/tls.crt"
        # tls_key_file = "/vault/userconfig/vault-internal/tls.key"
        # tls_client_ca_file = "/vault/userconfig/vault-internal/ca.crt"
        # tls_min_version = "tls12"
        address = "[::]:8200"
        cluster_address = "[::]:8201"
      }

      storage "dynamodb" {
        ha_enabled = "true"
        region     = "<hidden>"
        table      = "<hidden>"
        endpoint   = "<hidden>"
      }

      seal "awskms" {
      region     = "<hidden>"
      kms_key_id = "<hidden>"
      }

      telemetry {
        prometheus_retention_time = "30s"
        disable_hostname = true
      }
      
      service_registration "kubernetes" {}
      
  serviceAccount:
    create: true
    name: "vault"
    annotations: 
      eks.amazonaws.com/role-arn: <hidden>
  
  extraEnvironmentVars: 
    VAULT_ADDR: http://127.0.0.1:8200
    # VAULT_CACERT: /vault/userconfig/vault-internal/ca.crt

  readinessProbe:
    enabled: true
    path: "/v1/sys/health?standbyok=true&sealedcode=204&uninitcode=204"
    initialDelaySeconds: 120
    timeoutSeconds: 10
    scheme: http

  livenessProbe:
    enabled: true
    path: "/v1/sys/health?standbyok=true"
    # When a probe fails, Kubernetes will try failureThreshold times before giving up
    failureThreshold: 2
    # Number of seconds after the container has started before probe initiates
    initialDelaySeconds: 120
    # How often (in seconds) to perform the probe
    periodSeconds: 10
    # Minimum consecutive successes for the probe to be considered successful after having failed
    successThreshold: 1
    # Number of seconds after which the probe times out.
    timeoutSeconds: 10
    scheme: http

####################

injector:
  enabled: false
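
For reference, the probe paths above follow Vault's sys/health status-code contract: 200 for the unsealed active node, 429 for a standby (200 when standbyok=true is passed), 503 when sealed, and 501 when uninitialized; sealedcode=204 and uninitcode=204 remap the last two so the readiness probe still passes on pods that are sealed or not yet initialized. A quick sketch to print the codes against a single pod (the address is a placeholder, e.g. a port-forwarded pod):

import requests

VAULT_ADDR = "http://127.0.0.1:8200"  # placeholder, e.g. a port-forwarded vault-0

for path in (
    "/v1/sys/health",
    "/v1/sys/health?standbyok=true",
    "/v1/sys/health?standbyok=true&sealedcode=204&uninitcode=204",
):
    resp = requests.get(f"{VAULT_ADDR}{path}", timeout=5)
    print(f"{resp.status_code} <- {path}")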