Envoy proxy "Permission denied: anonymous token lacks permission" with workload identity

Hello,

I’ve recently set up a new cluster with Nomad 1.7.x and Consul 1.17.x.

I am attempting to correct multiple issues that plague our current production environment, mainly:

  • mTLS for nomad and consul control plane
  • Use of workload identity for nomad jobs
  • Cluster peering for workloads spread across several AWS accounts.

I’ve managed to clean up most of my issues so far, but one remains that has bugged me for the past few days.

Whenever I set up a Nomad job making use of Consul Connect, I get the following error repeated:

DeltaAggregatedResources gRPC config stream to local_agent closed: 7, Permission denied: anonymous token lacks permission 'service:write' on "test-server". The anonymous token is used implicitly when a request does not specify a token.
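
For context, the message keeps repeating in the logs of the injected sidecar task; this is how I’m pulling it (the alloc ID is a placeholder, and the task name is Nomad’s default connect-proxy-<service> naming):

# Envoy reports the xDS error on stderr in my case
nomad alloc logs -stderr <alloc-id> connect-proxy-test-server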

My initial thought was that I needed to set a default Consul ACL token on the local Consul agent running on my node, but that made no difference, presumably because Envoy goes through the Consul gRPC interface, according to this issue.
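
Concretely, that attempt looked roughly like this in the agent’s acl block (token redacted); it changed nothing:

acl {
  enabled                  = true
  default_policy           = "deny"
  down_policy              = "extend-cache"
  enable_token_persistence = true

  tokens {
    # token with service:write on "test-server"; Envoy kept reporting the anonymous token
    default = "<redacted>"
  }
}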

I then tried to set up Nomad workload identities. I can see the service identity token being created in the Consul UI, the service is registered, and the token does carry the 'service:write' permission that the error claims is missing. None of this has stopped Envoy from complaining about the anonymous token.
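
For what it’s worth, the token in the UI carries what looks like Consul’s standard service-identity templated policy, which as far as I can tell includes exactly the permission the error says is missing:

service "test-server" {
  policy = "write"
}

service "test-server-sidecar-proxy" {
  policy = "write"
}

service_prefix "" {
  policy = "read"
}

node_prefix "" {
  policy = "read"
}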

This is my job definition:

job "test-server" {
  datacenters = ["eu-west-3a", "eu-west-3b", "eu-west-3c"]

  type = "service"

  update {
    stagger      = "30s"
    max_parallel = 2
  }

  group "app" {
    count = 1

    network {
      mode = "bridge"
      
      port "http" {
        to = 8080
      }
    }

    service {
      port = 8080
      name = "test-server"
      task = "app"
      tags = [
        "http",
        "traefik.enable=true",
        "traefik.http.routers.test.rule=PathPrefix(`/test`)",
        "traefik.http.routers.test.middlewares=test_strip_prefix",
        "traefik.http.middlewares.test_strip_prefix.stripprefix.prefixes=/test",
        "traefik.consulcatalog.connect=true"
      ]

      connect {
        sidecar_service {}

        sidecar_task {
          resources {
            cpu    = 64
            memory = 320
          }
        }
      }

      check {
        type     = "http"
        port     = "http"
        path     = "/ping"
        interval = "10s"
        timeout  = "2s"
      }
    }

    task "app" {
      driver = "docker"

      config {
        image = "XXXXXXXX.dkr.ecr.eu-west-3.amazonaws.com/test-server:latest"
        ports = ["http"]

        command = "/main"
      }

      resources {
        cpu    = 500 # MHz
        memory = 128 # MB
      }
    }
  }
}

Nomad client config

data_dir   = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"

region     = "ops"
datacenter = "eu-west-3b"

name = "i-040bc7525ff44b848"

bind_addr = "0.0.0.0"

leave_on_terminate = true

advertise {
  http = "{{ GetInterfaceIP \"ens5\" }}"
  rpc  = "{{ GetInterfaceIP \"ens5\" }}"
  serf = "{{ GetInterfaceIP \"ens5\" }}"
}

consul {
  address = "127.0.0.1:8501"
  auto_advertise = true

  grpc_address = "127.0.0.1:8503"
  grpc_ca_file = "/opt/consul/tls/consul-ca.pem"

  share_ssl = true
  ssl = true
  verify_ssl = true
  ca_file = "/opt/consul/tls/consul-ca.pem"
  key_file = "/opt/consul/tls/consul-key.pem"
  cert_file = "/opt/consul/tls/consul-cert.pem"
}

client {
  enabled = true

  node_class = "default"

  host_volume "docker-sock-ro" {
    path      = "/var/run/docker.sock"
    read_only = true
    policy    = "read"
  }

  chroot_env {
    "/bin"              = "/bin"
    "/lib"              = "/lib"
    "/lib64"            = "/lib64"
    "/etc/ld.so.cache"  = "/etc/ld.so.cache"
    "/etc/ld.so.conf"   = "/etc/ld.so.conf"
    "/etc/ld.so.conf.d" = "/etc/ld.so.conf.d"
    "/etc/passwd"       = "/etc/passwd"
  }

  options = {
    "driver.denylist" = "raw_exec"
  }
}

tls {
  http                   = true
  rpc                    = true
  verify_server_hostname = true
  verify_https_client    = false
  ca_file                = "/opt/nomad/tls/nomad-ca.pem"
  cert_file              = "/opt/nomad/tls/nomad-cert.pem"
  key_file               = "/opt/nomad/tls/nomad-key.pem"
}

telemetry {
  collection_interval        = "1s"
  disable_hostname           = true
  prometheus_metrics         = true
  publish_allocation_metrics = true
  publish_node_metrics       = true
}

plugin "docker" {
  config {

    allow_privileged = true

    volumes {
      enabled = true
    }

    extra_labels = ["job_name", "task_group_name", "task_name", "namespace", "node_name"]

    auth {
      config = "/opt/nomad/docker.config.json"
    }
  }
}

Consul agent config on the nomad client node

server = false

node_name                   = "i-040bc7525ff44b848"
bind_addr                   = "{{ GetInterfaceIP \"ens5\" }}"
advertise_addr              = "{{ GetInterfaceIP \"ens5\" }}"
client_addr                 = "0.0.0.0"
data_dir                    = "/opt/consul"
datacenter                  = "ops"
primary_datacenter          = "ops"
leave_on_terminate          = true
enable_agent_tls_for_checks = true


encrypt                 = "<redacted>"
encrypt_verify_incoming = true
encrypt_verify_outgoing = true

retry_join = [
  "provider=aws region=eu-west-3 tag_key=ConsulClusterID tag_value=<redacted> addr_type=private_v4 service=ec2"
]

acl {
  enabled                  = true
  default_policy           = "deny"
  down_policy              = "extend-cache"
  enable_token_persistence = true
}

connect {
  enabled     = true
  ca_provider = "consul"
}

ports {
  http     = 8500 # TCP only
  https    = 8501 # TCP only
  grpc     = 8502 # TCP only
  grpc_tls = 8503 # TCP only
  dns      = 8600 # TCP and UDP
  server   = 8300 # TCP only
  serf_lan = 8301 # TCP and UDP
  serf_wan = 8302 # TCP and UDP
}

node_meta {
  server_type       = "nomad-client"
  instance_type     = "t3.medium"
  availability_zone = "eu-west-3b"
  ami_id            = "ami-<redacted>"
}

autopilot {
  cleanup_dead_servers      = true
  last_contact_threshold    = "200ms"
  max_trailing_logs         = 250
  server_stabilization_time = "10s"
}

telemetry {
  prometheus_retention_time = "60s"
  disable_hostname          = true
}

ui_config {
  enabled = false
}

auto_encrypt {
  tls = true
}

peering {
  enabled = true
}

tls {
  defaults {
    verify_incoming        = true
    verify_outgoing        = true
    verify_server_hostname = true
    ca_file                = "/opt/consul/tls/consul-ca.pem"
    key_file               = "/opt/consul/tls/consul-key.pem"
    cert_file              = "/opt/consul/tls/consul-cert.pem"
  }

  grpc {
    verify_incoming = false
  }

  https {
    verify_incoming = false
  }
}

Nomad server config

data_dir = "/opt/nomad/data"

region     = "ops"
datacenter = "eu-west-3a"

name = "i-06d5eeb006c024855"

bind_addr = "0.0.0.0"

leave_on_terminate = true

advertise {
  http = "{{ GetInterfaceIP \"ens5\" }}"
  rpc  = "{{ GetInterfaceIP \"ens5\" }}"
  serf = "{{ GetInterfaceIP \"ens5\" }}"
}

consul {
  address = "127.0.0.1:8501"
  auto_advertise = true

  ssl = true
  verify_ssl = true
  ca_file = "/opt/consul/tls/consul-ca.pem"
  key_file = "/opt/consul/tls/consul-key.pem"
  cert_file = "/opt/consul/tls/consul-cert.pem"

  service_identity {
    aud = ["nomad.<redacted>"]
    ttl = "1h"
  }

  task_identity {
    env = true
    ttl = "1h"
    aud = ["nomad.<redacted>"]
    file = true
  }
}

server {
  enabled              = true
  bootstrap_expect     = 3
  node_gc_threshold    = "30m"
  rejoin_after_leave   = true
}

acl {
  enabled    = true
  token_ttl  = "30s"
  policy_ttl = "60s"
}

tls {
  http                   = true
  rpc                    = true
  verify_server_hostname = true
  verify_https_client    = false
  ca_file                = "/opt/nomad/tls/nomad-ca.pem"
  cert_file              = "/opt/nomad/tls/nomad-cert.pem"
  key_file               = "/opt/nomad/tls/nomad-key.pem"
}

telemetry {
  collection_interval        = "1s"
  disable_hostname           = true
  prometheus_metrics         = true
  publish_allocation_metrics = true
  publish_node_metrics       = true
}

Has anyone encountered this issue before?