Hello,
I’ve recently set up a new cluster with nomad 1.7.x and consul 1.17.x.
I am attempting to correct multiple issues that plague our current production environment, mainly:
- mTLS for nomad and consul control plane
- Use of workload identity for nomad jobs
- Cluster peering for different workloads on several AWS accounts.
As I’ve managed to clean up most of my issues so far, one remains that has bugged me for the past few days.
Whenever I set up a Nomad job making use of Consul Connect, I get the following error repeated:
DeltaAggregatedResources gRPC config stream to local_agent closed: 7, Permission denied: anonymous token lacks permission 'service:write' on "test-server". The anonymous token is used implicitly when a request does not specify a token.
My initial thought was that I needed to set a default Consul ACL token on the local Consul agent running on my node, but that was to no avail — since I am using Envoy through the Consul gRPC interface, that token does not apply, according to this issue.
I’ve then tried to set up Nomad workload identities, and I can see my service identity being created in the Consul dashboard, with the service registered and the token carrying the previously missing 'service:write' permission. This hasn’t stopped Envoy from complaining about the missing token.
This is my job definition:
job "test-server" {
datacenters = ["eu-west-3a", "eu-west-3b", "eu-west-3c"]
type = "service"
update {
stagger = "30s"
max_parallel = 2
}
group "app" {
count = 1
network {
mode = "bridge"
port "http" {
to = 8080
}
}
service {
port = 8080
name = "test-server"
task = "app"
tags = [
"http",
"traefik.enable=true",
"traefik.http.routers.test.rule=PathPrefix(`/test`)",
"traefik.http.routers.test.middlewares=test_strip_prefix",
"traefik.http.middlewares.test_strip_prefix.stripprefix.prefixes=/test",
"traefik.consulcatalog.connect=true"
]
connect {
sidecar_service {}
sidecar_task {
resources {
cpu = 64
memory = 320
}
}
}
check {
type = "http"
port = "http"
path = "/ping"
interval = "10s"
timeout = "2s"
}
}
task "app" {
driver = "docker"
config {
image = "XXXXXXXX.dkr.ecr.eu-west-3.amazonaws.com/test-server:latest"
ports = ["http"]
command = "/main"
}
resources {
cpu = 500 # MHz
memory = 128 # MB
}
}
}
}
Nomad client config:
data_dir = "/opt/nomad/data"
plugin_dir = "/opt/nomad/plugins"
region = "ops"
datacenter = "eu-west-3b"
name = "i-040bc7525ff44b848"
bind_addr = "0.0.0.0"
leave_on_terminate = true
advertise {
http = "{{ GetInterfaceIP \"ens5\" }}"
rpc = "{{ GetInterfaceIP \"ens5\" }}"
serf = "{{ GetInterfaceIP \"ens5\" }}"
}
consul {
address = "127.0.0.1:8501"
auto_advertise = true
grpc_address = "127.0.0.1:8503"
grpc_ca_file = "/opt/consul/tls/consul-ca.pem"
share_ssl = true
ssl = true
verify_ssl = true
ca_file = "/opt/consul/tls/consul-ca.pem"
key_file = "/opt/consul/tls/consul-key.pem"
cert_file = "/opt/consul/tls/consul-cert.pem"
}
client {
enabled = true
node_class = "default"
host_volume "docker-sock-ro" {
path = "/var/run/docker.sock"
read_only = true
policy = "read"
}
chroot_env {
"/bin" = "/bin"
"/lib" = "/lib"
"/lib64" = "/lib64"
"/etc/ld.so.cache" = "/etc/ld.so.cache"
"/etc/ld.so.conf" = "/etc/ld.so.conf"
"/etc/ld.so.conf.d" = "/etc/ld.so.conf.d"
"/etc/passwd" = "/etc/passwd"
}
options = {
"driver.denylist" = "raw_exec"
}
}
tls {
http = true
rpc = true
verify_server_hostname = true
verify_https_client = false
ca_file = "/opt/nomad/tls/nomad-ca.pem"
cert_file = "/opt/nomad/tls/nomad-cert.pem"
key_file = "/opt/nomad/tls/nomad-key.pem"
}
telemetry {
collection_interval = "1s"
disable_hostname = true
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}
plugin "docker" {
config {
allow_privileged = true
volumes {
enabled = true
}
extra_labels = ["job_name", "task_group_name", "task_name", "namespace", "node_name"]
auth {
config = "/opt/nomad/docker.config.json"
}
}
}
Consul agent config on the Nomad client node:
server = false
node_name = "i-040bc7525ff44b848"
bind_addr = "{{ GetInterfaceIP \"ens5\" }}"
advertise_addr = "{{ GetInterfaceIP \"ens5\" }}"
client_addr = "0.0.0.0"
data_dir = "/opt/consul"
datacenter = "ops"
primary_datacenter = "ops"
leave_on_terminate = true
enable_agent_tls_for_checks = true
encrypt = "<redacted>"
encrypt_verify_incoming = true
encrypt_verify_outgoing = true
retry_join = [
"provider=aws region=eu-west-3 tag_key=ConsulClusterID tag_value=<redacted> addr_type=private_v4 service=ec2"
]
acl {
enabled = true
default_policy = "deny"
down_policy = "extend-cache"
enable_token_persistence = true
}
connect {
enabled = true
ca_provider = "consul"
}
ports {
http = 8500 # TCP only
https = 8501 # TCP only
grpc = 8502 # TCP only
grpc_tls = 8503 # TCP only
dns = 8600 # TCP and UDP
server = 8300 # TCP only
serf_lan = 8301 # TCP and UDP
serf_wan = 8302 # TCP and UDP
}
node_meta {
server_type = "nomad-client"
instance_type = "t3.medium"
availability_zone = "eu-west-3b"
ami_id = "ami-<redacted>"
}
autopilot {
cleanup_dead_servers = true
last_contact_threshold = "200ms"
max_trailing_logs = 250
server_stabilization_time = "10s"
}
telemetry {
prometheus_retention_time = "60s"
disable_hostname = true
}
ui_config {
enabled = false
}
auto_encrypt {
tls = true
}
peering {
enabled = true
}
tls {
defaults {
verify_incoming = true
verify_outgoing = true
verify_server_hostname = true
ca_file = "/opt/consul/tls/consul-ca.pem"
key_file = "/opt/consul/tls/consul-key.pem"
cert_file = "/opt/consul/tls/consul-cert.pem"
}
grpc {
verify_incoming = false
}
https {
verify_incoming = false
}
}
Nomad server config:
data_dir = "/opt/nomad/data"
region = "ops"
datacenter = "eu-west-3a"
name = "i-06d5eeb006c024855"
bind_addr = "0.0.0.0"
leave_on_terminate = true
advertise {
http = "{{ GetInterfaceIP \"ens5\" }}"
rpc = "{{ GetInterfaceIP \"ens5\" }}"
serf = "{{ GetInterfaceIP \"ens5\" }}"
}
consul {
address = "127.0.0.1:8501"
auto_advertise = true
ssl = true
verify_ssl = true
ca_file = "/opt/consul/tls/consul-ca.pem"
key_file = "/opt/consul/tls/consul-key.pem"
cert_file = "/opt/consul/tls/consul-cert.pem"
service_identity {
aud = ["nomad.<redacted>"]
ttl = "1h"
}
task_identity {
env = true
ttl = "1h"
aud = ["nomad.<redacted>"]
file = true
}
}
server {
enabled = true
bootstrap_expect = 3
node_gc_threshold = "30m"
rejoin_after_leave = true
}
acl {
enabled = true
token_ttl = "30s"
policy_ttl = "60s"
}
tls {
http = true
rpc = true
verify_server_hostname = true
verify_https_client = false
ca_file = "/opt/nomad/tls/nomad-ca.pem"
cert_file = "/opt/nomad/tls/nomad-cert.pem"
key_file = "/opt/nomad/tls/nomad-key.pem"
}
telemetry {
collection_interval = "1s"
disable_hostname = true
prometheus_metrics = true
publish_allocation_metrics = true
publish_node_metrics = true
}
Has anyone encountered this issue before?