Wrong virtual IP in Consul dataplane Envoy configuration

Hello,

We’re having a strange issue with Consul’s service mesh: the wrong virtual IP address is being added to the Envoy configuration (consul-dataplane), which prevents communication between services.
Environment: Kubernetes 1.28 (EKS), Consul 1.17.1, consul-dataplane 1.3.1, with Karpenter for node auto-scaling.

When looking at the sidecar proxy registration in the Consul catalog, the virtual IP address is correct: 240.0.0.25.

curl 127.0.0.1:8500/v1/catalog/service/broken-service-sidecar-proxy | jq
 
[
  {
    "ID": "",
    "Node": "ip-10-16-xx-xxx.eu-central-1.compute.internal-virtual",
    "Address": "10.16.xx.xxx",
    "Datacenter": "datacenter",
    "TaggedAddresses": null,
    "NodeMeta": {
      "synthetic-node": "true"
    },
    "ServiceKind": "connect-proxy",
    "ServiceID": "broken-service-6958df5d68-5hpsd-broken-service-sidecar-proxy",
    "ServiceName": "broken-service-sidecar-proxy",
    "ServiceTags": [],
    "ServiceAddress": "10.30.29.117",
    "ServiceTaggedAddresses": {
      "consul-virtual": {
        "Address": "240.0.0.25",
        "Port": 20000
      },
      "virtual": {
        "Address": "10.97.38.160",
        "Port": 9000
      }
    },
    "ServiceWeights": {
      "Passing": 1,
      "Warning": 1
    },
    "ServiceMeta": {
      "k8s-namespace": "broken-service",
      "k8s-service-name": "broken-service",
      "managed-by": "consul-k8s-endpoints-controller",
      "pod-name": "broken-service-6958df5d68-5hpsd",
      "pod-uid": "9eeb2ba6-63d6-4957-8059-57cfee6266a9",
      "synthetic-node": "true"
    },
    "ServicePort": 20000,
    "ServiceSocketPath": "",
    "ServiceEnableTagOverride": false,
    "ServiceProxy": {
      "DestinationServiceName": "broken-service",
      "DestinationServiceID": "broken-service-6958df5d68-5hpsd-broken-service",
      "LocalServiceAddress": "127.0.0.1",
      "LocalServicePort": 9000,
      "Mode": "transparent",
      "Config": {
        "envoy_prometheus_bind_addr": "0.0.0.0:20200"
      },
      "MeshGateway": {},
      "Expose": {}
    },
    "ServiceConnect": {},
    "ServiceLocality": {
      "Region": "eu-central-1",
      "Zone": "eu-central-1b"
    },
    "CreateIndex": 5778295,
    "ModifyIndex": 5778295
  },
  {
    "ID": "",
    "Node": "ip-10-16-x-xxx.eu-central-1.compute.internal-virtual",
    "Address": "10.16.x.xxx",
    "Datacenter": "datacenter",
    "TaggedAddresses": null,
    "NodeMeta": {
      "synthetic-node": "true"
    },
    "ServiceKind": "connect-proxy",
    "ServiceID": "broken-service-6958df5d68-6tsjc-broken-service-sidecar-proxy",
    "ServiceName": "broken-service-sidecar-proxy",
    "ServiceTags": [],
    "ServiceAddress": "10.30.10.117",
    "ServiceTaggedAddresses": {
      "consul-virtual": {
        "Address": "240.0.0.25",
        "Port": 20000
      },
      "virtual": {
        "Address": "10.97.38.160",
        "Port": 9000
      }
    },
    "ServiceWeights": {
      "Passing": 1,
      "Warning": 1
    },
    "ServiceMeta": {
      "k8s-namespace": "broken-service",
      "k8s-service-name": "broken-service",
      "managed-by": "consul-k8s-endpoints-controller",
      "pod-name": "broken-service-6958df5d68-6tsjc",
      "pod-uid": "af61d800-f2b5-4bdf-91b3-3a7930a57f40",
      "synthetic-node": "true"
    },
    "ServicePort": 20000,
    "ServiceSocketPath": "",
    "ServiceEnableTagOverride": false,
    "ServiceProxy": {
      "DestinationServiceName": "broken-service",
      "DestinationServiceID": "broken-service-6958df5d68-6tsjc-broken-service",
      "LocalServiceAddress": "127.0.0.1",
      "LocalServicePort": 9000,
      "Mode": "transparent",
      "Config": {
        "envoy_prometheus_bind_addr": "0.0.0.0:20200"
      },
      "MeshGateway": {},
      "Expose": {}
    },
    "ServiceConnect": {},
    "ServiceLocality": {
      "Region": "eu-central-1",
      "Zone": "eu-central-1a"
    },
    "CreateIndex": 5778298,
    "ModifyIndex": 5778298
  },
  {
    "ID": "",
    "Node": "ip-10-16-x-xxx.eu-central-1.compute.internal-virtual",
    "Address": "10.16.x.xxx",
    "Datacenter": "datacenter",
    "TaggedAddresses": null,
    "NodeMeta": {
      "synthetic-node": "true"
    },
    "ServiceKind": "connect-proxy",
    "ServiceID": "broken-service-6958df5d68-m78p6-broken-service-sidecar-proxy",
    "ServiceName": "broken-service-sidecar-proxy",
    "ServiceTags": [],
    "ServiceAddress": "10.30.10.116",
    "ServiceTaggedAddresses": {
      "consul-virtual": {
        "Address": "240.0.0.25",
        "Port": 20000
      },
      "virtual": {
        "Address": "10.97.38.160",
        "Port": 9000
      }
    },
    "ServiceWeights": {
      "Passing": 1,
      "Warning": 1
    },
    "ServiceMeta": {
      "k8s-namespace": "broken-service",
      "k8s-service-name": "broken-service",
      "managed-by": "consul-k8s-endpoints-controller",
      "pod-name": "broken-service-6958df5d68-m78p6",
      "pod-uid": "cd7b71b7-066a-40a9-b349-3c82568a6224",
      "synthetic-node": "true"
    },
    "ServicePort": 20000,
    "ServiceSocketPath": "",
    "ServiceEnableTagOverride": false,
    "ServiceProxy": {
      "DestinationServiceName": "broken-service",
      "DestinationServiceID": "broken-service-6958df5d68-m78p6-broken-service",
      "LocalServiceAddress": "127.0.0.1",
      "LocalServicePort": 9000,
      "Mode": "transparent",
      "Config": {
        "envoy_prometheus_bind_addr": "0.0.0.0:20200"
      },
      "MeshGateway": {},
      "Expose": {}
    },
    "ServiceConnect": {},
    "ServiceLocality": {
      "Region": "eu-central-1",
      "Zone": "eu-central-1a"
    },
    "CreateIndex": 5778305,
    "ModifyIndex": 5778305
  }
]
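
For reference, the same catalog endpoint can be filtered down to just the virtual address per proxy instance (a sketch using the fields from the output above):

curl -s 127.0.0.1:8500/v1/catalog/service/broken-service-sidecar-proxy \
  | jq '.[] | {pod: .ServiceMeta."pod-name", virtual: .ServiceTaggedAddresses."consul-virtual".Address}'

All three instances report 240.0.0.25 here.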

The service defaults and service intentions are configured as follows; the intentions use a wildcard source:

---
apiVersion: consul.hashicorp.com/v1alpha1
kind: ServiceDefaults
metadata:
  name: broken-service
  namespace: broken-service
spec:
  protocol: grpc
  transparentProxy:
    dialedDirectly: true
---
apiVersion: consul.hashicorp.com/v1alpha1
kind: ServiceIntentions
metadata:
  name: broken-service
  namespace: broken-service
spec:
  destination:
    name: broken-service
  sources:
    - name: "*"
      action: allow
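
For what it’s worth, the sync status of these CRs can be checked with the usual kubectl command (a sketch; the SYNCED column comes from the consul-k8s CRDs and the exact columns may vary by version):

kubectl -n broken-service get servicedefaults,serviceintentions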

DNS also resolves correctly:

# nslookup broken-service.virtual.consul
Server:		127.0.0.53
Address:	127.0.0.53#53

Non-authoritative answer:
Name:	broken-service.virtual.consul
Address: 240.0.0.25

But if we then look at the consul-dataplane Envoy configuration of service B, we see that in some pods the correct configuration is applied:

       "filter_chain_match": {
          "prefix_ranges": [
           {
            "address_prefix": "10.97.38.160",
            "prefix_len": 32
           },
           {
            "address_prefix": "240.0.0.25",
            "prefix_len": 32
           }
          ]
         },

And in other pods it is not:

       "filter_chain_match": {
          "prefix_ranges": [
           {
            "address_prefix": "10.97.38.160",
            "prefix_len": 32
           },
           {
            "address_prefix": "240.0.0.23",
            "prefix_len": 32
           }
          ]
         },

The Kubernetes Service itself has the expected ClusterIP (the same 10.97.38.160 that appears as the "virtual" tagged address above):

kubectl -n broken-service get svc
NAME                TYPE        CLUSTER-IP     EXTERNAL-IP   PORT(S)    AGE
broken-service   ClusterIP   10.97.38.160   <none>        9000/TCP   65m
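
For completeness, this is roughly how we pull the filter chain prefixes out of a pod’s Envoy config (just a sketch: it assumes the Envoy admin API is on consul-dataplane’s default 127.0.0.1:19000, that curl exists in the application container, and the pod/container names are placeholders):

# list every filter_chain_match prefix in the pod's Envoy config dump
kubectl -n <service-b-namespace> exec <service-b-pod> -c <app-container> -- \
  curl -s http://127.0.0.1:19000/config_dump \
  | jq '[.. | objects | .filter_chain_match? | select(. != null) | .prefix_ranges[]?.address_prefix]'

In the "bad" pods this is where 240.0.0.23 shows up instead of 240.0.0.25.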

We tried removing the app and adding it back, but the problem still exists. We don’t know where to look further; any advice is greatly appreciated.

Hi @sergeij,

Thanks for providing such detailed information. I’m sorry you’re seeing this issue.

In order to help us troubleshoot this, could you tell me whether the service defaults or service intentions are being modified as part of the autoscaling process?

Thanks.

Hi @blake,

No, service defaults and service intentions are not modified during autoscaling. They are always the same as described above.
We are using the Kubernetes horizontal pod autoscaler, and service B, which has these “bad” entries in its Envoy configuration, scales up/down quite frequently during peak hours.
It should be noted that there are no other services in Kubernetes or Consul with the same name as broken-service. I also already tried to clean up some dead nodes that I thought might be causing the problem, since we have run into issues like that before: Nodes and dead services remaining in the consul catalog · Issue #2065 · hashicorp/consul-k8s · GitHub.
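
The cleanup was roughly along these lines (just a sketch: the node name is an example copied from the catalog output above, and an ACL token is assumed in CONSUL_HTTP_TOKEN):

# deregister a stale synthetic node (and everything registered on it) from the catalog
curl -s -X PUT \
  -H "X-Consul-Token: $CONSUL_HTTP_TOKEN" \
  -d '{"Datacenter": "datacenter", "Node": "ip-10-16-xx-xxx.eu-central-1.compute.internal-virtual"}' \
  127.0.0.1:8500/v1/catalog/deregister
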
So I just don’t know where to look anymore. Another interesting thing is that the virtual IP 240.0.0.23 actually belongs to another service (“service C”), which does not have any service intentions defined, only service defaults:

---
apiVersion: consul.hashicorp.com/v1alpha1
kind: ServiceDefaults
metadata:
  name: service-c
  namespace: service-c
spec:
  transparentProxy:
    dialedDirectly: true
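
One quick way to confirm which service owns each virtual IP is Consul DNS (a sketch; it assumes dig is available and uses the same *.virtual.consul names as the nslookup above):

# compare the virtual IPs Consul DNS returns for the two services
for svc in broken-service service-c; do
  printf '%s -> ' "$svc"
  dig +short "$svc.virtual.consul"
done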

It should also be noted that the Consul servers are running on VMs with the following config:

{
    "server": true,
    "ui_config": {
      "enabled": true,
      "metrics_provider": "prometheus",
      "metrics_proxy": {
        "base_url": "http://prom.service.consul:9090"
      },
      "dashboard_url_templates": {
        "service": "https://grafana-datacenter/d/service-overview/service-overview?orgId=1&refresh=30s&var-service={{Service.Name}}&var-dc={{Datacenter}}"
      } 
    },
    "bootstrap_expect": 3,
    "datacenter": "datacenter",
    "domain": "consul",
    "data_dir": "/var/lib/consul",
    "bind_addr": "0.0.0.0",
    "client_addr": "0.0.0.0",
    "advertise_addr": "x.x.x.x",
    "node_name": "consul-1",
    "ports": {
      "http": 8500,
      "https": 8501,
      "grpc_tls": 8502
    
    },
    "retry_join": ["x.x.x.x","x.x.x.x","x.x.x.x"],
    "encrypt": "encryption_key",
    "log_level": "ERROR",
    "enable_syslog": true,
    "verify_incoming": false,
    "verify_outgoing": true,
    "verify_server_hostname": true,
    "ca_file": "ca_file",
    "cert_file": "cert_file",
    "key_file": "key_file",
    "auto_encrypt": {
      "allow_tls": true
    },
    "telemetry": {
      "prometheus_retention_time": "1m",
      "disable_hostname": true
    },
    "peering": {
      "enabled": true
    },
    "recursors" : ["x.x.x.x"],
    "leave_on_terminate": false,
    "acl": {
      "enabled" : true,
      "default_policy" : "deny",
      "down_policy" : "extend-cache"
    }
}

And the Kubernetes Helm chart values look like this:

global:
  name: consul
  enabled: false
  acls:
    manageSystemACLs: true
    bootstrapToken:
      secretName: consul-k8s
      secretKey: bootstrap-token
  gossipEncryption:
    secretName: consul-k8s
    secretKey: gossip-encryption-key
  domain: scoro
  logLevel: info
  tls:
    enabled: true
    enableAutoEncrypt: true
    httpsOnly: true
    caCert:
      secretName: consul-k8s
      secretKey: ca-cert
    caKey:
      secretName: consul-k8s
      secretKey: ca-key
client:
  enabled: false
externalServers:
  enabled: true
  hosts: ["x.x.x.x"]
  k8sAuthMethodHost: ""
syncCatalog:
  enabled: true
  default: false
connectInject:
  enabled: true
  replicas: 2
  envoyExtraArgs: "--component-log-level upstream:debug,http:debug,router:debug,config:debug"
  metrics:
    defaultEnabled: true
    defaultEnableMerging: false
    defaultPrometheusScrapePort: 20200
    defaultMergedMetricsPort: 20100
    defaultPrometheusScrapePath: "/metrics"
  updateStrategy: |
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
  disruptionBudget:
    enabled: true
    maxUnavailable: 1
  resources:
    requests:
      memory: "500Mi"
      cpu: "250m"
    limits:
      memory: "500Mi"
      cpu: "250m"
  sidecarProxy:
    resources:
      requests:
        memory: 100Mi
        cpu: 100m
      limits:
        memory: 200Mi
        cpu: 200m
dns:
  enabled: true
  enableRedirection: false
meshGateway:
  enabled: true
  service:
    type: LoadBalancer
    annotations: |
      service.beta.kubernetes.io/aws-load-balancer-type: "external"
      service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "ip"
      service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "443"