Consul Connect with health checks

How would you run a Consul Connect-enabled job with health checks?

The typical countdash example from the documentation:

job "countdash" {
  datacenters = ["dc1"]

  group "api" {
    network {
      mode = "bridge"
    }

    service {
      name = "count-api"
      port = "9001"

      connect {
        sidecar_service {}
      }
    }

    task "web" {
      driver = "docker"

      config {
        image = "hashicorpnomad/counter-api:v1"
      }
    }
  }

  group "dashboard" {
    network {
      mode = "bridge"

      port "http" {
        static = 9002
        to     = 9002
      }
    }

    service {
      name = "count-dashboard"
      port = "9002"

      connect {
        sidecar_service {
          proxy {
            upstreams {
              destination_name = "count-api"
              local_bind_port  = 8080
            }
          }
        }
      }
    }

    task "dashboard" {
      driver = "docker"

      env {
        COUNTING_SERVICE_URL = "http://${NOMAD_UPSTREAM_ADDR_count_api}"
      }

      config {
        image = "hashicorpnomad/counter-dashboard:v1"
      }
    }
  }
}

Attempt 1

This results in the health check never going green:

  group "api" {
    network {
      mode = "bridge"
      port "http" {
        to = "9001"
      }
    }
    service {
      name = "count-api"
      port = "http"
      check {
        port = "http"
        type = "http"
        path = "/"
        interval = "5s"
        timeout = "2s"
      }
      connect {
        sidecar_service {}
      }
    }
...

Attempt 2

Add a service block with the check to the task (instead of putting the check on the group service).

This fails with:
port label "http" referenced by services countdash-bad1-api-web does not exist

  group "api" {
    network {
      mode = "bridge"
      port "http" {
        to = "9001"
      }
    }

    service {
      name = "count-api"
      port = "http"
      connect {
        sidecar_service {}
      }
    }

    task "web" {
      driver = "docker"

      config {
        image = "hashicorpnomad/counter-api:v1"
      }
      service {
        check {
          port = "http"
          type = "http"
          path = "/"
          interval = "5s"
          timeout = "2s"
        }
      }
    }
  }

Attempt 3

Use the address_mode = "driver" option on the check.

    service {
      name = "count-api"
      port = "http"
      check {
        port = "http"
        type = "http"
        path = "/"
        interval = "5s"
        timeout = "2s"
        address_mode = "driver"
      }
      connect {
        sidecar_service {}
      }
    }

Full example

job "countdash3" {
  datacenters = ["dc1"]

  group "api" {
    network {
      mode = "bridge"
      port "http" {
        to = "9001"
      }
    }

    service {
      name = "count-api"
      port = "http"
      check {
        port = "http"
        type = "http"
        path = "/"
        interval = "5s"
        timeout = "2s"
        address_mode = "driver"
      }
      connect {
        sidecar_service {}
      }
    }

    task "web" {
      driver = "docker"

      config {
        image = "hashicorpnomad/counter-api:v1"
      }
    }
  }

  group "dashboard" {
    network {
      mode = "bridge"

      port "http" {
        static = 9002
        to     = 9002
      }
    }

    service {
      name = "count-dashboard"
      port = "9002"

      connect {
        sidecar_service {
          proxy {
            upstreams {
              destination_name = "count-api"
              local_bind_port  = 8080
            }
          }
        }
      }
    }

    task "dashboard" {
      driver = "docker"

      env {
        COUNTING_SERVICE_URL = "http://${NOMAD_UPSTREAM_ADDR_count_api}"
      }

      config {
        image = "hashicorpnomad/counter-dashboard:v1"
      }
    }
  }
}

I believe address_mode = "driver" is required because the task runs in a separate network namespace from the Consul agent (which performs the health checking). Setting address_mode lets Consul run the check against the Docker task's IP:port instead of the host's IP:port.

I have had mixed results using this in jobs. It seems like health checking is not fully baked with network namespaces yet.
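
If I recall correctly, newer Nomad versions (0.11+, my assumption) also support an expose option on the check, which has the Envoy sidecar publish the check endpoint outside the bridge namespace so the Consul agent can reach it without address_mode. A rough, untested sketch of what that would look like for the api group:

  group "api" {
    network {
      mode = "bridge"
    }

    service {
      name = "count-api"
      port = "9001"

      connect {
        sidecar_service {}
      }

      check {
        # expose asks Nomad to generate an Envoy expose path for this check,
        # so the Consul agent can hit it over the host network even though the
        # task runs in its own namespace (assumes Nomad 0.11+ and bridge mode)
        expose   = true
        type     = "http"
        path     = "/"
        interval = "5s"
        timeout  = "2s"
      }
    }
...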

Related issue: https://github.com/hashicorp/nomad/issues/7556