Nomad not cleaning up allocations on job purge

I’ve got a job that registers two services: the application and database server for the application.

When the application queries the Nomad service registry to get the database port, what it gets back ends up looking like this: nc: bad port '20521248333188625509'

The list of registrations just grows and grows: every job failure, followed by a purge to clean things up and try again, leaves behind allocations that are never removed and, seemingly, additional service entries in Nomad.
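Listing the services from the CLI shows the same pile-up. A minimal sketch of how I inspected it, assuming Nomad 1.4+ with the built-in (provider = "nomad") service registry:

# list every service registered with Nomad's native service discovery
nomad service list

# show each registration for the database service, including address, port and alloc ID
nomad service info semaphore-mysql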

Here’s the job definition:

job "ansible_home_lan" {
    datacenters = ["primary"]
    type = "service"

    group "semaphore" {
        network {
            port "http" {
                to = 3000
                host_network = "private"
            }
        }

        count = 1

        service {
            provider = "nomad"
            port = "http"
            name = "semaphore"

            tags = [
                "traefik.enable=true",
                "traefik.http.routers.ansible.rule=Host(`ansible.home.lan`)"
            ]

            check {
                name = "Semaphore HTTP"
                type = "tcp"
                port = "http"
                interval = "10s"
                timeout = "2s"
            }

            check_restart {
                limit = 3
                grace = "90s"
                ignore_warnings = false
            }
        }

        task "run" {
            driver = "docker"

            config {
                image = "semaphoreui/semaphore:v2.8.90"
                network_mode = "bridge"

                ports = ["http"]

                volumes = [
                    "/opt/nomad/storage/ansible.home.lan/playbooks:/tmp/playbooks",
                    "/opt/nomad/storage/ansible.home.lan/bin:/local/bin"
                ]
            }

            template {
                destination = "${NOMAD_SECRETS_DIR}/env.vars"
                env = true
                data = <<EOT
                SEMAPHORE_DB_USER={{ with nomadVar "nomad/jobs/ansible_home_lan"}}{{ .MYSQL_USER }}{{ end }}
                SEMAPHORE_DB_PASS={{ with nomadVar "nomad/jobs/ansible_home_lan"}}{{ .MYSQL_PASSWORD }}{{ end }}
                SEMAPHORE_DB_HOST={{ range nomadService "semaphore-mysql" }}{{ .Address }}{{ end }}
                SEMAPHORE_DB_PORT={{ range nomadService "semaphore-mysql" }}{{ .Port }}{{ end }}
                SEMAPHORE_DB_DIALECT=mysql
                SEMAPHORE_DB={{ with nomadVar "nomad/jobs/ansible_home_lan"}}{{ .MYSQL_DATABASE }}{{ end }}
                SEMAPHORE_PLAYBOOK_PATH=/tmp/playbooks
                SEMAPHORE_ADMIN_PASSWORD={{ with nomadVar "nomad/jobs/ansible_home_lan" }}{{ .SEMAPHORE_ADMIN_PASSWORD }}{{ end }}
                SEMAPHORE_ADMIN_NAME={{ with nomadVar "nomad/jobs/ansible_home_lan" }}{{ .SEMAPHORE_ADMIN_NAME }}{{ end }}
                SEMAPHORE_ADMIN_EMAIL={{ with nomadVar "nomad/jobs/ansible_home_lan" }}{{ .SEMAPHORE_ADMIN_EMAIL }}{{ end }}
                SEMAPHORE_ADMIN={{ with nomadVar "nomad/jobs/ansible_home_lan" }}{{ .SEMAPHORE_ADMIN_USER }}{{ end }}
                SEMAPHORE_ACCESS_KEY_ENCRYPTION={{ with nomadVar "nomad/jobs/ansible_home_lan" }}{{ .SEMAPHORE_ACCESS_KEY_ENCRYPTION }}{{ end }}
                SEMAPHORE_LDAP_ACTIVATED="no"
                EOT
            }

            resources {
                cpu = 300
                memory = 1024
            }
        }
    }

    group "database" {
        network {
            port "mysql" {
                to = 3306
                host_network = "private"
            }
        }

        count = 1

        service {
            provider = "nomad"
            port = "mysql"
            name = "semaphore-mysql"

            check {
                name = "MySQL"
                type = "tcp"
                port = "MySQL"
                interval = "10s"
                timeout = "2s"
            }

            check_restart {
                limit = 3
                grace = "90s"
                ignore_warnings = false
            }
        }

        task "run" {
            driver = "docker"

            config {
                image = "mysql:8.0"
                network_mode = "bridge"

                ports = ["mysql"]

                volumes = [
                    "/opt/nomad/storage/ansible.home.lan/database:/var/lib/mysql"
                ]
            }

            template {
                destination = "${NOMAD_SECRETS_DIR}/env.vars"
                env = true
                data = <<EOT
                MYSQL_RANDOM_ROOT_PASSWORD="yes"
                MYSQL_USER={{ with nomadVar "nomad/jobs/ansible_home_lan"}}{{ .MYSQL_USER }}{{ end }}
                MYSQL_PASSWORD={{ with nomadVar "nomad/jobs/ansible_home_lan"}}{{ .MYSQL_PASSWORD }}{{ end }}
                MYSQL_DB={{ with nomadVar "nomad/jobs/ansible_home_lan"}}{{ .MYSQL_DATABASE }}{{ end }}
                EOT
            }

            resources {
                cpu = 300
                memory = 1024
            }
        }
    }
}
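Looking at the template again, I think that mangled port is the range over nomadService doing exactly what it's told: it iterates every registered instance of semaphore-mysql, so with a pile of stale registrations SEMAPHORE_DB_PORT renders as several dynamic ports concatenated into one string. A sketch of a workaround that only reads the first instance, assuming nomadService renders a list the standard Go template index function can pick from; it masks the symptom but does not remove the stale entries:

SEMAPHORE_DB_HOST={{ with index (nomadService "semaphore-mysql") 0 }}{{ .Address }}{{ end }}
SEMAPHORE_DB_PORT={{ with index (nomadService "semaphore-mysql") 0 }}{{ .Port }}{{ end }}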

Kind of figured it out: Nomad doesn't seem to let go of old service registrations when the job is purged, and the stale entries show up in Traefik as well.

I reconfigured Traefik to use the Consul Catalog provider, fixed the port-name typo in the job above (the database check references port = "MySQL" while the port label is "mysql"), and things worked just fine.
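For anyone else who ends up here: the stale registrations can also be removed by hand. A rough sketch, again assuming Nomad 1.4+; the service ID placeholder is whatever shows up in the nomad service info output above:

# purge the job, then force a server-side garbage collection pass
nomad job stop -purge ansible_home_lan
nomad system gc

# delete a leftover registration by service name and ID
nomad service delete semaphore-mysql <service-id>

I haven't verified whether nomad system gc touches the service registrations at all, but the explicit delete should clear individual leftovers.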