Nomad Group Allocations

Hi,

I’ve got the following jobspec:

job "${subdomain}/environment" {
  datacenters = [ "${datacenter}" ]

  type = "service"

  migrate {
    max_parallel = 1
    health_check = "checks"
    min_healthy_time = "10s"
    healthy_deadline = "5m"
  }

  group "api" {

    update {
      max_parallel      = 1
      canary            = 2
      health_check      = "checks"
      min_healthy_time  = "10s"
      healthy_deadline  = "5m"
      progress_deadline = "10m"
      auto_revert       = true
    }

    restart {
      attempts = 3
      delay    = "10s"
      interval = "90s"
      mode     = "fail"
    }

    meta {
      version = "${version_label}"
      region = "${aws_region}"
      service = "api"
    }

    task "api" {
      driver = "docker"

      config {
        image = "api:${version_label}"
        force_pull = true

        dns_servers = [ "$${NOMAD_IP_http}" ]

        port_map = {
          http = 8080
          metrics = 8081
        }
      }

      resources {
        cpu    = 1500 # MHz
        memory = 1024 # MB

        network {
          port "http" {}
          port "metrics" {}
        }
      }

      service {
        name = "api"
        port = "http"

        tags = [
          "env=${subdomain}",
          "dc=${datacenter}",
          "public=true",
          "version=${version_label}",
          "vpc=${vpc_name}"
        ]

        canary_tags = [
          "canary=true",
          "env=${subdomain}",
          "dc=${datacenter}",
          "public=true",
          "version=${version_label}",
          "vpc=${vpc_name}" 
        ]

        check {
          type     = "http"
          port     = "metrics"
          path     = "/health"
          interval = "1m"
          timeout  = "5s"

          check_restart {
            limit = 3
            grace = "90s"
            ignore_warnings = false
          }
        }
      }

      service {
        name = "api-metrics"
        port = "metrics"

        tags = [
          "env=${subdomain}",
          "dc=${datacenter}",
          "version=${version_label}",
          "vpc=${vpc_name}"
        ]

        canary_tags = [
          "canary=true",
          "env=${subdomain}",
          "dc=${datacenter}",
          "version=${version_label}",
          "vpc=${vpc_name}" 
        ] 

        check {
          type     = "http"
          port     = "metrics"
          path     = "/metrics"
          interval = "1m"
          timeout  = "5s"
        }
      }
    }
  }

  group "worker" {
    
    update {
      max_parallel      = 1
      canary            = 1
      health_check      = "checks"
      min_healthy_time  = "10s"
      healthy_deadline  = "5m"
      progress_deadline = "10m"
      auto_revert       = true
    }

    restart {
      attempts = 3
      delay    = "10s"
      interval = "90s"
      mode     = "fail"
    }

    meta {
      version = "${version_label}"
      region = "${aws_region}"
      service = "worker"
    }

    task "worker" {
      driver = "docker"

      config {
        image = "worker:${version_label}"
        force_pull = true

        dns_servers = [ "$${NOMAD_IP_http}" ]

        port_map = {
          http = 8600
          metrics = 8601
        }
      }

      resources {
        cpu    = 1000 # MHz
        memory = 500 # MB

        network {
          port "http" {}
          port "metrics" {}
        }
      }

      service {
        name = "worker"
        port = "http"

        tags = [
          "env=${subdomain}",
          "dc=${datacenter}",
          "version=${version_label}",
          "vpc=${vpc_name}"
        ]

        canary_tags = [
          "canary=true",
          "env=${subdomain}",
          "dc=${datacenter}",
          "version=${version_label}",
          "vpc=${vpc_name}" 
        ]

        check {
          type     = "http"
          port     = "metrics"
          path     = "/health"
          interval = "1m"
          timeout  = "5s"

          check_restart {
            limit = 3
            grace = "90s"
            ignore_warnings = false
          }
        } 
      }

      service {
        name = "worker-metrics"
        port = "metrics"

        tags = [
          "env=${subdomain}",
          "dc=${datacenter}",
          "version=${version_label}",
          "vpc=${vpc_name}"
        ]

        canary_tags = [
          "canary=true",
          "env=${subdomain}",
          "dc=${datacenter}",
          "version=${version_label}",
          "vpc=${vpc_name}" 
        ]

        check {
          type     = "http"
          port     = "metrics"
          path     = "/metrics"
          interval = "1m"
          timeout  = "5s"
        }
      }
    }
  }
}

Recently a worker allocation got Out Of Memory killed by the Docker daemon, and that also took the API allocation offline and re-scheduled it. My understanding was that groups are isolated from each other, even when their allocations land on the same Nomad client. Is there a way to make groups standalone, so that one group failing (hitting its memory or CPU limits) doesn’t de-allocate or affect another group within the same jobspec?
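
To illustrate what I mean by “standalone”: the only workaround I can think of is splitting each group out into its own job, in two separate job files, roughly like this (just a sketch of the shape; the job names are placeholders and the group bodies would be unchanged from above):

job "${subdomain}/environment-api" {
  datacenters = [ "${datacenter}" ]

  type = "service"

  group "api" {
    # ... same update / restart / meta / task stanzas as the "api" group above
  }
}

# second file
job "${subdomain}/environment-worker" {
  datacenters = [ "${datacenter}" ]

  type = "service"

  group "worker" {
    # ... same stanzas as the "worker" group above
  }
}

but I’d prefer to keep everything in a single jobspec if the groups can be isolated there.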

Here are the task events from the killed worker allocation:

Jun 20, '19 14:55:40 -0500 	Killed 	Task successfully killed
Jun 20, '19 14:55:40 -0500 	Terminated 	Exit Code: 137, Exit Message: "Docker container exited with non-zero exit code: 137"
Jun 20, '19 14:55:34 -0500 	Killing 	Sent interrupt
Jun 20, '19 11:49:51 -0500 	Started 	Task started by client

and here are the events from the API allocation:

Jun 20, '19 14:55:38 -0500 	Killed 	Task successfully killed
Jun 20, '19 14:55:38 -0500 	Terminated 	Exit Code: 130, Exit Message: "Docker container exited with non-zero exit code: 130"
Jun 20, '19 14:55:34 -0500 	Killing 	Sent interrupt
Jun 20, '19 11:49:48 -0500 	Started 	Task started by client

My hunch right now is that when one allocation is killed, all other groups within the jobspec get re-scheduled as well. Is that actually the case?
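
In case it’s relevant: I haven’t set a reschedule stanza anywhere, so both groups are on the defaults. If per-group reschedule behaviour matters here, I assume it would be tuned with something like this (just a sketch, the values are illustrative and not something I’m running today):

  group "worker" {

    reschedule {
      attempts  = 2
      interval  = "1h"
      unlimited = false
    }

    # ... rest of the group as above
  }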

Versions:

Nomad 0.9.2
Consul 1.5.1