Keep the managed process alive in a Nomad job after shutting down the Nomad agent

We have a Nomad job with both kill_signal and kill_timeout configured in the main task to control how the process shuts down when Nomad needs to kill it. I removed these configs in the hope that the process would not be killed when I shut down the Nomad agent on the host (I have a special use case that requires a controlled, manual upgrade of the process this way). However, every time I shut down the Nomad agent, Nomad kills my process before the agent stops. I'm not sure which configuration in my Nomad job is controlling this behavior.
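
For reference, the stanzas I removed from the main task looked roughly like this (just a sketch; the signal and timeout values here are illustrative, not our real ones):

task "main" {
  driver = "raw_exec"

  # Signal Nomad sends to the process when it wants the task to stop.
  kill_signal  = "SIGTERM"

  # How long Nomad waits after sending kill_signal before escalating to SIGKILL.
  kill_timeout = "90s"

  ...
}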

Here's my Nomad job (details omitted):

job "myjob" {
  type             = "service"

  region          = "${nomad_region}"
  datacenters     = ["${nomad_datacenter}"]

  constraint {
    operator  = "distinct_hosts"
    value     = "true"
  }

  meta {
  ...
  }

  group "main" {
    count = 3

    constraint {
      attribute = "$${meta.ResourceId}"
      operator  = "=="
      value     = "${resource_id}"
    }

    update {
      max_parallel        = 1
      health_check        = "checks"
      min_healthy_time    = "30s"
      healthy_deadline    = "24h"
      progress_deadline   = "25h"
      auto_revert         = false
      canary              = 0
    }

    # using `delay` mode to have nomad keep trying on the same node instead of
    # trying to schedule the task group onto another node
    #
    # ref: https://www.nomadproject.io/docs/job-specification/restart.html
    restart {
      attempts  = 10
      interval  = "15m"
      delay     = "5s"
      mode      = "delay"
    }

    reschedule {
      unlimited      = false
      attempts       = 0
      interval       = "1h"
      delay          = "30s"
      delay_function = "exponential"
      max_delay      = "30m"
    }

    ephemeral_disk {
      size = 2048 #MB
    }

    task "my-prestart" {
      lifecycle {
        hook    = "prestart"
        sidecar = false
      }
      driver = "raw_exec"

      config {
        command = "/bin/bash"
        args = [
          "local/start",
        ]
      }

      template {
        destination = "local/start"
        data = <<EOH
#!/bin/bash
set -x
...
echo 'done preparing for kafka...'
EOH
      }

    }

    task "main" {
      leader = true
      driver = "raw_exec"

      artifact {
        source      = "..."
        destination = "local/kafka"
      }

      artifact {
        source      = "..."
        destination = "local/zookeeper"
      }

      logs {
        max_files     = 2
        max_file_size = 500
      }

      env {
        ...
      }

      config {
        command = "/bin/bash"
        args = [
          "local/start"
        ]
      }

      template {
        destination = "local/start"
        data = <<EOH
#!/bin/bash
...
EOH
      }

      template {
        destination = "local/kafka-server.properties"
        change_mode = "noop"
        splay = "40m"
        data = <<EOH
############################# Server Basics #############################
...
EOH
      }

      resources {
        network {
          port "kafka" {
            static = ${port_kafka_client}
          }
        }
      }

      service {
        name            = "$${NOMAD_JOB_NAME}"
        tags            = ["resource_type:$${meta.ResourceType}", "version:$${NOMAD_META_KAFKA_VERSION}", "release_watermark:${release_watermark}"]
        address_mode    = "host"
        port            = "kafka"

        check {
          name      = "kafka port check"
          type      = "tcp"
          port      = "kafka"
          interval  = "15s"
          timeout   = "5s"
        } # end of check

        check {
          name      = "zk broker registration check"
          type      = "script"
          command   = "/bin/bash"
          args      = [
            "-c",
            "docker run --rm -e KAFKA_BROKER_IP=$${attr.unique.network.ip-address} -e ZK_SVC_NAME=$${NOMAD_META_ZK_SVC_NAME} -e ZK_KAFKA_ROOT=$${NOMAD_META_ZK_KAFKA_ROOT} --network host docker.amz.relateiq.com/ops/kzcheck:86d98c8"
          ]
          interval  = "30s"
          # for frankfurt pulling the hc docker image original 5s timeout could
          # be too short, increased timeout to 15s
          timeout   = "15s"
        }

      } # end of service
    } # end of task

  } # end of group

  migrate {
    max_parallel         = 1
    health_check         = "checks"
    min_healthy_time     = "10s"
    healthy_deadline     = "5m"
  }

}

Never mind, it's not a real problem. I forgot that we have a custom script that drains the node whenever we stop the Nomad agent.
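
For anyone who runs into the same thing: our wrapper does roughly this before stopping the agent (a simplified sketch; the drain deadline and service unit name are illustrative):

#!/bin/bash
set -euo pipefail

# Drain the node first, so Nomad stops/migrates its allocations in a controlled way...
nomad node drain -self -enable -deadline 10m -yes

# ...and only then stop the agent itself.
systemctl stop nomad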

Sounds like you got it figured out @wfeng-fsde! Just to follow up, the Nomad client does not terminate the tasks it is managing when it shuts down, unless it is being run with the -dev or -dev-connect flags.
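
In other words (a rough illustration; the config path is just an example):

# Dev mode (-dev): tasks are stopped when the agent exits.
nomad agent -dev

# Normal client config: tasks keep running when the agent shuts down, and the
# client reattaches to them when it starts again.
nomad agent -config /etc/nomad.d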
