We have a Nomad job with both kill_signal and kill_timeout configured in the main task, which controls the process shutdown behavior when Nomad needs to kill it. I removed these configs in the hope that the process would not be killed when I shut down the Nomad agent on the host (I have a special use case that requires me to do it this way so that I can perform a controlled manual upgrade of the process). However, every time I shut down the Nomad agent, Nomad kills my process first before stopping the agent. I'm not sure which configuration in my Nomad job is controlling this behavior.
Here’s my Nomad job (with details omitted):
# Terraform-templated Nomad job spec: `${...}` placeholders are substituted by
# the templating layer before submission; `$${...}` escapes render a literal
# `${...}` for Nomad/Consul runtime interpolation.
#
# NOTE(review): nothing in a job spec controls whether tasks are stopped when
# the Nomad *agent itself* is shut down — per Nomad docs that is governed by
# agent-level config (`leave_on_terminate` / `leave_on_interrupt`) and by node
# drain behavior, not by kill_signal/kill_timeout. Confirm against the agent
# configuration on the host.
job "myjob" {
# long-running service workload (restarted on exit, registered for scheduling)
type = "service"
# region/datacenter are injected by the templating layer
region = "${nomad_region}"
datacenters = ["${nomad_datacenter}"]
# force each allocation of this job onto a distinct host
constraint {
operator = "distinct_hosts"
value = "true"
}
meta {
...
}
group "main" {
count = 3
# pin this group to hosts whose node meta `ResourceId` matches the
# template-provided resource_id
constraint {
attribute = "$${meta.ResourceId}"
operator = "=="
value = "${resource_id}"
}
# rolling updates: one allocation at a time, gated on Consul checks being
# healthy for 30s; very long deadlines, no auto-revert, no canaries
update {
max_parallel = 1
health_check = "checks"
min_healthy_time = "30s"
healthy_deadline = "24h"
progress_deadline = "25h"
auto_revert = false
canary = 0
}
# using `delay` mode to have nomad keep trying on the same node instead of
# trying to schedule the task group onto another node
#
# ref: https://www.nomadproject.io/docs/job-specification/restart.html
restart {
attempts = 10
interval = "15m"
delay = "5s"
mode = "delay"
}
# rescheduling onto other nodes is disabled (unlimited = false, attempts = 0)
reschedule {
unlimited = false
attempts = 0
interval = "1h"
delay = "30s"
delay_function = "exponential"
max_delay = "30m"
}
ephemeral_disk {
size = 2048 #MB
}
# one-shot prestart hook: runs the templated `local/start` script to
# completion before the "main" task starts (sidecar = false)
task "my-prestart" {
lifecycle {
hook = "prestart"
sidecar = false
}
driver = "raw_exec"
config {
command = "/bin/bash"
args = [
"local/start",
]
}
template {
destination = "local/start"
data = <<EOH
#!/bin/bash
set -x
...
echo 'done preparing for kafka...'
EOH
}
}
task "main" {
# NOTE(review): per Nomad docs, when a leader task exits, all other tasks
# in the group are gracefully shut down. This affects sibling-task
# shutdown ordering only; it does not by itself explain the task being
# killed on agent shutdown — confirm agent `leave_on_terminate` instead.
leader = true
driver = "raw_exec"
# kafka and zookeeper distributions fetched into the task dir at startup
artifact {
source = "..."
destination = "local/kafka"
}
artifact {
source = "..."
destination = "local/zookeeper"
}
logs {
max_files = 2
max_file_size = 500
}
env {
...
}
config {
command = "/bin/bash"
args = [
"local/start"
]
}
template {
destination = "local/start"
data = <<EOH
#!/bin/bash
...
EOH
}
# broker config is re-rendered with change_mode "noop" (no restart/signal
# on change); splay randomizes the re-render over a 40m window
template {
destination = "local/kafka-server.properties"
change_mode = "noop"
splay = "40m"
data = <<EOH
############################# Server Basics #############################
...
EOH
}
resources {
network {
# static client port supplied by the templating layer
port "kafka" {
static = ${port_kafka_client}
}
}
}
# Consul service registration named after the job, with a TCP port check
# and a script check that verifies ZK broker registration via docker
service {
name = "$${NOMAD_JOB_NAME}"
tags = ["resource_type:$${meta.ResourceType}", "version:$${NOMAD_META_KAFKA_VERSION}", "release_watermark:${release_watermark}"]
address_mode = "host"
port = "kafka"
check {
name = "kafka port check"
type = "tcp"
port = "kafka"
interval = "15s"
timeout = "5s"
} # end of check
check {
name = "zk broker registration check"
type = "script"
command = "/bin/bash"
args = [
"-c",
"docker run --rm -e KAFKA_BROKER_IP=$${attr.unique.network.ip-address} -e ZK_SVC_NAME=$${NOMAD_META_ZK_SVC_NAME} -e ZK_KAFKA_ROOT=$${NOMAD_META_ZK_KAFKA_ROOT} --network host docker.amz.relateiq.com/ops/kzcheck:86d98c8"
]
interval = "30s"
# for frankfurt pulling the hc docker image original 5s timeout could
# be too short, increased timeout to 15s
timeout = "15s"
}
} # end of service
} # end of task
} # end of group
# NOTE(review): the migrate stanza governs how allocations are moved off a
# node during `nomad node drain` — a drain will stop the task regardless of
# kill_signal/kill_timeout settings. If the agent host is drained before the
# agent is shut down, this is the path that kills the process.
migrate {
max_parallel = 1
health_check = "checks"
min_healthy_time = "10s"
healthy_deadline = "5m"
}
}