I have a job that consumes a lot of memory and apparently gets OOM-killed when the host is running very low on memory (see the allocation details below: the task terminates with exit code 137). When this happens, Nomad reports the allocation as complete and spawns another allocation to rerun the job.
In this situation, I would expect the job to be reported as failed and to stop completely rather than being rerun. Is the current behavior expected, or is there a configuration option to disable it?
Allocation details:
{
"ID": "a749b520-7fdb-7a33-cca5-402dc04171e7",
"Namespace": "default",
"EvalID": "35f41251-4377-a243-2560-5f122b6f75c2",
"Name": "dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920.dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920[0]",
"NodeID": "23417318-7f8a-7fcf-e54e-962749654fa9",
"NodeName": "ny5-dtldata05",
"JobID": "dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920",
"Job": {
"Stop": false,
"Region": "global",
"Namespace": "default",
"ID": "dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920",
"ParentID": "",
"Name": "dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920",
"Type": "batch",
"Priority": 50,
"AllAtOnce": false,
"Datacenters": [
"*"
],
"NodePool": "default",
"Constraints": [
{
"LTarget": "${attr.unique.hostname}",
"RTarget": "...",
"Operand": "set_contains_any"
}
],
"Affinities": null,
"Spreads": null,
"TaskGroups": [
{
"Name": "dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920",
"Count": 1,
"Update": null,
"Migrate": null,
"Constraints": null,
"Scaling": null,
"RestartPolicy": {
"Attempts": 3,
"Interval": 86400000000000,
"Delay": 15000000000,
"Mode": "fail",
"RenderTemplates": false
},
"Tasks": [
{
"Name": "dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920",
"Driver": "docker",
"User": "10077:20000",
"Config": {
"command": "/bin/bash",
"args": [...
],
"image": "...",
"force_pull": false,
"image_pull_timeout": "30m",
"auth": {
"username": "...",
"password": "..."
},
"labels": [
{
"task_id": 4592,
"job_id": 10169,
"run_id": "dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920"
}
],
"logging": null,
"mounts": [...
],
"volumes": [
"local/.netrc:${NOMAD_TASK_DIR}/.netrc"
],
"pids_limit": "${meta.docker_pids_limit}"
},
"Env": {
"NETRC": "${NOMAD_TASK_DIR}/.netrc",
"QUIRRELL_DATE": "20200604"
},
"Services": null,
"Vault": null,
"Consul": null,
"Templates": [
{
"SourcePath": "",
"DestPath": "local/.netrc",
"EmbeddedTmpl": "...",
"ChangeMode": "restart",
"ChangeSignal": "",
"ChangeScript": null,
"Splay": 5000000000,
"Perms": "0600",
"Uid": 10077,
"Gid": 20000,
"LeftDelim": "{{",
"RightDelim": "}}",
"Envvars": false,
"VaultGrace": 0,
"Wait": null,
"ErrMissingKey": false
}
],
"Constraints": null,
"Affinities": null,
"Resources": {
"CPU": 400,
"Cores": 0,
"MemoryMB": 10240,
"MemoryMaxMB": 1000000,
"DiskMB": 0,
"IOPS": 0,
"Networks": null,
"Devices": null,
"NUMA": null
},
"RestartPolicy": {
"Attempts": 3,
"Interval": 86400000000000,
"Delay": 15000000000,
"Mode": "fail",
"RenderTemplates": false
},
"DispatchPayload": null,
"Lifecycle": null,
"Meta": null,
"KillTimeout": 5000000000,
"LogConfig": {
"MaxFiles": 10,
"MaxFileSizeMB": 10,
"Disabled": false
},
"Artifacts": null,
"Leader": false,
"ShutdownDelay": 0,
"VolumeMounts": null,
"ScalingPolicies": null,
"KillSignal": "",
"Kind": "",
"CSIPluginConfig": null,
"Identity": {
"Name": "default",
"Audience": [
"nomadproject.io"
],
"ChangeMode": "",
"ChangeSignal": "",
"Env": false,
"File": false,
"ServiceName": "",
"TTL": 0
},
"Identities": null,
"Actions": null
}
],
"EphemeralDisk": {
"Sticky": false,
"SizeMB": 300,
"Migrate": false
},
"Meta": null,
"ReschedulePolicy": {
"Attempts": 0,
"Interval": 86400000000000,
"Delay": 5000000000,
"DelayFunction": "constant",
"MaxDelay": 0,
"Unlimited": false
},
"Affinities": null,
"Spreads": null,
"Networks": null,
"Consul": {
"Namespace": "",
"Cluster": "default",
"Partition": ""
},
"Services": null,
"Volumes": null,
"ShutdownDelay": null,
"StopAfterClientDisconnect": null,
"MaxClientDisconnect": null,
"PreventRescheduleOnLost": false
}
],
"Update": {
"Stagger": 0,
"MaxParallel": 0,
"HealthCheck": "",
"MinHealthyTime": 0,
"HealthyDeadline": 0,
"ProgressDeadline": 0,
"AutoRevert": false,
"AutoPromote": false,
"Canary": 0
},
"Multiregion": null,
"Periodic": null,
"ParameterizedJob": null,
"Dispatched": false,
"DispatchIdempotencyToken": "",
"Payload": null,
"Meta": {
"version": "1",
"__force_re-evaluation": "57379b5e-4156-4e78-8a18-fa14ff72cb5c"
},
"ConsulToken": "",
"ConsulNamespace": "",
"VaultToken": "",
"VaultNamespace": "",
"NomadTokenID": "",
"Status": "pending",
"StatusDescription": "",
"Stable": false,
"Version": 0,
"SubmitTime": 1713340172777083600,
"CreateIndex": 4199282,
"ModifyIndex": 4199282,
"JobModifyIndex": 4199282
},
"TaskGroup": "dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920",
"Resources": {
"CPU": 400,
"Cores": 0,
"MemoryMB": 10240,
"MemoryMaxMB": 1000000,
"DiskMB": 300,
"IOPS": 0,
"Networks": null,
"Devices": null,
"NUMA": null
},
"SharedResources": {
"CPU": 0,
"Cores": 0,
"MemoryMB": 0,
"MemoryMaxMB": 0,
"DiskMB": 300,
"IOPS": 0,
"Networks": null,
"Devices": null,
"NUMA": null
},
"TaskResources": {
"dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920": {
"CPU": 400,
"Cores": 0,
"MemoryMB": 10240,
"MemoryMaxMB": 1000000,
"DiskMB": 0,
"IOPS": 0,
"Networks": null,
"Devices": null,
"NUMA": null
}
},
"AllocatedResources": {
"Tasks": {
"dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920": {
"Cpu": {
"CpuShares": 400,
"ReservedCores": null
},
"Memory": {
"MemoryMB": 10240,
"MemoryMaxMB": 1000000
},
"Networks": null,
"Devices": null
}
},
"TaskLifecycles": {
"dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920": null
},
"Shared": {
"Networks": null,
"DiskMB": 300,
"Ports": null
}
},
"Metrics": {
"NodesEvaluated": 6,
"NodesFiltered": 5,
"NodesInPool": 6,
"NodesAvailable": {
"ny5": 6
},
"ClassFiltered": null,
"ConstraintFiltered": {
"${attr.unique.hostname} set_contains_any ny5-dtldata05": 5
},
"NodesExhausted": 0,
"ClassExhausted": null,
"DimensionExhausted": null,
"QuotaExhausted": null,
"ResourcesExhausted": null,
"Scores": null,
"ScoreMetaData": [
{
"NodeID": "23417318-7f8a-7fcf-e54e-962749654fa9",
"Scores": {
"node-affinity": 0,
"binpack": 0.060045159433777764,
"job-anti-affinity": 0,
"node-reschedule-penalty": 0
},
"NormScore": 0.060045159433777764
}
],
"AllocationTime": 1072662,
"CoalescedFailures": 0
},
"DesiredStatus": "stop",
"DesiredDescription": "alloc is lost since its node is down",
"ClientStatus": "complete",
"ClientDescription": "All tasks have completed",
"TaskStates": {
"dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920": {
"State": "dead",
"Failed": false,
"Restarts": 0,
"LastRestart": null,
"StartedAt": "2024-04-17T07:49:33.385520199Z",
"FinishedAt": "2024-04-17T08:26:03.473453299Z",
"Events": [
{
"Type": "Received",
"Time": 1713340172887148800,
"Message": "",
"DisplayMessage": "Task received by client",
"Details": {},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Task Setup",
"Time": 1713340172889865500,
"Message": "Building Task Directory",
"DisplayMessage": "Building Task Directory",
"Details": {
"message": "Building Task Directory"
},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Started",
"Time": 1713340173385470000,
"Message": "",
"DisplayMessage": "Task started by client",
"Details": {},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Killing",
"Time": 1713342315902477600,
"Message": "",
"DisplayMessage": "Sent interrupt. Waiting 5s before force killing",
"Details": {
"kill_timeout": "5s"
},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 5000000000,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Terminated",
"Time": 1713342363445686300,
"Message": "Docker container exited with non-zero exit code: 137",
"DisplayMessage": "Exit Code: 137, Exit Message: \"Docker container exited with non-zero exit code: 137\"",
"Details": {
"exit_code": "137",
"signal": "0",
"oom_killed": "false",
"exit_message": "Docker container exited with non-zero exit code: 137"
},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 137,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Killed",
"Time": 1713342363471816400,
"Message": "",
"DisplayMessage": "Task successfully killed",
"Details": {},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 0,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
},
{
"Type": "Killing",
"Time": 1713342602622158300,
"Message": "",
"DisplayMessage": "Sent interrupt. Waiting 5s before force killing",
"Details": {
"kill_timeout": "5s"
},
"FailsTask": false,
"RestartReason": "",
"SetupError": "",
"DriverError": "",
"ExitCode": 0,
"Signal": 0,
"KillTimeout": 5000000000,
"KillError": "",
"KillReason": "",
"StartDelay": 0,
"DownloadError": "",
"ValidationError": "",
"DiskLimit": 0,
"FailedSibling": "",
"VaultError": "",
"TaskSignalReason": "",
"TaskSignal": "",
"DriverMessage": "",
"GenericSource": ""
}
],
"TaskHandle": null
}
},
"NetworkStatus": {
"InterfaceName": "",
"Address": "",
"DNS": null
},
"SignedIdentities": {
"dagnomad-job-10169--pcap_merge-from-20200604-to-20200604--653920": "eyJhbGciOiJFZERTQSIsImtpZCI6Ijk0MjQxZjk1LTgwODUtZjgyZi01YTU0LWVhZTcxNjBiZTVkNCIsInR5cCI6IkpXVCJ9.eyJhdWQiOiJub21hZHByb2plY3QuaW8iLCJpYXQiOjE3MTMzNDAxNzIsImp0aSI6ImJjOGY3NDFmLTUyNTktMThlOC1kODUxLTFiYjU2ZjYxZmYwYyIsIm5iZiI6MTcxMzM0MDE3Miwibm9tYWRfYWxsb2NhdGlvbl9pZCI6ImE3NDliNTIwLTdmZGItN2EzMy1jY2E1LTQwMmRjMDQxNzFlNyIsIm5vbWFkX2pvYl9pZCI6ImRhZ25vbWFkLWpvYi0xMDE2OS0tcGNhcF9tZXJnZS1mcm9tLTIwMjAwNjA0LXRvLTIwMjAwNjA0LS02NTM5MjAiLCJub21hZF9uYW1lc3BhY2UiOiJkZWZhdWx0Iiwibm9tYWRfdGFzayI6ImRhZ25vbWFkLWpvYi0xMDE2OS0tcGNhcF9tZXJnZS1mcm9tLTIwMjAwNjA0LXRvLTIwMjAwNjA0LS02NTM5MjAiLCJzdWIiOiJnbG9iYWw6ZGVmYXVsdDpkYWdub21hZC1qb2ItMTAxNjktLXBjYXBfbWVyZ2UtZnJvbS0yMDIwMDYwNC10by0yMDIwMDYwNC0tNjUzOTIwOmRhZ25vbWFkLWpvYi0xMDE2OS0tcGNhcF9tZXJnZS1mcm9tLTIwMjAwNjA0LXRvLTIwMjAwNjA0LS02NTM5MjA6ZGFnbm9tYWQtam9iLTEwMTY5LS1wY2FwX21lcmdlLWZyb20tMjAyMDA2MDQtdG8tMjAyMDA2MDQtLTY1MzkyMDpkZWZhdWx0In0.4YHuE55xalxtEOzAoBOVLaTzeCMw0AgmamRO33WrkN6ial-fATsrYvo7C5x5Lk08CS-muWztl70RP3Flu2mrBw"
},
"SigningKeyID": "94241f95-8085-f82f-5a54-eae7160be5d4",
"CreateIndex": 4199283,
"ModifyIndex": 4209371,
"AllocModifyIndex": 4206496,
"CreateTime": 1713340172864965400,
"ModifyTime": 1713342602759739100
}