Nomad task pending for a few minutes

I create a job with a constraint pinning it to a specific host in order to launch a batch job/task. But when I submit the job to the Nomad cluster, the alloc stays pending for a few minutes before it is scheduled and distributed to the client agent. This is not a one-off accident, as I can reproduce it without much effort.
As the following shows:


And after 13 minutes, the client received the alloc and was ready to launch the task:

This is another case of stuck alloc:


  1. My job file:
{
  "Affinities": null,
  "AllAtOnce": false,
  "Constraints": [
    {
      "LTarget": "${attr.unique.network.ip-address}",
      "Operand": "=",
      "RTarget": "xx.xxx.xxx.xxx"
    }
  ],
  "ConsulToken": "",
  "CreateIndex": 2567666,
  "Datacenters": [
    "chongqing"
  ],
  "Dispatched": false,
  "ID": "ScCmdJob-default-f631f9b2-779e-422a-b4a3-9982175c8092",
  "JobModifyIndex": 2567666,
  "Meta": null,
  "ModifyIndex": 2567668,
  "Multiregion": null,
  "Name": "ScCmdJob-default-f631f9b2-779e-422a-b4a3-9982175c8092",
  "Namespace": "default",
  "NomadTokenID": "71d2e1c5-63ff-3931-d575-3cbcad0d7cfb",
  "ParameterizedJob": null,
  "ParentID": "",
  "Payload": null,
  "Periodic": null,
  "Priority": 50,
  "Region": "chongqing",
  "Spreads": null,
  "Stable": false,
  "Status": "running",
  "StatusDescription": "",
  "Stop": false,
  "SubmitTime": 1645066384993949200,
  "TaskGroups": [
    {
      "Affinities": null,
      "Constraints": null,
      "Count": 1,
      "EphemeralDisk": {
        "Migrate": false,
        "SizeMB": 300,
        "Sticky": false
      },
      "Meta": null,
      "Migrate": null,
      "Name": "ScCmdJob-default-f631f9b2-779e-422a-b4a3-9982175c8092",
      "Networks": null,
      "ReschedulePolicy": {
        "Attempts": 0,
        "Delay": 5000000000,
        "DelayFunction": "constant",
        "Interval": 86400000000000,
        "MaxDelay": 0,
        "Unlimited": false
      },
      "RestartPolicy": {
        "Attempts": 0,
        "Delay": 15000000000,
        "Interval": 86400000000000,
        "Mode": "fail"
      },
      "Scaling": null,
      "Services": null,
      "ShutdownDelay": null,
      "Spreads": null,
      "StopAfterClientDisconnect": null,
      "Tasks": [
        {
          "Affinities": null,
          "Artifacts": null,
          "CSIPluginConfig": null,
          "Config": {
            "consul_token": "xxxxxxxxxxxxx",
            "local_path": "",
            "persistence": false,
            "result_path": "system/job_state/results/ScCmdJob-default-f631f9b2-779e-422a-b4a3-9982175c8092",
            "args": [
              "-c",
              "${NOMAD_TASK_DIR}/ScCmdJob-default-f631f9b2-779e-422a-b4a3-9982175c8092.sh"
            ],
            "command": "/bin/bash"
          },
          "Constraints": null,
          "DispatchPayload": null,
          "Driver": "sc_cmd",
          "Env": null,
          "KillSignal": "",
          "KillTimeout": 5000000000,
          "Kind": "",
          "Leader": false,
          "Lifecycle": null,
          "LogConfig": {
            "MaxFileSizeMB": 10,
            "MaxFiles": 10
          },
          "Meta": null,
          "Name": "ScCmdJob-default-f631f9b2-779e-422a-b4a3-9982175c8092",
          "Resources": {
            "CPU": 50,
            "Devices": null,
            "DiskMB": 0,
            "IOPS": 0,
            "MemoryMB": 100,
            "Networks": null
          },
          "RestartPolicy": {
            "Attempts": 0,
            "Delay": 15000000000,
            "Interval": 86400000000000,
            "Mode": "fail"
          },
          "Services": null,
          "ShutdownDelay": 0,
          "Templates": [
            {
              "ChangeMode": "restart",
              "ChangeSignal": "",
              "DestPath": "local/ScCmdJob-default-f631f9b2-779e-422a-b4a3-9982175c8092.sh",
              "EmbeddedTmpl": "{{ key \"system/job_state/scripts/ScCmdJob-default-f631f9b2-779e-422a-b4a3-9982175c8092\" }}",
              "Envvars": false,
              "LeftDelim": "{{",
              "Perms": "0777",
              "RightDelim": "}}",
              "SourcePath": "",
              "Splay": 5000000000,
              "VaultGrace": 0
            }
          ],
          "User": "root",
          "Vault": null,
          "VolumeMounts": null
        }
      ],
      "Update": null,
      "Volumes": null
    }
  ],
  "Type": "batch",
  "Update": {
    "AutoPromote": false,
    "AutoRevert": false,
    "Canary": 0,
    "HealthCheck": "",
    "HealthyDeadline": 0,
    "MaxParallel": 0,
    "MinHealthyTime": 0,
    "ProgressDeadline": 0,
    "Stagger": 0
  },
  "VaultToken": "",
  "Version": 0
}
  1. The log on one server is (the other two servers have no logs about the job):
96511:2022-02-17T10:53:02.880+0800 [DEBUG] nomad.job.batch_sched: reconciled current state with desired state: eval_id=accf930d-128e-f9dc-0404-f2a3007325a5 job_id=ScCmdJob-default-4a24869a-9f85-41f4-91ed-f1837cfa017d namespace=default results="Total changes: (place 1) (destructive 0) (inplace 0) (stop 0)
96512:Desired Changes for "ScCmdJob-default-4a24869a-9f85-41f4-91ed-f1837cfa017d": (place 1) (inplace 0) (destructive 0) (stop 0) (migrate 0) (ignore 0) (canary 0)"
96513:2022-02-17T10:53:02.882+0800 [DEBUG] nomad.job.batch_sched: setting eval status: eval_id=accf930d-128e-f9dc-0404-f2a3007325a5 job_id=ScCmdJob-default-4a24869a-9f85-41f4-91ed-f1837cfa017d namespace=default status=complete
96515:2022-02-17T10:53:02.892+0800 [DEBUG] worker.batch_sched: reconciled current state with desired state: eval_id=d1a9df99-3758-8ebc-aeb1-f85d9b6121d6 job_id=ScCmdJob-default-4a24869a-9f85-41f4-91ed-f1837cfa017d namespace=default results="Total changes: (place 1) (destructive 0) (inplace 0) (stop 0)
96516:Desired Changes for "ScCmdJob-default-4a24869a-9f85-41f4-91ed-f1837cfa017d": (place 1) (inplace 0) (destructive 0) (stop 0) (migrate 0) (ignore 0) (canary 0)"
96518:2022-02-17T10:53:02.896+0800 [DEBUG] worker.batch_sched: setting eval status: eval_id=d1a9df99-3758-8ebc-aeb1-f85d9b6121d6 job_id=ScCmdJob-default-4a24869a-9f85-41f4-91ed-f1837cfa017d namespace=default status=complete
96519:2022-02-17T10:53:02.898+0800 [DEBUG] worker: updated evaluation: eval="<Eval "d1a9df99-3758-8ebc-aeb1-f85d9b6121d6" JobID: "ScCmdJob-default-4a24869a-9f85-41f4-91ed-f1837cfa017d" Namespace: "default">"
96762:2022-02-17T10:55:02.416+0800 [DEBUG] http: request complete: method=GET path=/v1/job/ScCmdJob-default-4a24869a-9f85-41f4-91ed-f1837cfa017d?region=chongqing duration=295.065µs
  1. The target Nomad client does not log anything while the alloc is pending.

PS: The target Nomad client has already launched many batch jobs (i.e., one-time shell-script tasks, not long-running services), but I don’t know whether this matters.

Sorry for my poor English; I am looking forward to a reply.


There is another problem after this:
Once the alloc is launched on the target client, it still gets stuck at the “Build Task Directory” step. As the following shows, it has been stuck for nearly 10 minutes.

The client complete log of task is:

13287:2022-02-17T13:15:13.128+0800 [DEBUG] client.alloc_runner.task_runner.task_hook.logmon: starting plugin: alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 path=/data/workload_agent/nomad/nomad args=[/data/workload_agent/nomad/nomad, logmon]
13288:2022-02-17T13:15:13.128+0800 [DEBUG] client.alloc_runner.task_runner.task_hook.logmon: plugin started: alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 path=/data/workload_agent/nomad/nomad pid=29814
13289:2022-02-17T13:15:13.128+0800 [DEBUG] client.alloc_runner.task_runner.task_hook.logmon: waiting for RPC address: alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 path=/data/workload_agent/nomad/nomad
13296:2022-02-17T13:15:13.188+0800 [DEBUG] client.alloc_runner.task_runner.task_hook.logmon: using plugin: alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 version=2
13297:2022-02-17T13:15:13.188+0800 [DEBUG] client.alloc_runner.task_runner.task_hook.logmon.nomad: plugin address: alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 address=/tmp/plugin613100023 network=unix @module=logmon timestamp=2022-02-17T13:15:13.188+0800
13302:2022-02-17T13:15:13.193+0800 [INFO]  client.alloc_runner.task_runner.task_hook.logmon.nomad: opening fifo: alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 @module=logmon path=/data/workload_agent/nomad/client/data/alloc/939c81d6-9bc2-2024-91fb-0a0e7e840d8c/alloc/logs/.ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9.stdout.fifo timestamp=2022-02-17T13:15:13.193+0800
13303:2022-02-17T13:15:13.193+0800 [INFO]  client.alloc_runner.task_runner.task_hook.logmon.nomad: opening fifo: alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 @module=logmon path=/data/workload_agent/nomad/client/data/alloc/939c81d6-9bc2-2024-91fb-0a0e7e840d8c/alloc/logs/.ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9.stderr.fifo timestamp=2022-02-17T13:15:13.193+0800
13440:2022/02/17 13:24:05.824501 [DEBUG] (runner) final config: {"Consul":{"Address":"127.0.0.1:38500","Auth":{"Enabled":false,"Username":"","Password":""},"Retry":{"Attempts":12,"Backoff":250000000,"MaxBackoff":60000000000,"Enabled":true},"SSL":{"CaCert":"","CaPath":"","Cert":"","Enabled":false,"Key":"","ServerName":"","Verify":true},"Token":"base-token","Transport":{"DialKeepAlive":30000000000,"DialTimeout":30000000000,"DisableKeepAlives":false,"IdleConnTimeout":90000000000,"MaxIdleConns":100,"MaxIdleConnsPerHost":5,"TLSHandshakeTimeout":10000000000}},"Dedup":{"Enabled":false,"MaxStale":2000000000,"Prefix":"consul-template/dedup/","TTL":15000000000},"Exec":{"Command":"","Enabled":false,"Env":{"Blacklist":[],"Custom":[],"Pristine":false,"Whitelist":[]},"KillSignal":2,"KillTimeout":30000000000,"ReloadSignal":null,"Splay":0,"Timeout":0},"KillSignal":2,"LogLevel":"WARN","MaxStale":2000000000,"PidFile":"","ReloadSignal":1,"Syslog":{"Enabled":false,"Facility":"LOCAL0"},"Templates":[{"Backup":false,"Command":"","CommandTimeout":30000000000,"Contents":"{{ key \"system/job_state/scripts/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9\" 
}}","CreateDestDirs":true,"Destination":"/data/workload_agent/nomad/client/data/alloc/939c81d6-9bc2-2024-91fb-0a0e7e840d8c/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9/local/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9.sh","ErrMissingKey":false,"Exec":{"Command":"","Enabled":false,"Env":{"Blacklist":[],"Custom":[],"Pristine":false,"Whitelist":[]},"KillSignal":2,"KillTimeout":30000000000,"ReloadSignal":null,"Splay":0,"Timeout":30000000000},"Perms":511,"Source":"","Wait":{"Enabled":false,"Min":0,"Max":0},"LeftDelim":"{{","RightDelim":"}}","FunctionBlacklist":["plugin"],"SandboxPath":"/data/workload_agent/nomad/client/data/alloc/939c81d6-9bc2-2024-91fb-0a0e7e840d8c/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9"}],"Vault":{"Address":"","Enabled":false,"Namespace":"","RenewToken":false,"Retry":{"Attempts":12,"Backoff":250000000,"MaxBackoff":60000000000,"Enabled":true},"SSL":{"CaCert":"","CaPath":"","Cert":"","Enabled":true,"Key":"","ServerName":"","Verify":true},"Transport":{"DialKeepAlive":30000000000,"DialTimeout":30000000000,"DisableKeepAlives":false,"IdleConnTimeout":90000000000,"MaxIdleConns":100,"MaxIdleConnsPerHost":5,"TLSHandshakeTimeout":10000000000},"UnwrapToken":false},"Wait":{"Enabled":false,"Min":0,"Max":0},"Once":false}
13447:2022/02/17 13:24:05.824974 [DEBUG] (runner) missing dependency: kv.block(system/job_state/scripts/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9)
13448:2022/02/17 13:24:05.824984 [DEBUG] (runner) add used dependency kv.block(system/job_state/scripts/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9) to missing since isLeader but do not have a watcher
13450:2022/02/17 13:24:05.825007 [DEBUG] (watcher) adding kv.block(system/job_state/scripts/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9)
13453:2022/02/17 13:24:05.927750 [DEBUG] (runner) receiving dependency kv.block(system/job_state/scripts/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9)
13456:2022/02/17 13:24:05.928210 [DEBUG] (runner) rendering "(dynamic)" => "/data/workload_agent/nomad/client/data/alloc/939c81d6-9bc2-2024-91fb-0a0e7e840d8c/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9/local/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9.sh"
13457:2022/02/17 13:24:06.199004 [INFO] (runner) rendered "(dynamic)" => "/data/workload_agent/nomad/client/data/alloc/939c81d6-9bc2-2024-91fb-0a0e7e840d8c/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9/local/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9.sh"
13459:2022/02/17 13:24:06.199092 [DEBUG] (runner) kv.block(system/job_state/scripts/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9) is still needed
13518:2022-02-17T13:27:43.800+0800 [DEBUG] client.driver_mgr.sccmd-driver: starting task: driver=sc_cmd driver_cfg="{ConsulToken:581d3d37-d13c-3a02-a2ab-54f2cef5e822 ResultPath:system/job_state/results/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 LocalPath: SourcePath: All:false Persistence:false Command:/bin/bash Args:[-c /data/workload_agent/nomad/client/data/alloc/939c81d6-9bc2-2024-91fb-0a0e7e840d8c/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9/local/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9.sh]}" @module=sc_cmd timestamp=2022-02-17T13:27:43.799+0800
13519:2022-02-17T13:27:43.800+0800 [DEBUG] client.driver_mgr.sccmd-driver: starting plugin: driver=sc_cmd @module=sc_cmd.executor alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c args=[/data/workload_agent/nomad/drivers/sccmd-driver, executor, {"LogFile":"/data/workload_agent/nomad/client/data/alloc/939c81d6-9bc2-2024-91fb-0a0e7e840d8c/ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9/executor.out","LogLevel":"debug","FSIsolation":false}] path=/data/workload_agent/nomad/drivers/sccmd-driver task_name=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 timestamp=2022-02-17T13:27:43.800+0800
13520:2022-02-17T13:27:43.807+0800 [DEBUG] client.driver_mgr.sccmd-driver: plugin started: driver=sc_cmd pid=658 task_name=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 @module=sc_cmd.executor alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c path=/data/workload_agent/nomad/drivers/sccmd-driver timestamp=2022-02-17T13:27:43.800+0800
13521:2022-02-17T13:27:43.807+0800 [DEBUG] client.driver_mgr.sccmd-driver: waiting for RPC address: driver=sc_cmd @module=sc_cmd.executor alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c path=/data/workload_agent/nomad/drivers/sccmd-driver task_name=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 timestamp=2022-02-17T13:27:43.800+0800
13522:2022-02-17T13:27:43.815+0800 [DEBUG] client.driver_mgr.sccmd-driver: using plugin: driver=sc_cmd alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task_name=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 version=2 @module=sc_cmd.executor timestamp=2022-02-17T13:27:43.815+0800
13625:2022-02-17T13:33:33.400+0800 [WARN]  client.driver_mgr.sccmd-driver: received EOF, stopping recv loop: driver=sc_cmd @module=sc_cmd.executor.stdio alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c err="rpc error: code = Unavailable desc = transport is closing" task_name=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 timestamp=2022-02-17T13:33:33.400+0800
13626:2022-02-17T13:33:33.400+0800 [ERROR] client.driver_mgr.sccmd-driver: error receiving stream from Stats executor RPC, closing stream: driver=sc_cmd task_name=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 @module=sc_cmd alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c error="rpc error: code = Unavailable desc = transport is closing" timestamp=2022-02-17T13:33:33.400+0800
13627:2022-02-17T13:33:33.400+0800 [DEBUG] client.driver_mgr.sccmd-driver: plugin process exited: driver=sc_cmd pid=658 task_name=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 @module=sc_cmd.executor alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c path=/data/workload_agent/nomad/drivers/sccmd-driver timestamp=2022-02-17T13:33:33.400+0800
13628:2022-02-17T13:33:33.400+0800 [DEBUG] client.driver_mgr.sccmd-driver: plugin exited: driver=sc_cmd @module=sc_cmd.executor alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task_name=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 timestamp=2022-02-17T13:33:33.400+0800
13630:2022-02-17T13:33:33.400+0800 [INFO]  client.alloc_runner.task_runner: not restarting task: alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 reason="Policy allows no restarts"
13644:2022-02-17T13:33:33.768+0800 [DEBUG] http: request complete: method=GET path=/v1/client/fs/cat/939c81d6-9bc2-2024-91fb-0a0e7e840d8c?path=.%2Falloc%2Flogs%2FScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9.stdout.0&region=chongqing duration=3m47.327182306s
13646:2022-02-17T13:33:33.784+0800 [DEBUG] http: request complete: method=GET path=/v1/client/fs/cat/939c81d6-9bc2-2024-91fb-0a0e7e840d8c?path=.%2Falloc%2Flogs%2FScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9.stderr.0&region=chongqing duration=721.723µs
13649:2022-02-17T13:33:34.285+0800 [DEBUG] client.alloc_runner.task_runner.task_hook.logmon: plugin process exited: alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9 path=/data/workload_agent/nomad/nomad pid=29814
13650:2022-02-17T13:33:34.287+0800 [DEBUG] client.alloc_runner.task_runner.task_hook.logmon: plugin exited: alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9
13678:2022-02-17T13:34:06.136+0800 [DEBUG] client.alloc_runner.task_runner: task run loop exiting: alloc_id=939c81d6-9bc2-2024-91fb-0a0e7e840d8c task=ScCmdJob-default-263fbc7f-1179-4965-89c1-743a5262ffb9

I also tried restarting the CSI plugin nodes job.