Hello,
I’m trying to set up on-demand batch cluster autoscaling.
These are the log messages that I’m encountering on startup:
2021-12-06T23:02:25.975Z [DEBUG] policy_eval.broker: dequeue eval: queue=horizontal
2021-12-06T23:02:25.975Z [DEBUG] policy_eval.broker: waiting for eval: queue=horizontal
2021-12-06T23:02:25.977Z [INFO] file_policy_source: starting file policy monitor: file=/local/policies/policy.hcl name=cluster_policy policy_id=3429601c-3e58-97ca-f1b9-8235f770fe52
2021-12-06T23:02:35.978Z [WARN] policy_manager.policy_handler: failed to get target status: policy_id=3429601c-3e58-97ca-f1b9-8235f770fe52 error="failed to run Nomad node readiness check: node pool identification method required"
2021-12-06T23:02:55.981Z [ERROR] policy_manager: failed to call the Nomad list policies API: Unexpected response code: 504 (upstream request timeout)
2021-12-06T23:03:05.978Z [WARN] policy_manager.policy_handler: failed to get target status: policy_id=3429601c-3e58-97ca-f1b9-8235f770fe52 error="failed to run Nomad node readiness check: node pool identification method required"
2021-12-06T23:03:25.939Z [DEBUG] internal_plugin.nomad-target: triggering run of handler garbage collection
2021-12-06T23:03:35.967Z [ERROR] policy_manager: failed to call the Nomad list policies API: Unexpected response code: 408 (stream timeout)
What might these errors mean?
We have ACL and Consul Service Mesh enabled in our cluster.
config.hcl
# Nomad Autoscaler agent configuration.
log_level = "DEBUG"

# HTTP endpoint of the autoscaler agent; listens on all interfaces.
http {
  bind_address = "0.0.0.0"
}

# Directory the agent scans for file-based scaling policies.
# The {{ env ... }} expression is rendered before the agent reads this file.
policy {
  dir = "{{ env "NOMAD_TASK_DIR" }}/policies"
}

# Connection to the Nomad API.
# NOTE(review): the 504 "upstream request timeout" / 408 "stream timeout"
# responses on the list policies API look like a proxy in front of this
# address cutting off long-lived requests -- confirm the proxy timeout.
nomad {
  address   = "${var.nomad_address}"
  namespace = "batch"
  token     = "redacted"
}

# Metrics source queried by the policy checks.
apm "prometheus" {
  driver = "prometheus"
  config = {
    address = "http://localhost:9090"
  }
}

# Target plugin for Google Compute Engine managed instance groups.
target "gce-mig" {
  driver = "gce-mig"
  config = {
    # Service account key file; presumably relative to the task directory -- confirm.
    credentials = "secrets/sa.json"
  }
}

# Strategy plugin referenced by the policy check.
strategy "pass-through" {
  driver = "pass-through"
}
policy.hcl
# Cluster scaling policy for the batch managed instance group.
scaling "cluster_policy" {
  enabled = true
  min     = 1
  max     = 3

  policy {
    cooldown            = "1m"
    evaluation_interval = "10s"

    # Sums queued and running dispatch jobs; falls back to 0 when the
    # query returns no series so the check always yields a value.
    check "batch_jobs_in_progess" {
      source = "prometheus"
      query  = "sum(nomad_nomad_job_summary_queued{exported_job=~\".*dispatch.*\"} + nomad_nomad_job_summary_running{exported_job=~\".*dispatch.*\"}) OR on() vector(0)"

      strategy "pass-through" {}
    }

    target "gce-mig" {
      project  = "${var.project}"
      region   = "${var.region}"
      mig_name = "${var.mig_name}"

      # Required: tells the autoscaler which Nomad clients belong to this
      # instance group. Without `node_class` (or `datacenter`) the target
      # status check fails with
      # "node pool identification method required".
      # NOTE(review): "batch" is a placeholder -- set it to the `node_class`
      # configured on the Nomad clients running in this MIG.
      node_class = "batch"

      node_drain_deadline    = "30m"
      node_selector_strategy = "empty_ignore_system"
    }
  }
}
Nomad ACL policy:
# ACL policy attached to the token used by the autoscaler.

# Grants the "scale" policy within the "batch" namespace.
namespace "batch" {
  policy = "scale"
}

# Write access to nodes; presumably needed so the autoscaler can drain
# clients before removing instances -- confirm against the autoscaler docs.
node {
  policy = "write"
}
Thanks!