Hello, is it possible to use a single nomad-autoscaler task / container / job to manage Nomad client ASGs in multiple regions?
Autoscaler agent config:
# Nomad Autoscaler agent configuration.

# Log verbosity of the agent.
log_level = "INFO"

# Nomad API endpoint and region the agent is pointed at.
# NOTE(review): only one nomad block with a single region is configured here,
# while the target plugins below cover two AWS regions — confirm that the
# client nodes of both ASGs are visible through this one endpoint/region.
nomad {
  address = "http://nomad-server.example.com:4646"
  region  = "global"
}

# HTTP listener of the agent: all interfaces, port 8080.
http {
  bind_address = "0.0.0.0"
  bind_port    = 8080
}

# Agent telemetry, exposed in Prometheus format and retained for 6h.
telemetry {
  disable_hostname          = true
  enable_hostname_label     = true
  collection_interval       = "1s"
  prometheus_metrics        = true
  prometheus_retention_time = "6h"
}

# APM plugin: the Prometheus server that policy checks query
# (checks refer to it with source = "prometheus").
apm "prometheus" {
  driver = "prometheus"
  config = {
    address = "http://prometheus.example.com:9090"
  }
}

# Strategy plugin referenced by the policy checks.
strategy "target-value" {
  driver = "target-value"
}

# Two instances of the same aws-asg driver, one per AWS region.
# Policies select one of them by its block label.
target "aws-asg-us-west-2" {
  driver = "aws-asg"
  config = {
    aws_region = "us-west-2"
  }
}

target "aws-asg-us-east-1" {
  driver = "aws-asg"
  config = {
    aws_region = "us-east-1"
  }
}
Scaling policies:
# Cluster scaling policy for the us-west-2 client ASG, bounded to 1..10.
scaling "cluster-policy-us-west-2" {
  enabled = true
  min     = 1
  max     = 10

  policy {
    cooldown            = "1m"
    evaluation_interval = "1m"

    # Mean allocated-CPU percentage across clients whose metrics carry
    # region="us-west-2": sum of per-client percentages divided by the
    # number of clients. The strategy aims for 70.
    check "cpu_allocated_percentage" {
      source = "prometheus"
      query  = "sum(nomad_client_allocated_cpu{region=\"us-west-2\"}*100/(nomad_client_unallocated_cpu{region=\"us-west-2\"}+nomad_client_allocated_cpu{region=\"us-west-2\"}))/count(nomad_client_allocated_cpu{region=\"us-west-2\"})"

      strategy "target-value" {
        target = 70
      }
    }

    # Same calculation as the CPU check, using the memory metrics.
    check "mem_allocated_percentage" {
      source = "prometheus"
      query  = "sum(nomad_client_allocated_memory{region=\"us-west-2\"}*100/(nomad_client_unallocated_memory{region=\"us-west-2\"}+nomad_client_allocated_memory{region=\"us-west-2\"}))/count(nomad_client_allocated_memory{region=\"us-west-2\"})"

      strategy "target-value" {
        target = 70
      }
    }

    # Uses the target plugin labelled "aws-asg-us-west-2" in the agent config.
    # NOTE(review): nodes are picked by node_class and must be instances of
    # the ASG named below — confirm every node with class "us-west-2-aws"
    # actually belongs to "nomad-clients-us-west-2-888".
    target "aws-asg-us-west-2" {
      dry-run                       = "false"
      aws_asg_name                  = "nomad-clients-us-west-2-888"
      node_class                    = "us-west-2-aws"
      node_purge                    = "true"
      node_drain_deadline           = "15m"
      node_drain_ignore_system_jobs = "false"
      node_selector_strategy        = "empty_ignore_system"
    }
  }
}
# Cluster scaling policy for the us-east-1 client ASG, bounded to 1..10.
scaling "cluster-policy-us-east-1" {
  enabled = true
  min     = 1
  max     = 10

  policy {
    cooldown            = "1m"
    evaluation_interval = "1m"

    # Mean allocated-CPU percentage across clients whose metrics carry
    # region="us-east-1": sum of per-client percentages divided by the
    # number of clients. The strategy aims for 70.
    check "cpu_allocated_percentage" {
      source = "prometheus"
      query  = "sum(nomad_client_allocated_cpu{region=\"us-east-1\"}*100/(nomad_client_unallocated_cpu{region=\"us-east-1\"}+nomad_client_allocated_cpu{region=\"us-east-1\"}))/count(nomad_client_allocated_cpu{region=\"us-east-1\"})"

      strategy "target-value" {
        target = 70
      }
    }

    # Same calculation as the CPU check, using the memory metrics.
    check "mem_allocated_percentage" {
      source = "prometheus"
      query  = "sum(nomad_client_allocated_memory{region=\"us-east-1\"}*100/(nomad_client_unallocated_memory{region=\"us-east-1\"}+nomad_client_allocated_memory{region=\"us-east-1\"}))/count(nomad_client_allocated_memory{region=\"us-east-1\"})"

      strategy "target-value" {
        target = 70
      }
    }

    # Uses the target plugin labelled "aws-asg-us-east-1" in the agent config.
    # NOTE(review): nodes are picked by node_class and must be instances of
    # the ASG named below — confirm every node with class "us-east-1-aws"
    # actually belongs to "nomad-clients-us-east-1-999".
    target "aws-asg-us-east-1" {
      dry-run                       = "false"
      aws_asg_name                  = "nomad-clients-us-east-1-999"
      node_class                    = "us-east-1-aws"
      node_purge                    = "true"
      node_drain_deadline           = "15m"
      node_drain_ignore_system_jobs = "false"
      node_selector_strategy        = "empty_ignore_system"
    }
  }
}
I have already tried the config above, but have had no luck so far.
Some logs:
2021-04-13T08:25:04.234Z [ERROR] policy_eval.worker: failed to evaluate policy: eval_id=ca908cdd-b966-2539-7875-e0fa5167f2c4 eval_token=50bb1578-9101-71ed-326c-8d4ba2a8e05b id=80ebb943-0b1c-c76e-f4bb-3e1ac59002ad policy_id=1c9a16e5-597a-eb07-3d63-8e816c3a63fa queue=cluster error="failed to scale target: failed to perform scaling action: 1 selected nodes are not found within ASG"
2021-04-13T08:25:04.234Z [WARN] policy_eval.broker: eval delivery limit reached: eval_id=ca908cdd-b966-2539-7875-e0fa5167f2c4 policy_id=1c9a16e5-597a-eb07-3d63-8e816c3a63fa token=50bb1578-9101-71ed-326c-8d4ba2a8e05b count=1 limit=1
2021-04-13T08:26:00.760Z [INFO] policy_eval.worker: scaling target: id=f67e05ce-2c38-60c1-8d84-50e4060c41e0 policy_id=1c9a16e5-597a-eb07-3d63-8e816c3a63fa queue=cluster target=aws-asg-us-west-2 from=6 to=1 reason="scaling down because factor is 0.051227" meta=map[nomad_policy_id:1c9a16e5-597a-eb07-3d63-8e816c3a63fa]
2021-04-13T08:26:04.434Z [ERROR] policy_eval.worker: failed to evaluate policy: eval_id=6987504c-7631-d0d8-4b41-fee71ce97ce9 eval_token=cccdea9f-0a46-9dcd-17aa-68881ea6c553 id=f67e05ce-2c38-60c1-8d84-50e4060c41e0 policy_id=1c9a16e5-597a-eb07-3d63-8e816c3a63fa queue=cluster error="failed to scale target: failed to perform scaling action: 1 selected nodes are not found within ASG"
Is it because the ASG name differs from the launch configuration name?