Nomad Autoscaler Job:
job "autoscaler" {
datacenters = ["dc1"]
group "autoscaler" {
count = 1
network {
port "http" {}
}
task "autoscaler" {
driver = "docker"
config {
image = "hashicorp/nomad-autoscaler:0.3.3"
command = "nomad-autoscaler"
args = [
"agent",
"-config",
"${NOMAD_TASK_DIR}/config.hcl",
"-http-bind-address",
"0.0.0.0",
"-http-bind-port",
"${NOMAD_PORT_http}",
"-policy-dir",
"${NOMAD_TASK_DIR}/policies/",
]
ports = ["http"]
}
template {
data = <<EOF
nomad {
address = "http://{{env "attr.unique.network.ip-address" }}:4646"
}
apm "prometheus" {
driver = "prometheus"
config = {
address = "http://{{ range service "prometheus" }}{{ .Address }}:{{ .Port }}{{ end }}"
}
}
target "aws-asg" {
driver = "aws-asg"
config = {
aws_region = "{{ $x := env "attr.platform.aws.placement.availability-zone" }}{{ $length := len $x |subtract 1 }}{{ slice $x 0 $length}}"
}
}
strategy "target-value" {
driver = "target-value"
}
EOF
destination = "${NOMAD_TASK_DIR}/config.hcl"
}
template {
data = <<EOF
scaling "cluster_policy" {
enabled = true
min = 1
max = 2
policy {
cooldown = "2m"
evaluation_interval = "1m"
check "cpu_allocated_percentage" {
source = "prometheus"
query = "sum(nomad_client_allocated_cpu{node_class=\"hashistack\"}*100/(nomad_client_unallocated_cpu{node_class=\"hashistack\"}+nomad_client_allocated_cpu{node_class=\"hashistack\"}))/count(nomad_client_allocated_cpu{node_class=\"hashistack\"})"
strategy "target-value" {
target = 70
}
}
check "mem_allocated_percentage" {
source = "prometheus"
query = "sum(nomad_client_allocated_memory{node_class=\"hashistack\"}*100/(nomad_client_unallocated_memory{node_class=\"hashistack\"}+nomad_client_allocated_memory{node_class=\"hashistack\"}))/count(nomad_client_allocated_memory{node_class=\"hashistack\"})"
strategy "target-value" {
target = 70
}
}
target "aws-asg" {
dry-run = "false"
aws_asg_name = "hashistack-nomad_client"
node_class = "hashistack"
node_drain_deadline = "1m"
node_drain_ignore_system_jobs = "true"
node_selector_strategy = "least_busy"
}
}
}
EOF
destination = "${NOMAD_TASK_DIR}/policies/hashistack.hcl"
}
resources {
cpu = 50
memory = 128
}
service {
name = "autoscaler"
port = "http"
check {
type = "http"
path = "/v1/health"
interval = "5s"
timeout = "2s"
}
}
}
}
}
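Before looking at the workload, I sanity-check the agent and the policy query by hand. This is a rough sketch: the addresses and ports below are placeholders for my environment (Prometheus on its default 9090), not values taken from the job above.

# Agent health, the same endpoint the service check above uses
curl -s http://<autoscaler-alloc-ip>:<http-port>/v1/health

# Evaluate the policy's CPU query directly against the Prometheus HTTP API
curl -s "http://<prometheus-ip>:9090/api/v1/query" \
  --data-urlencode 'query=sum(nomad_client_allocated_cpu{node_class="hashistack"}*100/(nomad_client_unallocated_cpu{node_class="hashistack"}+nomad_client_allocated_cpu{node_class="hashistack"}))/count(nomad_client_allocated_cpu{node_class="hashistack"})'

If Prometheus returns a single value for that expression, the checks have data to evaluate against.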
I then deployed the following Nomad jobs (their combined resource requests are tallied after Job 2):
Job 1:
job "nginx1" {
datacenters = ["dc1"]
group "nginx1" {
count = 3
migrate {
max_parallel = 1
min_healthy_time = "10s"
healthy_deadline = "5m"
}
network {
port "http" {
}
}
service {
name = "nginx1"
port = "http"
}
task "nginx1" {
driver = "docker"
resources {
cpu = 60
memory = 200
}
config {
image = "nginx"
ports = ["http"]
volumes = [
"local:/etc/nginx1/conf.d",
]
}
template {
data = <<EOF
upstream backend {
{{ range service "demo-webapp" }}
server {{ .Address }}:{{ .Port }};
{{ else }}server 127.0.0.1:65535; # force a 502
{{ end }}
}
server {
listen 8080;
location / {
proxy_pass http://backend;
}
}
EOF
destination = "local/load-balancer.conf"
change_mode = "signal"
change_signal = "SIGHUP"
}
}
}
}
Job 2:
job "nginx" {
datacenters = ["dc1"]
group "nginx" {
count = 5
migrate {
max_parallel = 1
min_healthy_time = "10s"
healthy_deadline = "5m"
}
network {
port "http" {
}
}
service {
name = "nginx"
port = "http"
}
task "nginx" {
driver = "docker"
resources {
cpu = 80
memory = 80
}
config {
image = "nginx"
ports = ["http"]
volumes = [
"local:/etc/nginx/conf.d",
]
}
template {
data = <<EOF
upstream backend {
{{ range service "demo-webapp" }}
server {{ .Address }}:{{ .Port }};
{{ else }}server 127.0.0.1:65535; # force a 502
{{ end }}
}
server {
listen 8080;
location / {
proxy_pass http://backend;
}
}
EOF
destination = "local/load-balancer.conf"
change_mode = "signal"
change_signal = "SIGHUP"
}
}
}
}
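A quick tally of what these two jobs request in total (simple arithmetic from the resources blocks above):

nginx1: 3 x (60 MHz, 200 MiB) = 180 MHz CPU, 600 MiB memory
nginx:  5 x (80 MHz,  80 MiB) = 400 MHz CPU, 400 MiB memory
total:  580 MHz CPU, 1000 MiB memory, plus the autoscaler, Prometheus and Grafana allocations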
This triggered the autoscaler to deploy another AWS node, so I currently have two nodes:
Node 2:
ID = d46f063c-9f3d-d36d-6fc5-a1f72bbfeb5e
Class = hashistack
DC = dc1
Drain = false
Eligibility = eligible
Status = ready
CSI Controllers = <none>
CSI Drivers = <none>
Uptime = 1h8m17s
Host Volumes = <none>
Host Networks = <none>
CSI Volumes = <none>
Driver Status = docker,exec,java,raw_exec
Node Events
Time Subsystem Message
2023-02-14T05:55:38Z Cluster Node registered
Allocated Resources
CPU Memory Disk
120/5000 MHz 400 MiB/1.9 GiB 600 MiB/4.4 GiB
Allocation Resource Utilization
CPU Memory
0/5000 MHz 5.8 MiB/1.9 GiB
Host Resource Utilization
CPU Memory Disk
74/5000 MHz 341 MiB/1.9 GiB (/dev/root)
Allocations
ID Node ID Task Group Version Desired Status Created Modified
14adacc4 d46f063c nginx1 0 run running 1h7m ago 1h7m ago
d8901d97 d46f063c nginx1 0 run running 1h7m ago 1h7m ago
Node 1:
ID = efff8ea0-2884-09ed-f6f0-c02b03e00e0f
Class = hashistack
DC = dc1
Drain = false
Eligibility = eligible
Status = ready
CSI Controllers = <none>
CSI Drivers = <none>
Uptime = 1h39m18s
Host Volumes = <none>
Host Networks = <none>
CSI Volumes = <none>
Driver Status = docker,exec,java,raw_exec
Node Events
Time Subsystem Message
2023-02-14T04:58:08Z Cluster Node registered
Allocated Resources
CPU Memory Disk
710/5000 MHz 1.0 GiB/1.9 GiB 2.6 GiB/4.4 GiB
Allocation Resource Utilization
CPU Memory
5/5000 MHz 130 MiB/1.9 GiB
Host Resource Utilization
CPU Memory Disk
295/5000 MHz 545 MiB/1.9 GiB (/dev/root)
Allocations
ID Node ID Task Group Version Desired Status Created Modified
5f6e484e efff8ea0 nginx1 0 run running 42m55s ago 42m44s ago
4ad9b748 efff8ea0 nginx 1 stop complete 43m9s ago 29m2s ago
c6feb049 efff8ea0 nginx 1 stop complete 43m21s ago 29m3s ago
c159e3c8 efff8ea0 nginx 1 stop complete 43m34s ago 29m3s ago
dfbf2039 efff8ea0 nginx 1 stop complete 43m46s ago 29m3s ago
cbefae38 efff8ea0 nginx 1 stop complete 43m59s ago 29m2s ago
5f5679d3 efff8ea0 nginx 2 run running 44m11s ago 28m53s ago
04d3452d efff8ea0 nginx 2 run running 44m24s ago 28m53s ago
bc0de817 efff8ea0 nginx 2 run running 44m36s ago 28m53s ago
deefe146 efff8ea0 nginx 2 run running 44m49s ago 28m53s ago
38a66465 efff8ea0 nginx 2 run running 45m1s ago 28m53s ago
a6d93a04 efff8ea0 autoscaler 0 run running 1h29m ago 1h28m ago
d8a1bae2 efff8ea0 prometheus 0 run running 1h38m ago 1h37m ago
0b5fbbfa efff8ea0 grafana 0 run running 1h38m ago 1h37m ago
As per my autoscaler policy, the average CPU / memory allocation is below the 70 target, so a scale-in should happen: one node should be drained and its allocations migrated.
But it is not happening; Node 2 only goes for draining after I stop the Nomad jobs running on it.
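Working the two policy checks out by hand from the node status above (a rough approximation; I am assuming unallocated + allocated is close to the 5000 MHz and 1.9 GiB totals shown, so the percentages are not exact):

cpu_allocated_percentage ~= (710*100/5000 + 120*100/5000) / 2 = (14.2 + 2.4) / 2 ~= 8.3
mem_allocated_percentage ~= ((1.0/1.9)*100 + (0.4/1.9)*100) / 2 ~= (52.6 + 21.1) / 2 ~= 36.8

Both averages are well below the target of 70 configured in the policy. To see what the agent itself computes during each evaluation, I tail its logs from the allocation listed above:

nomad alloc logs -stderr a6d93a04 autoscaler   # drop -stderr if your agent logs to stdout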