Hello,
I'm trying to deploy a dev cluster to learn Nomad… but… no cluster leader.
In the debug logs, I have this:
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.595+0100 [DEBUG] http: UI is enabled
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.595+0100 [WARN] http: enable_debug is set to true. This is insecure and should not be enabled in production
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.600+0100 [DEBUG] nomad: memberlist: Stream connection from=10.0.0.20:55068
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.604+0100 [ERROR] nomad: memberlist: failed to receive: No installed keys could decrypt the message from=10.0.0.20:55068
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.604+0100 [INFO] agent.joiner: starting retry join: servers="10.0.0.10 10.0.0.20 10.0.0.30"
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.604+0100 [DEBUG] nomad: memberlist: Initiating push/pull sync with: 10.0.0.10:4648
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.604+0100 [DEBUG] nomad: memberlist: Stream connection from=10.0.0.10:43938
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.605+0100 [INFO] agent.joiner: starting retry join: servers="10.0.0.10 10.0.0.20 10.0.0.30"
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.606+0100 [DEBUG] nomad: memberlist: Initiating push/pull sync with: 10.0.0.20:4648
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.608+0100 [DEBUG] client.server_mgr: new server list: new_servers=[10.0.0.10:4647, 10.0.0.20:4647, 10.0.0.30:4647] old_servers=[10.0.0.30:4647, 10.0.0.10:4647, 10.0.>
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.608+0100 [INFO] agent.joiner: retry join completed: initial_servers=3 agent_mode=client
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.608+0100 [DEBUG] nomad: memberlist: Failed to join 10.0.0.20:4648: No installed keys could decrypt the message
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.610+0100 [DEBUG] nomad: memberlist: Stream connection from=10.0.0.30:60072
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.610+0100 [DEBUG] nomad: memberlist: Initiating push/pull sync with: 10.0.0.30:4648
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.610+0100 [ERROR] nomad: memberlist: failed to receive: No installed keys could decrypt the message from=10.0.0.30:60072
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.612+0100 [DEBUG] nomad: memberlist: Failed to join 10.0.0.30:4648: No installed keys could decrypt the message
Dec 30 21:13:26 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:26.612+0100 [INFO] agent.joiner: retry join completed: initial_servers=1 agent_mode=server
Dec 30 21:13:29 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:29.451+0100 [DEBUG] http: request complete: method=GET path=/v1/agent/health?type=client duration="221.361µs"
Dec 30 21:13:30 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:30.954+0100 [DEBUG] nomad: memberlist: Stream connection from=10.0.0.10:43946
Dec 30 21:13:31 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:31.620+0100 [ERROR] worker: failed to dequeue evaluation: worker_id=28f5b85b-adeb-75ae-eb0d-cc4bca1f833b error="No cluster leader"
Dec 30 21:13:31 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:31.650+0100 [ERROR] worker: failed to dequeue evaluation: worker_id=392ecfd6-ad24-4190-c231-878bbd2e0e1e error="No cluster leader"
Dec 30 21:13:31 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:31.877+0100 [ERROR] client.rpc: error performing RPC to server: error="rpc error: No cluster leader" rpc=Node.Register server=10.0.0.30:4647
Dec 30 21:13:31 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:31.877+0100 [ERROR] client.rpc: error performing RPC to server, deadline exceeded, cannot retry: error="rpc error: No cluster leader" rpc=Node.Register
Dec 30 21:13:31 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:31.877+0100 [ERROR] client: error registering: error="rpc error: No cluster leader"
Dec 30 21:13:36 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:36.820+0100 [ERROR] worker: failed to dequeue evaluation: worker_id=392ecfd6-ad24-4190-c231-878bbd2e0e1e error="No cluster leader"
Dec 30 21:13:36 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:36.851+0100 [ERROR] worker: failed to dequeue evaluation: worker_id=28f5b85b-adeb-75ae-eb0d-cc4bca1f833b error="No cluster leader"
Dec 30 21:13:37 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:37.794+0100 [ERROR] http: request failed: method=GET path=/v1/agent/health?type=server error="{\"server\":{\"ok\":false,\"message\":\"No cluster leader\"}}" code=500
Dec 30 21:13:37 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:37.795+0100 [DEBUG] http: request complete: method=GET path=/v1/agent/health?type=server duration=5.149705577s
Dec 30 21:13:39 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:39.467+0100 [DEBUG] http: request complete: method=GET path=/v1/agent/health?type=client duration="88.619µs"
Dec 30 21:13:40 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:40.958+0100 [DEBUG] nomad: memberlist: Stream connection from=10.0.0.10:55928
Dec 30 21:13:42 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:42.103+0100 [ERROR] worker: failed to dequeue evaluation: worker_id=28f5b85b-adeb-75ae-eb0d-cc4bca1f833b error="No cluster leader"
Dec 30 21:13:42 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:42.174+0100 [ERROR] worker: failed to dequeue evaluation: worker_id=392ecfd6-ad24-4190-c231-878bbd2e0e1e error="No cluster leader"
Dec 30 21:13:47 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:47.539+0100 [ERROR] worker: failed to dequeue evaluation: worker_id=392ecfd6-ad24-4190-c231-878bbd2e0e1e error="No cluster leader"
Dec 30 21:13:47 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:47.659+0100 [ERROR] worker: failed to dequeue evaluation: worker_id=28f5b85b-adeb-75ae-eb0d-cc4bca1f833b error="No cluster leader"
Dec 30 21:13:49 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:49.495+0100 [DEBUG] http: request complete: method=GET path=/v1/agent/health?type=client duration="267.733µs"
Dec 30 21:13:50 dev-cluster-node01 nomad[36964]: 2023-12-30T21:13:50.961+0100 [DEBUG] nomad: memberlist: Stream connection from=10.0.0.10:37046
Strange thing: “No installed keys could decrypt the message”?!
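If I read that right, serf is rejecting the gossip traffic because the encryption key differs between nodes. Here is how I compare the keys (a sketch: `nomad operator gossip keyring` needs Nomad 1.4+ and a management token when ACLs are on, and /etc/nomad.d is just where my files live):

# List the keys installed in this server's gossip keyring
nomad operator gossip keyring list

# Compare the encrypt value actually loaded on each node
grep encrypt /etc/nomad.d/server.hcl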
Another strange thing: in Consul, the services do not deregister. This is the Consul state when Nomad is stopped:
These are my Nomad config files:
nomad.hcl:
name = "dev-cluster-node01"
region = "global"
datacenter = "dc1"
disable_anonymous_signature = false
disable_update_check = false
data_dir = "/opt/nomad"

addresses {
  http = "10.0.0.10"
  rpc  = "10.0.0.10"
  serf = "10.0.0.10"
}

advertise {
  http = "10.0.0.10"
  rpc  = "10.0.0.10"
  serf = "10.0.0.10"
}

ports {
  http = 4646
  rpc  = 4647
  serf = 4648
}

enable_debug = true
log_file = "/var/log/nomad/nomad.log"
log_level = "DEBUG"
log_rotate_bytes = 0
log_rotate_duration = "24h"
log_rotate_max_files = 0
leave_on_terminate = true
leave_on_interrupt = true

acl {
  enabled = true
  token_ttl = "30s"
  policy_ttl = "30s"
  replication_token = ""
}

telemetry {
  disable_hostname = false
  collection_interval = "5s"
  use_node_name = false
  publish_allocation_metrics = true
  publish_node_metrics = true
  filter_default = true
  prefix_filter = []
  disable_dispatched_job_summary_metrics = false
  statsite_address = ""
  statsd_address = ""
  datadog_address = ""
  datadog_tags = []
  prometheus_metrics = true
  circonus_api_token = ""
  circonus_api_app = "nomad"
  circonus_api_url = "https://api.circonus.com/v2"
  circonus_submission_interval = "10s"
  circonus_submission_url = ""
  circonus_check_id = ""
  circonus_check_force_metric_activation = false
  circonus_check_instance_id = ""
  circonus_check_search_tag = ""
  circonus_check_display_name = ""
  circonus_check_tags = ""
  circonus_broker_id = ""
  circonus_broker_select_tag = ""
}

autopilot {
  cleanup_dead_servers = true
  last_contact_threshold = "200ms"
  max_trailing_logs = 250
  server_stabilization_time = "10s"
}

ui {
  enabled = true

  content_security_policy {
    connect_src     = ["*"]
    default_src     = ["'none'"]
    form_action     = ["'none'"]
    frame_ancestors = ["'none'"]
    img_src         = ["'self'", "data:"]
    script_src      = ["'self'"]
    style_src       = ["'self'", "'unsafe-inline'"]
  }

  consul {
    ui_url = "https://127.0.0.1:8501/ui"
  }

  vault {
    ui_url = "https://127.0.0.1:8200/ui"
  }

  label {
    text = "dev-cluster-node01"
    background_color = "blue"
    text_color = "white"
  }
}
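Side note: since TLS and ACLs are both enabled, I talk to the agents over HTTPS. A sketch of the health probe seen in the logs above, with the CA path from tls.hcl further down and a placeholder token:

# Probe the local agent's server health endpoint over HTTPS
curl --cacert /etc/ssl/hashistack/hashistack-ca.pem \
     -H "X-Nomad-Token: <management-token>" \
     "https://10.0.0.10:4646/v1/agent/health?type=server"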
server.hcl:
server {
  enabled = true
  bootstrap_expect = 3

  server_join {
    retry_join = ["10.0.0.10", "10.0.0.20", "10.0.0.30"]
    retry_max = 3
    retry_interval = "15s"
  }

  data_dir = "/opt/nomad/server"
  rejoin_after_leave = true
  enabled_schedulers = ["service", "batch", "system"]
  # num_schedulers = 2
  heartbeat_grace = "10s"
  min_heartbeat_ttl = "10s"
  failover_heartbeat_ttl = "5m"
  max_heartbeats_per_second = 50.0
  event_buffer_size = 100
  node_gc_threshold = "24h"
  eval_gc_threshold = "1h"
  job_gc_threshold = "4h"
  deployment_gc_threshold = "1h"
  encrypt = "rtGgbEyFQtljvvmeyfMWF89WT/xo1o/RfVNWEQuEh6Q="
  raft_protocol = 3
}
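As far as I understand, the encrypt value must be byte-for-byte identical on all three servers: the key is generated once and pasted into every config, e.g.:

# Generate one gossip encryption key, then reuse the same value on every server
nomad operator gossip keyring generate

And once a leader exists, nomad operator raft list-peers should show all three servers as voters.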
client.hcl:
client {
  enabled = true
  servers = ["10.0.0.10", "10.0.0.20", "10.0.0.30"]

  server_join {
    retry_join = ["10.0.0.10", "10.0.0.20", "10.0.0.30"]
    retry_max = 3
    retry_interval = "15s"
  }

  state_dir = "/opt/nomad/client"
  node_class = "dev-cluster-node01-client"
  node_pool = "dev_cluster"
  no_host_uuid = false

  host_network "public" {
    interface = "enp0s1"
  }

  host_network "cluster" {
    interface = "br0"
  }

  host_network "private" {
    interface = "br1"
  }

  drain_on_shutdown {
    deadline = "1m"
    force = true
    ignore_system_jobs = true
  }

  max_kill_timeout = "30s"
  cpu_total_compute = 0
  memory_total_mb = 0
  disk_total_mb = 0
  disk_free_mb = 0
  gc_interval = "1m"
  gc_disk_usage_threshold = 80
  gc_inode_usage_threshold = 70
  gc_parallel_destroys = 2

  reserved {
    cpu = 0
    memory = 0
    disk = 0
  }
}
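(The host_network interfaces above must exist on the node; I double-check them with plain iproute2:)

# Confirm enp0s1, br0 and br1 exist and carry addresses
ip -brief addr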
tls.hcl:
tls {
  http = true
  rpc = true
  ca_file = "/etc/ssl/hashistack/hashistack-ca.pem"
  cert_file = "/etc/ssl/hashistack/dc1-server-nomad.pem"
  key_file = "/etc/ssl/hashistack/dc1-server-nomad.key"
  rpc_upgrade_mode = false
  verify_server_hostname = true
  verify_https_client = false
}
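Since verify_server_hostname is enabled, the server certificate must carry the server.global.nomad name (region is "global" in nomad.hcl). A quick SAN check (assuming OpenSSL 1.1.1+ for the -ext flag):

# Inspect the SANs of the Nomad server certificate
openssl x509 -in /etc/ssl/hashistack/dc1-server-nomad.pem -noout -ext subjectAltName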
and consul.hcl:
consul {
  address = "127.0.0.1:8501"
  grpc_address = "127.0.0.1:8503"
  ssl = true
  grpc_ca_file = "/etc/ssl/hashistack/hashistack-ca.pem"
  ca_file = "/etc/ssl/hashistack/hashistack-ca.pem"
  cert_file = "/etc/ssl/hashistack/dc1-server-consul.pem"
  key_file = "/etc/ssl/hashistack/dc1-server-consul.key"
  token = "1f2aba75-a9d5-03e1-70c0-5173dd2dedd8"
  tags = []
  auto_advertise = true
  server_auto_join = true
  server_service_name = "nomad-servers"

  service_identity {
    aud = ["consul.io"]
    ttl = "1h"
  }

  task_identity {
    aud = ["consul.io"]
    ttl = "1h"
  }

  client_auto_join = true
  client_service_name = "nomad-clients"
  service_auth_method = "nomad-workloads"
  task_auth_method = "nomad-workloads"
}
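To rule out the Consul side, I probe its HTTPS API directly (a sketch; /v1/status/leader is Consul's standard leader endpoint, the token is the one from the config above):

# Check that the local Consul agent answers over TLS and knows a leader
curl --cacert /etc/ssl/hashistack/hashistack-ca.pem \
     -H "X-Consul-Token: 1f2aba75-a9d5-03e1-70c0-5173dd2dedd8" \
     https://127.0.0.1:8501/v1/status/leader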
The other nodes have similar configurations, except for:
addresses {
  http = "10.0.0.xx"
  rpc  = "10.0.0.xx"
  serf = "10.0.0.xx"
}

advertise {
  http = "10.0.0.xx"
  rpc  = "10.0.0.xx"
  serf = "10.0.0.xx"
}
with each node using its own IP in those blocks.
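To see whether the three servers can gossip at all, I check the member list from each node (sketch; -address, -ca-cert and -token are Nomad's standard CLI flags, the token is a placeholder):

# Serf membership as seen from node01
nomad server members \
  -address=https://10.0.0.10:4646 \
  -ca-cert=/etc/ssl/hashistack/hashistack-ca.pem \
  -token=<management-token>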
When I started the cluster, in the Consul UI:
For the Nomad clients, everything works well (apart from the “deregister” issue, left over from when I tried binding to 0.0.0.0 earlier).
For the Nomad servers, the HTTP check is not working, while the RPC and SERF checks succeed.
I don’t know where my error is.