Nomad Elasticsearch

Hello, this is my ES job file. There is no problem on the first run, but when I restart the job, the allocations are automatically replaced onto other clients and the cluster fails to form with the error below. Is there a good solution?

{"type": "server", "timestamp": "2022-04-25T10:46:29,384Z", "level": "WARN", "component": "o.e.c.c.ClusterFormationFailureHelper", "cluster.name": "default", "node.name": "master-1", "message": "master not discovered yet, this node has not previously joined a bootstrapped (v7+) cluster, and this node must discover master-eligible nodes [default-master-0.service.consul:28303, default-master-1.service.consul:28303, default-master-2.service.consul:28303] to bootstrap a cluster: have discovered [{master-1}{albbOP9xT1aGcunqmpUv9A}{WPptKxoCTqCwTa7iGvmmnw}{10.103.3.41}{10.103.3.41:28303}{dilmrt}{ml.machine_memory=4294967296, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}, {master-0}{MTOCKUwNSe2WxzHF3f0M3Q}{5SvgPiveTF6Hug7qEiItZg}{10.103.3.43}{10.103.3.43:28303}{dilmrt}{ml.machine_memory=4294967296, ml.max_open_jobs=20, xpack.installed=true, transform.node=true}, {master-2}{rw18eQbjQn69-G5P0BG76Q}{gE8UnY1sThaMkyrzPAE8dw}{10.103.3.42}{10.103.3.42:28303}{dilmrt}{ml.machine_memory=4294967296, ml.max_open_jobs=20, xpack.installed=true, transform.node=true}]; discovery will continue using [10.103.3.43:28303, 10.103.3.42:28303] from hosts providers and [{master-1}{albbOP9xT1aGcunqmpUv9A}{WPptKxoCTqCwTa7iGvmmnw}{10.103.3.41}{10.103.3.41:28303}{dilmrt}{ml.machine_memory=4294967296, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}] from last-known cluster state; node term 0, last-accepted version 0 in term 0" }
$ cat /tmp/default-master-0.hcl 
type = "csi"
id   = "master-0"
name = "master-0"
capacity_min = "100GB"
capacity_max = "100GB"

capability {
  access_mode     = "single-node-writer"
  attachment_mode = "file-system"
}
capability {
  access_mode     = "single-node-writer"
  attachment_mode = "block-device"
}
plugin_id       = "ceph-csi"
secrets {
  userID  = "admin"
  userKey = "xxxxx=="
}
parameters {
  clusterID = "xxxxx"
  pool      = "nomad"
  imageFeatures = "layering"
}
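(For context, a spec like this is normally handed to the Nomad CLI; a minimal sketch, assuming the ceph-csi plugin is healthy; nomad volume create provisions a new RBD image, while nomad volume register adopts an existing one:)

$ nomad volume create /tmp/default-master-0.hcl   # provision and register the volume
$ nomad volume status master-0                    # check schedulability and current claims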
job "master-0" {
  namespace = "default"
  datacenters = [ "dc1" ]
  type        = "service"
  update {
    max_parallel     = 1
    health_check     = "task_states"   #"checks"
    min_healthy_time = "30s"
    healthy_deadline = "5m"
    auto_revert      = false
    canary           = 0
    stagger          = "30s"
  }
  ##################################### master-0 ############################################
  group "master-0" {
    count = 1
    restart {
      attempts = 3
      delay = "30s"
      interval = "5m"
      mode = "fail"
    }
    network {
      mode = "host"
      dns {
        servers = ["10.103.1.11"]
      }
      port "request" {
        static = 29303
      }
      port "communication" {
        static = 28303
      }
    }
    volume "ceph-volume" {
      type = "csi"
      read_only = false
      source = "master-0"
      access_mode     = "single-node-writer"
      attachment_mode = "file-system"
    }
    task "elasticsearch" {
      driver = "docker"
      kill_timeout = "300s"
      kill_signal = "SIGTERM"
      volume_mount {
        volume      = "ceph-volume"
        destination = "/srv"
        read_only   = false
      }
      env {
        LOG4J_FORMAT_MSG_NO_LOOKUPS = "true"
      }
      template {
        data = <<EOF
cluster:
  name: default
  publish:
    timeout: 300s
  join:
    timeout: 300s
  initial_master_nodes:
    - default-master-0.service.consul:28303
    - default-master-1.service.consul:28303
    - default-master-2.service.consul:28303
http:
  port: 29303
transport:
  port: 28303
node:
  name: master-0
  master: true
  data: true
  ingest: true
network:
  host: 0.0.0.0
discovery:
  seed_hosts:
    - default-master-0.service.consul:28303
    - default-master-1.service.consul:28303
    - default-master-2.service.consul:28303
path:
  data:
    - /srv/data
  logs: /srv/log
EOF
        destination = "local/elasticsearch.yml"
      }
      template {
        data = <<EOF
-Xms2g
-Xmx2g
8-13:-XX:+UseConcMarkSweepGC
8-13:-XX:CMSInitiatingOccupancyFraction=75
8-13:-XX:+UseCMSInitiatingOccupancyOnly
14-:-XX:+UseG1GC
-Djava.io.tmpdir=${ES_TMPDIR}
-XX:+HeapDumpOnOutOfMemoryError
-XX:HeapDumpPath=data
-XX:ErrorFile=logs/hs_err_pid%p.log
8:-XX:+PrintGCDetails
8:-XX:+PrintGCDateStamps
8:-XX:+PrintTenuringDistribution
8:-XX:+PrintGCApplicationStoppedTime
8:-Xloggc:logs/gc.log
8:-XX:+UseGCLogFileRotation
8:-XX:NumberOfGCLogFiles=32
8:-XX:GCLogFileSize=64m
9-:-Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,pid,tags:filecount=32,filesize=64m
EOF
        destination = "local/jvm.options"
      }
      config {
        image = "elasticsearch:7.8.1"
        hostname = "master-0"
        force_pull = false
        volumes = [
          "/data/10/default:/usr/share/elasticsearch/data",
          "./local/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml",
          "./local/jvm.options:/usr/share/elasticsearch/config/jvm.options"
        ]
        command = "bin/elasticsearch"
        args = [
          "-Enetwork.publish_host=${NOMAD_IP_request}",
          "-Ehttp.publish_port=${NOMAD_HOST_PORT_request}",
          "-Ehttp.port=${NOMAD_PORT_request}",
          "-Etransport.publish_port=${NOMAD_HOST_PORT_communication}",
          "-Etransport.tcp.port=${NOMAD_PORT_communication}"
        ]
        ports = [
          "request",
          "communication"
        ]
        ulimit {
          memlock = "-1"
          nofile = "65536"
          nproc = "65536"
        }
      }
      resources {
        cpu = 3200
        memory = 4096
      }
      service {
        name = "default"
        port = "request"
        check {
          name = "rest-tcp"
          type = "tcp"
          interval = "10s"
          timeout = "2s"
        }
      }
      service {
        name = "default-master-0"
        port = "communication"
        check {
          type = "tcp"
          interval = "10s"
          timeout = "2s"
        }
      }
    }
  }
}

I tested with the allocation back on the same client and still had the same problem, so I suspect ceph-csi.
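(A quick way to test that suspicion, assuming the alloc ID is at hand: if /srv/data really survived the restart, the node would remember its cluster state instead of logging "node term 0, last-accepted version 0".)

$ nomad volume status master-0                    # is the volume still claimed after the restart?
$ nomad alloc exec <alloc-id> ls /srv/data/nodes  # does the ES node-state directory persist?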

Hey @x602

I’m not too familiar with setting up an Elasticsearch cluster, so correct me if I’m wrong, but have you ruled out your ES node name formats as the issue here? From the Elasticsearch docs:

The node names used in the cluster.initial_master_nodes list must exactly match the node.name properties of the nodes. By default the node name is set to the machine’s hostname which may or may not be fully-qualified depending on your system configuration. If each node name is a fully-qualified domain name such as master-a.example.com then you must use fully-qualified domain names in the cluster.initial_master_nodes list too; conversely if your node names are bare hostnames (without the .example.com suffix) then you must use bare hostnames in the cluster.initial_master_nodes list. If you use a mix of fully-qualified and bare hostnames, or there is some other mismatch between node.name and cluster.initial_master_nodes, then the cluster will not form successfully and you will see log messages like the following.

[master-a.example.com] master not discovered yet, this node has not previously joined a bootstrapped (v7+) cluster, and this node must discover master-eligible nodes [master-a, master-b] to bootstrap a cluster: have discovered [{master-b.example.com}{...

This message shows the node names master-a.example.com and master-b.example.com as well as the cluster.initial_master_nodes entries master-a and master-b, and it is clear from this message that they do not match exactly.

It seems like at some point after setup, default-master-0.service.consul:28303 is being read as {master-1}{albbOP9xT1aGcunqmpUv9A}{WPptKxoCTqCwTa7iGvmmnw}{10.103.3.41}{10.103.3.41:28303}{dilmrt}{ml.machine_memory=4294967296, xpack.installed=true, transform.node=true, ml.max_open_jobs=20}.
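If that mismatch is the cause, a minimal sketch of the fix (untested on my side) is to make the two lists use the same form. If I read the docs right, cluster.initial_master_nodes takes node names rather than host:port addresses (addresses belong in discovery.seed_hosts), so in each node’s elasticsearch.yml, something like:

cluster:
  initial_master_nodes:
    - master-0
    - master-1
    - master-2
node:
  name: master-0

Alternatively, set node.name to the full default-master-0.service.consul form and use those same names (without the :28303) in cluster.initial_master_nodes.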

Let me know if that’s intended, and if it is I’ll be happy to try and see if it’s a Nomad issue 🙂
