ES cluster health check failed

Hi,

  • Nomad: 1.2.6

The health check fails, but there are no errors in the container logs. Does anyone know what is going on, or does anyone have a better Nomad job file for deploying an ES cluster?
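One change I'm considering (just a sketch, and I'm not sure it's the right fix) is loosening the update/check timing: with min_healthy_time = "180s" and healthy_deadline = "4m", the checks have to start passing within roughly the first minute of the allocation's life, which may be too tight for Elasticsearch to bootstrap. Something along these lines, with a hypothetical grace period and a cluster-level health endpoint:

  update {
    max_parallel     = 1
    health_check     = "checks"
    min_healthy_time = "30s"   # shorter "must stay healthy" window
    healthy_deadline = "10m"   # give Elasticsearch more time to come up
  }

  # ...and on the "rest" service:
  check {
    name     = "rest-http"
    type     = "http"
    port     = "rest"
    path     = "/_cluster/health"  # assumption: a cluster-level endpoint is a better signal than "/"
    interval = "10s"
    timeout  = "5s"

    check_restart {
      limit = 3
      grace = "120s"               # hypothetical grace period before failing checks trigger a restart
    }
  }

My full job file is below: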


job "es-cluster8" {
  type        = "service"
  datacenters = ["dc1"]

  update {
    max_parallel     = 1
    health_check     = "checks"
    min_healthy_time = "180s"
    healthy_deadline = "4m"
  }

  meta {
    ES_CLUSTER_NAME = "${NOMAD_REGION}-${NOMAD_JOB_NAME}"
  }


  # Master nodes have EBS-backed volumes, so they cannot be scaled beyond 3 nodes (1 per AZ).
  # Scaling up to 3 must be done in Terraform!
  group "es-cluster-master" {
    count = 3

    # Master nodes must always be spread across distinct AZs
    # constraint {
    #   distinct_property = "${meta.aws.instance.availability-zone}"
    # }

    # Master nodes must run on high-memory application nodes
    # constraint {
    #   attribute = "${node.class}"
    #   value     = "high-memory-applications"
    # }
    task "es-cluster-master" {
      driver = "docker"

      # The container automatically drops privileges before starting Elasticsearch
      user = "root"

      # Allow Elasticsearch 10 minutes to shut down gracefully
      kill_timeout = "600s"

      # Shut down Elasticsearch with SIGTERM
      kill_signal = "SIGTERM"

      config {
        image      = "elasticsearch:7.8.1"
        command    = "elasticsearch"
        volumes = [
          "./local/unicast_hosts.txt:/usr/share/elasticsearch/config/unicast_hosts.txt"
        ]
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-transport.html
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/modules-http.html
        args = [
          "-Ebootstrap.memory_lock=true",                          # 启动时锁定所有JVM内存
          "-Ecluster.name=${NOMAD_META_ES_CLUSTER_NAME}",          # 群集名称-这必须在主节点和数据节点之间匹配
          # "-Ediscovery.zen.hosts_provider=file",                   # 使用“静态”文件
          # "-Ediscovery.zen.minimum_master_nodes=2",                # >=需要2个主节点才能形成一个健康的集群
          "-Egateway.expected_data_nodes=3",                       # >=3个数据节点组成一个健康的集群
          # "-Egateway.expected_master_nodes=3",                     # >=3个主节点是群集的预期状态
          # "-Egateway.expected_nodes=3",                            # >=群集中预计总共有3个节点
          # "-Egateway.recover_after_nodes=3",                       # >=启动数据恢复需要3个节点
          "-Ehttp.port=${NOMAD_PORT_rest}",                        # 用于在容器内侦听的HTTP端口(最初是端口9200)
          "-Ehttp.publish_port=${NOMAD_HOST_PORT_rest}",           # 主机实例上的HTTP端口(最初为端口9200)
          "-Enetwork.host=0.0.0.0",                                # IP可监听所有流量
          "-Enetwork.publish_host=${NOMAD_IP_rest}",               # 要广播到其他弹性搜索节点的IP(这是主机IP,而不是容器)
          "-Enode.data=true",                                      # 允许节点存储数据
          "-Enode.master=true",                                    # 允许节点被选为主节点
          "-Enode.name=${NOMAD_GROUP_NAME}[${NOMAD_ALLOC_INDEX}]", # 节点名称被删除为分配名称
          "-Epath.logs=/alloc/logs/",                              # 将数据记录到分配目录
          "-Etransport.publish_port=${NOMAD_HOST_PORT_transport}", # 主机实例上的传输端口(最初为端口9300)
          # "-Etransport.tcp.port=${NOMAD_PORT_transport}",          # 集装箱内的运输港(最初为9300港)
          # "-Ettransport.profiles.default.port=${NOMAD_PORT_transport}",
          "-Expack.license.self_generated.type=basic",             # 使用x-packs基本许可证(免费)
          "-Ecluster.initial_master_nodes=${NOMAD_GROUP_NAME}[${NOMAD_ALLOC_INDEX}]"   #初始化
        ]
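
        # For reference: node.name and cluster.initial_master_nodes above both expand to
        # something like "es-cluster-master[0]" (group name plus allocation index), so each
        # allocation only lists its own node name as an initial master.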

        ulimit {
          # make sure Elasticsearch can lock all of the JVM's memory at startup
          memlock = "-1"

          # make sure Elasticsearch can create enough open file handles
          nofile = "65536"

          # make sure Elasticsearch can create enough threads
          nproc = "8192"
        }

        # persistent data configuration
        # volume_driver = "rexray/ebs"

        # these volumes are provisioned by the infra team
        # volumes = [
        #   "${NOMAD_REGION}-${NOMAD_JOB_NAME}/:/usr/share/elasticsearch/data",
        # ]
      }

      # This consul-template writes out the unicast hosts Elasticsearch uses to discover its cluster peers
      template {
        # this path is automatically symlinked to the correct location inside the container
        destination = "local/unicast_hosts.txt"

        # Elasticsearch automatically reloads the file when it changes, so no signal is needed
        change_mode = "noop"
        data = <<EOF
{{- range service (printf "%s-discovery|passing" (env "NOMAD_JOB_NAME")) }}
{{ .Address }}:{{ .Port }}{{ end }}
EOF
      }
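
      # For reference, the rendered unicast_hosts.txt ends up with one "address:port"
      # line per passing es-cluster8-discovery service instance, e.g. (hypothetical addresses):
      #   10.0.1.11:23456
      #   10.0.2.12:24567
      #   10.0.3.13:25678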

      # this Consul service is used to discover the unicast hosts (see the template {} above)
      service {
        name = "${NOMAD_JOB_NAME}-discovery"
        port = "transport"

        check {
          name     = "transport-tcp"
          port     = "transport"
          type     = "tcp"
          interval = "5s"
          timeout  = "4s"
        }
      }

      # this Consul service is used for port 9200 / normal HTTP traffic
      service {
        name = "${NOMAD_JOB_NAME}"
        port = "rest"
        tags = ["dd-elastic"]

        check {
          name     = "rest-tcp"
          port     = "rest"
          type     = "tcp"
          interval = "5s"
          timeout  = "4s"
        }

        check {
          name     = "rest-http"
          type     = "http"
          port     = "rest"
          path     = "/"
          interval = "5s"
          timeout  = "4s"
        }
      }

      resources {
        cpu    = 1024
        memory = 8192
        network {
          mbits = 25
          port "rest" {}
          port "transport" {}
        }
      }
    }
  }

  # similar to es-cluster-master, see the comments there
  group "es-cluster-data" {
    count = 1

    # Data nodes must run on high-memory application nodes
    # constraint {
    #   attribute = "${node.class}"
    #   value     = "high-memory-applications"
    # }

    # Best effort at moving the existing Elasticsearch data around, since there is no EBS backing.
    # The 50 GB disk matches the EBS volume size.
    ephemeral_disk {
      size    = "50000"
      sticky  = true
      migrate = false
    }

    task "es-cluster-data" {
      driver       = "docker"
      user         = "root"
      kill_timeout = "600s"
      kill_signal  = "SIGTERM"
      template {
        destination = "local/unicast_hosts.txt"
        change_mode = "noop"

        data = <<EOF
{{- range service (printf "%s-discovery|passing" (env "NOMAD_JOB_NAME")) }}
{{ .Address }}:{{ .Port }}{{ end }}
EOF
      }
      config {
        volumes = [
          "./local/unicast_hosts.txt:/usr/share/elasticsearch/config/unicast_hosts.txt"
        ]
        image   = "elasticsearch:7.8.1"
        command = "elasticsearch"

        args = [
          "-Ebootstrap.memory_lock=true",
          "-Ecluster.name=${NOMAD_META_ES_CLUSTER_NAME}",
          "-Ediscovery.zen.hosts_provider=file",
          "-Ediscovery.zen.minimum_master_nodes=2",
          "-Egateway.expected_data_nodes=3",
          "-Egateway.expected_master_nodes=3",
          "-Egateway.expected_nodes=3",
          "-Egateway.recover_after_nodes=3",
          "-Ehttp.port=${NOMAD_PORT_rest}",
          "-Ehttp.publish_port=${NOMAD_HOST_PORT_rest}",
          "-Enetwork.host=0.0.0.0",
          "-Enetwork.publish_host=${NOMAD_IP_rest}",
          "-Enode.data=true",
          "-Enode.master=false",
          "-Enode.max_local_storage_nodes=1",
          "-Enode.name=${NOMAD_ALLOC_NAME}",
          "-Epath.data=/alloc/data/",
          "-Epath.logs=/alloc/logs/",
          "-Etransport.publish_port=${NOMAD_HOST_PORT_transport}",
          "-Etransport.tcp.port=${NOMAD_PORT_transport}",
          "-Expack.license.self_generated.type=basic",
        ]

        ulimit {
          memlock = "-1"
          nofile  = "65536"
          nproc   = "8192"
        }
      }

      service {
        name = "${NOMAD_JOB_NAME}"
        port = "rest"
        tags = ["dd-elastic"]

        check {
          name     = "rest-tcp"
          port     = "rest"
          type     = "tcp"
          interval = "5s"
          timeout  = "4s"
        }

        check {
          name     = "rest-http"
          type     = "http"
          port     = "rest"
          path     = "/"
          interval = "5s"
          timeout  = "4s"
        }
      }

      resources {
        cpu    = 1024
        memory = 8192
        network {
          mbits = 25
          port "rest" {}
          port "transport" {}
        }
      }
    }
  }

  group "es-cluster-kibana" {
    count = 1

    # constraint {
    #   attribute = "${node.class}"
    #   value     = "applications"
    # }

    update {
      max_parallel     = 1
      health_check     = "checks"
      min_healthy_time = "10s"
      healthy_deadline = "9m"
    }

    task "es-cluster-kibana" {
      driver       = "docker"
      kill_timeout = "60s"
      kill_signal  = "SIGTERM"

      config {
        image   = "docker.elastic.co/kibana/kibana:6.1.3"
        command = "kibana"

        # https://www.elastic.co/guide/en/kibana/current/settings.html
        # https://www.elastic.co/guide/en/kibana/current/settings-xpack-kb.html
        args = [
          "--elasticsearch.url=http://${NOMAD_JOB_NAME}.service.consul:80",
          "--server.host=0.0.0.0",
          "--server.name=${NOMAD_JOB_NAME}.service.consul",
          "--server.port=${NOMAD_PORT_http}",
          "--path.data=/alloc/data",
          "--elasticsearch.preserveHost=false",
          "--xpack.apm.ui.enabled=false",
          "--xpack.graph.enabled=false",
          "--xpack.ml.enabled=false",
        ]

        ulimit {
          memlock = "-1"
          nofile  = "65536"
          nproc   = "8192"
        }
      }

      service {
        name = "${NOMAD_JOB_NAME}-kibana"
        port = "http"

        check {
          name     = "http-tcp"
          port     = "http"
          type     = "tcp"
          interval = "5s"
          timeout  = "4s"
        }

        check {
          name     = "http-http"
          type     = "http"
          port     = "http"
          path     = "/"
          interval = "5s"
          timeout  = "4s"
        }
      }

      resources {
        cpu    = 1024
        memory = 2048

        network {
          mbits = 5
          port "http" {}
        }
      }
    }
  }
}

I referred to this: [question] use nomad deploy a 3 nodes elasticsearch cluster · Issue #3831 · hashicorp/nomad · GitHub