Nomad host logs and metrics using vector, Loki (Grafana cloud)

Hello,

I finally managed to have all my logs and nomad metrics through vector running in a docker container (via nomad).
I copy/paste below my Nomad client config (here it is also set up as a server) so you can see what needs to be mounted. You’ll also find my Vector job config file. I hope it will help someone.

Some notes :

  • For journald logs to be accessible from the vector container, you need to mount /var/log/journal and give your container the same machine-id. This means also mounting the /etc/machine-id file (see below)
  • To access the Nomad host from the vector container, you may need to add this extra argument (--add-host host.docker.internal:host-gateway) to the Docker config
  • Some Nomad metrics are not available (depending on your system). In my case, nomad_client_allocs_memory_rss is not reported.

PS :

  • I use terraform so some variables are passed/replaced by terraform
  • My host machine is running Debian 11.6

My nomad.hcl config


# Full configuration options can be found at https://www.nomadproject.io/docs/configuration
#
# NOTE: ${...} placeholders are Terraform template variables substituted before
# this file is written to disk; {{ ... }} expressions are go-sockaddr templates
# evaluated by Nomad itself at agent startup.

#datacenter = "dc1"
data_dir  = "/opt/nomad/data"

# Bind on tailscale interface
bind_addr = "{{ GetInterfaceIP \"tailscale0\" }}"

# See https://developer.hashicorp.com/nomad/tutorials/access-control/access-control-bootstrap
#acl {
#  enabled    = true
#}

# Expose agent metrics so Vector can scrape them via the HTTP API
# (GET /v1/metrics?format=prometheus).
telemetry {
  collection_interval = "15s"
  # Drop the hostname prefix from metric names (the scraper adds an instance tag)
  disable_hostname = true
  # Required for the Prometheus-format endpoint used by the Vector job below
  prometheus_metrics = true
  # Emit per-allocation metrics (nomad_client_allocs_*)
  publish_allocation_metrics = true
  # Emit node-level metrics (nomad_client_host_*)
  publish_node_metrics = true
}

server {
  # Terraform booleans rendered as the strings "true"/"false"
  enabled          = ${server?"true":"false"}  
  default_scheduler_config {
    # Allow tasks to exceed their memory reservation up to memory_max
    memory_oversubscription_enabled = true
  }
  bootstrap_expect=${bootstrap_expect}
}

client {
  enabled = ${client?"true":"false"}
  # Restrict job networking to the tailscale interface
  host_network "tailscale" {
    interface = "tailscale0"
    reserved_ports = "${reserved_ports}"
  }

  # Used for docker logs
  # (the Vector docker_logs source talks to the Docker daemon over this socket)
  host_volume "docker-sock-ro" {
    path = "/var/run/docker.sock"
    read_only = true
  }

  # Used for host systemd logs
  host_volume "journald-ro" {
    path = "/var/log/journal"
    read_only = true
  }
  # journald stores logs under /var/log/journal/<machine-id>/, so the container
  # must see the same /etc/machine-id as the host to find them
  host_volume "machineid-ro" {
    path = "/etc/machine-id"
    read_only = true
  }
}

plugin "docker" {
  config {
    # extra Docker labels to be set by Nomad on each Docker container with the appropriate value
    # (these become Loki labels in the Vector sink configuration below)
    extra_labels = ["job_name", "task_group_name", "task_name", "namespace", "node_name"]
  }
}

/*consul {
  address = "{{ GetInterfaceIP \"tailscale0\" }}:8500"
}*/

My vector job file (passed to the nomad_job terraform resource)

# Vector log/metric shipper, deployed on every Nomad client.
# ${...} placeholders are Terraform template variables; [[ ... ]] directives in
# the template below are Nomad runtime templating (delimiters overridden from
# the default {{ }} to avoid clashing with Vector's own templating syntax).
job "vector" {
  datacenters = ["dc1"]
  # system job, runs on all nodes
  type = "system"
  update {
    min_healthy_time = "10s"
    healthy_deadline = "5m"
    progress_deadline = "10m"
    auto_revert = true
  }
  group "vector" {
    count = 1
    restart {
      attempts = 3
      interval = "10m"
      delay = "30s"
      mode = "fail"
    }
    # docker socket volume
    # (host volumes declared in the Nomad client config: docker-sock-ro,
    # journald-ro, machineid-ro)
    volume "docker-sock" {
      type = "host"
      source = "docker-sock-ro"
      read_only = true
    }
    volume "journald" {
      type = "host"
      source = "journald-ro"
      read_only = true
    }
    volume "machineid" {
      type = "host"
      source = "machineid-ro"
      read_only = true
    }
    # sticky scratch space so Vector's data_dir (buffers/checkpoints) survives
    # in-place allocation updates
    ephemeral_disk {
      size    = 500 # 500 MB
      sticky  = true
    }
    task "vector" {
      driver = "docker"
      config {
        image = "timberio/vector:0.26.0-debian"
      }
      # docker socket volume mount
      volume_mount {
        volume = "docker-sock"
        destination = "/var/run/docker.sock"
        read_only = true
      }
      volume_mount {
        volume = "journald"
        destination = "/var/log/journal"
        read_only = true
      }
      # same machine-id as the host so Vector can locate the journal directory
      volume_mount {
        volume = "machineid"
        destination = "/etc/machine-id"
        read_only = true
      }
      # Vector won't start unless the sinks(backends) configured are healthy
      env {
        VECTOR_CONFIG = "local/vector.toml"
        VECTOR_REQUIRE_HEALTHY = "true"
      }
      # resource limits are a good idea because you don't want your log collection to consume all resources available
      resources {
        cpu    = 500 # MHz
        memory = 256 # MB
      }
      # template with Vector's configuration
      # change_mode=signal: on re-render, send SIGHUP so Vector hot-reloads
      # instead of the task being restarted
      template {
        destination = "local/vector.toml"
        change_mode   = "signal"
        change_signal = "SIGHUP"
        # overriding the delimiters to [[ ]] to avoid conflicts with Vector's native templating, which also uses {{ }}
        left_delimiter = "[["
        right_delimiter = "]]"
        data=<<EOH
[[- with nomadVar "nomad/jobs/vector/vector/vector" -]]
          # Secrets (Grafana Cloud credentials) come from the Nomad variable above
          data_dir = "alloc/data/"
          [api]
            enabled = false
          # Host systemd logs read from the mounted /var/log/journal
          [sources.host_journald_logs]
            type = "journald"
            current_boot_only = true
            since_now = true
            include_units = []
            # Warning and above
            include_matches.PRIORITY = [ "0", "1", "2", "3", "4" ]
          # Container logs via the mounted Docker socket
          [sources.logs]
            type = "docker_logs"
          [transforms.apps_logs]
            type = "remap"
            inputs = ["logs"]
            # NOTE(review): parse_json! aborts the event on non-JSON messages —
            # assumes all app containers log JSON; confirm or use parse_json()
            source = ".message = parse_json!(.message)"
          # Scrape the Nomad agent's own Prometheus endpoint
          [sources.nomad_host_metrics]
            type = "prometheus_scrape"
            # NOTE(review): Nomad's HTTP API defaults to port 4646 — confirm
            # ${nomad_host_tailnet_ip} already includes the port
            endpoints = [ "http://${nomad_host_tailnet_ip}/v1/metrics?format=prometheus" ]
            scrape_interval_secs = 15
            instance_tag = "instance"
            endpoint_tag = "endpoint"
[[ if eq "${environment}" "dev" ]]
          # Dev only: mirror everything to stdout for debugging
          [sinks.out]
            type = "console"
            inputs = [ "apps_logs", "host_journald_logs", "nomad_host_metrics" ]
            encoding.codec = "json"
[[ end ]]
          [sinks.prometheus]
            type = "prometheus_remote_write"
            inputs = [ "nomad_host_metrics" ]
            endpoint = "https://prometheus-prod-01-eu-west-0.grafana.net/api/prom/push"
            healthcheck.enabled = false
            auth.strategy = "basic"
            auth.user = "[[.prometheus_user]]"
            auth.password = "[[.prometheus_password]]"
          [sinks.loki]
            type = "loki"
            inputs = ["apps_logs", "host_journald_logs"]
            endpoint = "https://[[.loki_user]]:[[.loki_password]]@logs-prod-eu-west-0.grafana.net"
            compression = "snappy"
            encoding.codec = "json"
            healthcheck.enabled = true

            # remove fields that have been converted to labels to avoid having the field twice
            remove_label_fields = true
              # Labels sourced from the Docker labels that the Nomad docker
              # plugin's extra_labels setting attaches to each container
              [sinks.loki.labels]
              # See https://vector.dev/docs/reference/vrl/expressions/#path-example-quoted-path
              job = "{{label.\"com.hashicorp.nomad.job_name\" }}"
              task = "{{label.\"com.hashicorp.nomad.task_name\" }}"
              group = "{{label.\"com.hashicorp.nomad.task_group_name\" }}"
              #namespace = "{{label.\"com.hashicorp.nomad.namespace\" }}"
              node = "{{label.\"com.hashicorp.nomad.node_name\" }}"
              correlation_id = "{{ message.requestId }}"
[[- end -]]
        EOH
      }
      # Give Vector time to flush buffered events on shutdown
      kill_timeout = "30s"
    }
  }
}

Regarding the Nomad dashboards, I started with the Nomad integration provided by Grafana and made some adjustments.

I did not share my HAProxy and application logs/metrics here, but you get the idea.

Hope it helps,
Best regards,

Brahim

1 Like