Pre-run hook "network" failed: explanations?

Hi,

Nomad 1.2.3
Consul 1.10.3
CNI 1.0.1

Every time I reboot my Nomad/Consul/Vault cluster, I get this error on jobs, especially those that use Consul Connect.

failed to setup alloc: pre-run hook "network" failed: failed to configure networking for alloc: failed to configure network: failed to set bridge addr: could not set bridge's mac: invalid argument

or

failed to setup alloc: pre-run hook "network" failed: failed to configure networking for alloc: failed to configure network: failed to allocate for range 0: 172.26.64.101 has been allocated to 987e3755-f6e9-2c97-9310-106d433e9182, duplicate allocation is not allowed

Sometimes I need to restart the Nomad service, and sometimes I just wait. But after waiting 30 minutes or 1 hour :sleeping: and many automatic restarts of the job, it suddenly works!

I didn't find anything relevant in the logs.

Could I have an explanation of what is happening, so I can maybe find a solution? Thanks! :slightly_smiling_face:

Hi @fred-gb. Thanks for using Nomad.

Could you share your jobspec file and any network configuration from your server/client hcl files? Both of these errors appear to be coming from the call to containerd/go-cni/cni.Setup. I might be able to spot something if I can see your config.
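In the meantime, the second error ("duplicate allocation is not allowed") usually points at a leftover reservation from the CNI host-local IPAM plugin surviving the reboot. This is only a hunch until I see your config, but assuming you are on Nomad's default bridge network (named "nomad") and the standard CNI plugins, you could check the affected client for stale reservations, something like:

# Assumes the default Nomad bridge network name "nomad" and the
# host-local IPAM plugin, which keeps one file per reserved address.
ls -l /var/lib/cni/networks/nomad/

# Each file is named after a reserved IP and contains the ID that
# holds it, e.g. for the address from your error message:
cat /var/lib/cni/networks/nomad/172.26.64.101

# If that ID no longer matches a live allocation, the reservation is
# stale; removing the file (with Nomad stopped) frees the address:
#   systemctl stop nomad
#   rm /var/lib/cni/networks/nomad/172.26.64.101
#   systemctl start nomad

That would only account for the second error, though, so seeing the full config is still the most useful next step.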

Cheers,

Derek

Hi,

Thanks. Today I had the same problem; after 1 hour it worked.

My mariadb job:

job "mariadb" {
  region = "global"
  datacenters = ["dc1"]
  type = "service"

  vault {
    policies = ["credentials"]
  }

   update {
     max_parallel     = 1
     canary           = 1
     min_healthy_time = "10s"
     healthy_deadline = "5m"
     auto_revert      = true
     auto_promote     = true
     health_check     = "checks"
     stagger          = "30s"
   }
  constraint {
    attribute = "${node.unique.name}"
    value     = "backend"
  }

  group "mariadb" {
    network {
      mode = "bridge"
    }

    service {
      name = "mariadb"
      port = "3306"
      connect {
        sidecar_service {}
      }
    }

    task "mariadb" {
        driver = "docker"
        config {
          image = "ghcr.io/linuxserver/mariadb"


          volumes = [
            "/data/mariadb/mysql:/config"
          ]
        }

        template {
          data = <<EOH
MYSQL_ROOT_PASSWORD = {{with secret "database/data/mariadb/mariadb"}}{{.Data.data.mysql_root_password}}{{end}}
MYSQL_USER = {{with secret "database/data/mariadb/mariadb"}}{{.Data.data.mysql_user}}{{end}}
MYSQL_PASSWORD = {{with secret "database/data/mariadb/mariadb"}}{{.Data.data.mysql_password}}{{end}}
MYSQL_DATABASE = {{with secret "database/data/mariadb/mariadb"}}{{.Data.data.mysql_database}}{{end}}

EOH

          destination = "secrets/file.env"
          env         = true
        }

        resources {
            memory = 1000
            cpu    = 1000
        }

       }
  }
}

cat /etc/nomad.d/server.hcl

server {
    enabled = true

    bootstrap_expect = 3


    rejoin_after_leave = false

    enabled_schedulers = ["service","batch","system"]
    num_schedulers = 1

    node_gc_threshold = "24h"
    eval_gc_threshold = "1h"
    job_gc_threshold = "4h"
    deployment_gc_threshold = "1h"

    encrypt = ""

    raft_protocol = 3
}

base.hcl:

name = "backend001"
region = "global"
datacenter = "dc1"

enable_debug = false
disable_update_check = false


bind_addr = "0.0.0.0"
advertise {
    http = "10.0.0.249:4646"
    rpc = "10.0.0.249:4647"
    serf = "10.0.0.249:4648"
}
ports {
    http = 4646
    rpc = 4647
    serf = 4648
}

consul {
    # The address to the Consul agent.
    address = "localhost:8500"
    token = ""
    # The service name to register the server and client with Consul.
    server_service_name = "nomad-servers"
    client_service_name = "nomad-clients"
    tags = {}

    # Enables automatically registering the services.
    auto_advertise = true

    # Enabling the server and client to bootstrap using Consul.
    server_auto_join = true
    client_auto_join = true
}

data_dir = "/var/nomad"

log_level = "INFO"
enable_syslog = true

leave_on_terminate = true
leave_on_interrupt = false


acl {
    enabled = false
    token_ttl = "30s"
    policy_ttl = "30s"
    replication_token = ""
}

vault {
    enabled = true
    address = "http://127.0.0.1:8200"
    allow_unauthenticated = true
    create_from_role = "nomad-cluster"
    task_token_ttl = ""
    ca_file = ""
    ca_path = ""
    cert_file = ""
    key_file = ""
    tls_server_name = ""
    tls_skip_verify = false
    token = "xxxx"
    namespace = ""
}


telemetry {
    disable_hostname = "false"
    collection_interval = "1s"
    use_node_name = "false"
    publish_allocation_metrics = "false"
    publish_node_metrics = "false"
    filter_default = "true"
    prefix_filter = []
    disable_dispatched_job_summary_metrics = "false"
    statsite_address = ""
    statsd_address = ""
    datadog_address = ""
    datadog_tags = []
    prometheus_metrics = "true"
    circonus_api_token = ""
    circonus_api_app = "nomad"
    circonus_api_url = "https://api.circonus.com/v2"
    circonus_submission_interval = "10s"
    circonus_submission_url = ""
    circonus_check_id = ""
    circonus_check_force_metric_activation = "false"
    circonus_check_instance_id = ""
    circonus_check_search_tag = ""
    circonus_check_display_name = ""
    circonus_check_tags = ""
    circonus_broker_id = ""
    circonus_broker_select_tag = ""
}

autopilot {
    cleanup_dead_servers      = true
    last_contact_threshold    = "200ms"
    max_trailing_logs         = 250
    server_stabilization_time = "10s"
}

client.hcl:

client {
    enabled = true

    node_class = ""
    no_host_uuid = false


    max_kill_timeout = "30s"

    network_speed = 0
    cpu_total_compute = 0

    gc_interval = "1m"
    gc_disk_usage_threshold = 80
    gc_inode_usage_threshold = 70
    gc_parallel_destroys = 2

    reserved {
        cpu = 0
        memory = 0
        disk = 0
    }



    options = {
        "driver.raw_exec.enable" = "true"
        "docker.privileged.enabled" = "true"
        "docker.volumes.enabled" = "true"
        "docker.cleanup.image.delay" = "1h"
    }

}

I've shown you everything! :grinning:

Thanks!

This value in your server.hcl file stands out to me because it matches the time interval you are witnessing, and I'm wondering if it is having an unexpected effect. The corresponding line in the Nomad source checks the oldThreshold variable, which is computed a few lines earlier.

Could you try setting that value a bit lower and see whether it has an impact? I'm not suggesting this is the fix, but since you already have this environment up and running, I'm hoping it will be faster and easier for you to test the theory.
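For illustration only, and assuming the value in question is one of the 1h garbage-collection thresholds in your server block (eval_gc_threshold is shown here purely as a placeholder, it may be a different one), the change would look something like:

server {
    # Example only: lower the GC threshold well below the ~1h delay
    # being observed. Which threshold to change depends on the value
    # referenced above; eval_gc_threshold is used as a placeholder.
    eval_gc_threshold = "5m"
}

You would need to restart the Nomad server agents for the new value to take effect.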

Hi,
Sorry, but since my first post I haven't had the same issue again. I don't know…