AWS Batch (EC2) jobs stay in RUNNABLE state

Hi all,

I’m trying to create AWS Batch, mostly based on examples from the terraform registry.
This is what my main.tf looks like:

terraform {
    required_providers {
      aws = {
        source = "hashicorp/aws"
        version = "~> 4.16"
      }
    }    
}

provider "aws" {
    region = "${var.region}"  
}

# Zmienne:
variable "region" {
  type = string
  
}

variable "batch_name" {
  type = string

}

variable "ecr_image" {
  type = string
  default = "busybox" # TODO: Usunąć po testach. 

}


# Konfiguracja AWS Batch:

# Konfiguracja Compute Environment
resource "aws_iam_role" "ecs_instance_role" {
  name = "${var.batch_name}_ecs_instance_role"

  assume_role_policy = <<EOF
{
    "Version": "2012-10-17",
    "Statement": [
    {
        "Action": "sts:AssumeRole",
        "Effect": "Allow",
        "Principal": {
            "Service": "ec2.amazonaws.com"
        }
    }
    ]
}
EOF
}

resource "aws_iam_role_policy_attachment" "ecs_instance_role" {
  role       = aws_iam_role.ecs_instance_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"
}

resource "aws_iam_instance_profile" "ecs_instance_role" {
  name = "${var.batch_name}_ecs_instance_role"
  role = aws_iam_role.ecs_instance_role.name
}

resource "aws_iam_role" "aws_batch_service_role" {
  name = "${var.batch_name}_aws_batch_service_role"

  assume_role_policy = <<EOF
{
    "Version": "2012-10-17",
    "Statement": [
    {
        "Action": "sts:AssumeRole",
        "Effect": "Allow",
        "Principal": {
        "Service": "batch.amazonaws.com"
        }
    }
    ]
}
EOF
}

resource "aws_iam_role_policy_attachment" "aws_batch_service_role" {
  role       = aws_iam_role.aws_batch_service_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole"
}

resource "aws_security_group" "batch_security_group" {
  name = "${var.batch_name}_security_group"
  vpc_id = aws_vpc.batch_vpc.id
  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }
}

resource "aws_vpc" "batch_vpc" {

  cidr_block = "10.1.0.0/16"
}

resource "aws_subnet" "batch_subnet" {
  vpc_id     = aws_vpc.batch_vpc.id
  cidr_block = "10.1.1.0/24"
  map_public_ip_on_launch = true
}

resource "aws_batch_compute_environment" "batch_compute_environment" {
  compute_environment_name = "${var.batch_name}_compute_environment"

  compute_resources {
    instance_role = aws_iam_instance_profile.ecs_instance_role.arn

    instance_type = [
      "optimal",
    ]

    max_vcpus = 2
    min_vcpus = 0

    security_group_ids = [
      aws_security_group.batch_security_group.id,
    ]

    subnets = [
      aws_subnet.batch_subnet.id,
    ]

    type = "EC2"
  }

  service_role = aws_iam_role.aws_batch_service_role.arn
  type         = "MANAGED"
  depends_on   = [aws_iam_role_policy_attachment.aws_batch_service_role]
}

# Konfiguracja Batch Job Definition:
resource "aws_batch_job_definition" "batch_job_definition" {
  name = "${var.batch_name}_batch_job"
  type = "container"

  container_properties = <<CONTAINER_PROPERTIES
{
    "command": ["ls", "-la"],
    "image": "${var.ecr_image}",
    "memory": 7168,
    "vcpus": 2,
    "volumes": [
      {
        "host": {
          "sourcePath": "/tmp"
        },
        "name": "tmp"
      }
    ],
    "environment": [
        {"name": "VARNAME", "value": "VARVAL"}
    ],
    "mountPoints": [
        {
          "sourceVolume": "tmp",
          "containerPath": "/tmp",
          "readOnly": false
        }
    ],
    "ulimits": [
      {
        "hardLimit": 1024,
        "name": "nofile",
        "softLimit": 1024
      }
    ]
}
CONTAINER_PROPERTIES
}

# Batch Job Queue:
resource "aws_batch_job_queue" "batch_job_queue" {
  name     = "${var.batch_name}_job-queue"
  state    = "ENABLED"
  priority = 1
  compute_environments = [
    "${aws_batch_compute_environment.batch_compute_environment.arn}",
  ]
}

resource "aws_iam_role" "cloudwatch_event_iam_role" {
  name = "${var.batch_name}_cloudwatch_event_iam_role"
  assume_role_policy = <<EOF
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "events.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}
EOF
}

resource "aws_iam_policy" "cloudwatch_event_iam_policy" {
  name = "${var.batch_name}_cloudwatch_event_iam_policy"
  policy = <<EOF
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "batch:SubmitJob"
            ],
            "Resource": [
                "${aws_batch_job_definition.batch_job_definition.arn}",
                "${aws_batch_job_queue.batch_job_queue.arn}"
            ]
        }
    ]
}
EOF
}

resource "aws_iam_role_policy_attachment" "cloudwatch_event_iam_policy_attachment" {
  policy_arn = aws_iam_policy.cloudwatch_event_iam_policy.arn
  role       = aws_iam_role.cloudwatch_event_iam_role.name
}

resource "aws_cloudwatch_event_rule" "cloudwatch_event_rule" {
  name = "${var.batch_name}_eventbridge_event_rule"
  schedule_expression = "cron(20 23 * * ? *)"
}

resource "aws_cloudwatch_event_target" "cloudwatch_event_target" {
  rule = aws_cloudwatch_event_rule.cloudwatch_event_rule.name
  arn =  aws_batch_job_queue.batch_job_queue.arn
  role_arn = aws_iam_role.cloudwatch_event_iam_role.arn
  
  batch_target {
    job_definition = aws_batch_job_definition.batch_job_definition.arn
    job_name = aws_batch_job_definition.batch_job_definition.name
  }
}

Apply work but after that all submitted jobs stucks in RUNNABLE state. Compute environment gets invalid status with error:

CLIENT_ERROR - Your compute environment has been INVALIDATED and scaled down because none of the instances joined the underlying ECS Cluster. Common issues preventing instances joining are the following: VPC/Subnet configuration preventing communication to ECS, incorrect Instance Profile policy preventing authorization to ECS, or customized AMI or LaunchTemplate configurations affecting ECS agent.

I tried to solve the problem using the list here, and:

  • permissions looks good

  • it seems that there isn’t a “A route table with a route destination of 0.0.0.0/0 in subnet”

I don’t know what is the problem and where did I go wrong. I will be grateful for hints.

I solved the problem. I didn’t need to create VPC, SG and subnets. Instead, it’s better to use the defaults for AZ. If anyone will be in a similar situation as me below is a working AWS Batch setup with EventBridge(Cron):

terraform {
    required_providers {
      aws = {
        source = "hashicorp/aws"
        version = "~> 4.16"
      }
    }
}

provider "aws" {
    region = "${var.region}"
}

# Zmienne:
variable "region" {
  type = string

}

variable "batch_name" {
  type = string

}

variable "ecr_image" {
  type = string
  default = "busybox" # TODO: Usunąć po testach.

}

# Data
data "aws_vpc" "default" {
  default = true
}

data "aws_security_group" "selected" {
  vpc_id = data.aws_vpc.default.id

  filter {
    name   = "group-name"
    values = ["default"]
  }
}

data "aws_subnets" "default" {
  filter {
    name = "defaultForAz"
    values = ["true"]
  }
}

data "aws_subnet" "default" {
  for_each = toset(data.aws_subnets.default.ids)
  id = each.value
}

# Konfiguracja AWS Batch:

# Konfiguracja Compute Environment
resource "aws_iam_role" "ecs_instance_role" {
  name = "${var.batch_name}_ecs_instance_role"

  assume_role_policy = <<EOF
{
    "Version": "2012-10-17",
    "Statement": [
    {
        "Action": "sts:AssumeRole",
        "Effect": "Allow",
        "Principal": {
            "Service": "ec2.amazonaws.com"
        }
    }
    ]
}
EOF
}

resource "aws_iam_role_policy_attachment" "ecs_instance_role" {
  role       = aws_iam_role.ecs_instance_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"
}

resource "aws_iam_instance_profile" "ecs_instance_role" {
  name = "${var.batch_name}_ecs_instance_role"
  role = aws_iam_role.ecs_instance_role.name
}

resource "aws_iam_role" "aws_batch_service_role" {
  name = "${var.batch_name}_aws_batch_service_role"

  assume_role_policy = <<EOF
{
    "Version": "2012-10-17",
    "Statement": [
    {
        "Action": "sts:AssumeRole",
        "Effect": "Allow",
        "Principal": {
        "Service": "batch.amazonaws.com"
        }
    }
    ]
}
EOF
}

resource "aws_iam_role_policy_attachment" "aws_batch_service_role" {
  role       = aws_iam_role.aws_batch_service_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole"
}

resource "aws_batch_compute_environment" "batch_compute_environment" {
  compute_environment_name = "${var.batch_name}_compute_environment"

  compute_resources {
    instance_role = aws_iam_instance_profile.ecs_instance_role.arn

    instance_type = [
      "optimal",
    ]

    max_vcpus = 2
    min_vcpus = 0

    security_group_ids = [
      data.aws_security_group.selected.id,
    ]

    subnets = [
      for s in data.aws_subnet.default : s.id
    ]

    type = "EC2"
  }

  service_role = aws_iam_role.aws_batch_service_role.arn
  type         = "MANAGED"
  depends_on   = [aws_iam_role_policy_attachment.aws_batch_service_role]
}

# Konfiguracja Batch Job Definition:
resource "aws_batch_job_definition" "batch_job_definition" {
  name = "${var.batch_name}_batch_job"
  type = "container"

  container_properties = <<CONTAINER_PROPERTIES
{
    "command": ["ls", "-la"],
    "image": "${var.ecr_image}",
    "memory": 7168,
    "vcpus": 2,
    "volumes": [
      {
        "host": {
          "sourcePath": "/tmp"
        },
        "name": "tmp"
      }
    ],
    "environment": [
        {"name": "VARNAME", "value": "VARVAL"}
    ],
    "mountPoints": [
        {
          "sourceVolume": "tmp",
          "containerPath": "/tmp",
          "readOnly": false
        }
    ],
    "ulimits": [
      {
        "hardLimit": 1024,
        "name": "nofile",
        "softLimit": 1024
      }
    ]
}
CONTAINER_PROPERTIES
}

# Batch Job Queue:
resource "aws_batch_job_queue" "batch_job_queue" {
  name     = "${var.batch_name}_job-queue"
  state    = "ENABLED"
  priority = 1
  compute_environments = [
    "${aws_batch_compute_environment.batch_compute_environment.arn}",
  ]
}

resource "aws_iam_role" "cloudwatch_event_iam_role" {
  name = "${var.batch_name}_cloudwatch_event_iam_role"
  assume_role_policy = <<EOF
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "events.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}
EOF
}

resource "aws_iam_policy" "cloudwatch_event_iam_policy" {
  name = "${var.batch_name}_cloudwatch_event_iam_policy"
  policy = <<EOF
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "batch:SubmitJob"
            ],
            "Resource": [
                "${aws_batch_job_definition.batch_job_definition.arn}",
                "${aws_batch_job_queue.batch_job_queue.arn}"
            ]
        }
    ]
}
EOF
}

resource "aws_iam_role_policy_attachment" "cloudwatch_event_iam_policy_attachment" {
  policy_arn = aws_iam_policy.cloudwatch_event_iam_policy.arn
  role       = aws_iam_role.cloudwatch_event_iam_role.name
}

resource "aws_cloudwatch_event_rule" "cloudwatch_event_rule" {
  name = "${var.batch_name}_eventbridge_event_rule"
  schedule_expression = "cron(30 15 * * ? *)"
}

resource "aws_cloudwatch_event_target" "cloudwatch_event_target" {
  rule = aws_cloudwatch_event_rule.cloudwatch_event_rule.name
  arn =  aws_batch_job_queue.batch_job_queue.arn
  role_arn = aws_iam_role.cloudwatch_event_iam_role.arn

  batch_target {
    job_definition = aws_batch_job_definition.batch_job_definition.arn
    job_name = aws_batch_job_definition.batch_job_definition.name
  }
}