Hi all,
I’m trying to create AWS Batch, mostly based on examples from the terraform registry.
This is what my main.tf looks like:
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 4.16"
}
}
}
provider "aws" {
region = "${var.region}"
}
# Zmienne:
variable "region" {
type = string
}
variable "batch_name" {
type = string
}
variable "ecr_image" {
type = string
default = "busybox" # TODO: Usunąć po testach.
}
# Konfiguracja AWS Batch:
# Konfiguracja Compute Environment
resource "aws_iam_role" "ecs_instance_role" {
name = "${var.batch_name}_ecs_instance_role"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Effect": "Allow",
"Principal": {
"Service": "ec2.amazonaws.com"
}
}
]
}
EOF
}
resource "aws_iam_role_policy_attachment" "ecs_instance_role" {
role = aws_iam_role.ecs_instance_role.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"
}
resource "aws_iam_instance_profile" "ecs_instance_role" {
name = "${var.batch_name}_ecs_instance_role"
role = aws_iam_role.ecs_instance_role.name
}
resource "aws_iam_role" "aws_batch_service_role" {
name = "${var.batch_name}_aws_batch_service_role"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Effect": "Allow",
"Principal": {
"Service": "batch.amazonaws.com"
}
}
]
}
EOF
}
resource "aws_iam_role_policy_attachment" "aws_batch_service_role" {
role = aws_iam_role.aws_batch_service_role.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole"
}
resource "aws_security_group" "batch_security_group" {
name = "${var.batch_name}_security_group"
vpc_id = aws_vpc.batch_vpc.id
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
}
}
resource "aws_vpc" "batch_vpc" {
cidr_block = "10.1.0.0/16"
}
resource "aws_subnet" "batch_subnet" {
vpc_id = aws_vpc.batch_vpc.id
cidr_block = "10.1.1.0/24"
map_public_ip_on_launch = true
}
resource "aws_batch_compute_environment" "batch_compute_environment" {
compute_environment_name = "${var.batch_name}_compute_environment"
compute_resources {
instance_role = aws_iam_instance_profile.ecs_instance_role.arn
instance_type = [
"optimal",
]
max_vcpus = 2
min_vcpus = 0
security_group_ids = [
aws_security_group.batch_security_group.id,
]
subnets = [
aws_subnet.batch_subnet.id,
]
type = "EC2"
}
service_role = aws_iam_role.aws_batch_service_role.arn
type = "MANAGED"
depends_on = [aws_iam_role_policy_attachment.aws_batch_service_role]
}
# Konfiguracja Batch Job Definition:
resource "aws_batch_job_definition" "batch_job_definition" {
name = "${var.batch_name}_batch_job"
type = "container"
container_properties = <<CONTAINER_PROPERTIES
{
"command": ["ls", "-la"],
"image": "${var.ecr_image}",
"memory": 7168,
"vcpus": 2,
"volumes": [
{
"host": {
"sourcePath": "/tmp"
},
"name": "tmp"
}
],
"environment": [
{"name": "VARNAME", "value": "VARVAL"}
],
"mountPoints": [
{
"sourceVolume": "tmp",
"containerPath": "/tmp",
"readOnly": false
}
],
"ulimits": [
{
"hardLimit": 1024,
"name": "nofile",
"softLimit": 1024
}
]
}
CONTAINER_PROPERTIES
}
# Batch Job Queue:
resource "aws_batch_job_queue" "batch_job_queue" {
name = "${var.batch_name}_job-queue"
state = "ENABLED"
priority = 1
compute_environments = [
"${aws_batch_compute_environment.batch_compute_environment.arn}",
]
}
resource "aws_iam_role" "cloudwatch_event_iam_role" {
name = "${var.batch_name}_cloudwatch_event_iam_role"
assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": "events.amazonaws.com"
},
"Action": "sts:AssumeRole"
}
]
}
EOF
}
resource "aws_iam_policy" "cloudwatch_event_iam_policy" {
name = "${var.batch_name}_cloudwatch_event_iam_policy"
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"batch:SubmitJob"
],
"Resource": [
"${aws_batch_job_definition.batch_job_definition.arn}",
"${aws_batch_job_queue.batch_job_queue.arn}"
]
}
]
}
EOF
}
resource "aws_iam_role_policy_attachment" "cloudwatch_event_iam_policy_attachment" {
policy_arn = aws_iam_policy.cloudwatch_event_iam_policy.arn
role = aws_iam_role.cloudwatch_event_iam_role.name
}
resource "aws_cloudwatch_event_rule" "cloudwatch_event_rule" {
name = "${var.batch_name}_eventbridge_event_rule"
schedule_expression = "cron(20 23 * * ? *)"
}
resource "aws_cloudwatch_event_target" "cloudwatch_event_target" {
rule = aws_cloudwatch_event_rule.cloudwatch_event_rule.name
arn = aws_batch_job_queue.batch_job_queue.arn
role_arn = aws_iam_role.cloudwatch_event_iam_role.arn
batch_target {
job_definition = aws_batch_job_definition.batch_job_definition.arn
job_name = aws_batch_job_definition.batch_job_definition.name
}
}
Apply work but after that all submitted jobs stucks in RUNNABLE state. Compute environment gets invalid status with error:
CLIENT_ERROR - Your compute environment has been INVALIDATED and scaled down because none of the instances joined the underlying ECS Cluster. Common issues preventing instances joining are the following: VPC/Subnet configuration preventing communication to ECS, incorrect Instance Profile policy preventing authorization to ECS, or customized AMI or LaunchTemplate configurations affecting ECS agent.
I tried to solve the problem using the list here, and:
-
permissions looks good
-
it seems that there isn’t a “A route table with a route destination of 0.0.0.0/0 in subnet”
I don’t know what is the problem and where did I go wrong. I will be grateful for hints.