Gergő Horváth

Reputation: 3705

How to figure out why health checks aren't passing in ECS Fargate with an ALB?

I'm quite new to DevOps and have been struggling to set up a test project for a couple of weeks now.

I have written a Terraform file that is supposed to set up most of the project:

# Get subnets
data "aws_subnets" "subnets" {
  filter {
    name   = "vpc-id"
    values = [var.vpc_id]
  }
}

# Get security groups
data "aws_security_groups" "security_groups" {
  filter {
    name   = "vpc-id"
    values = [var.vpc_id]
  }
}

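# S3 bucket for the load balancer access logs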
resource "aws_s3_bucket" "lb_logs" {
  bucket = "${var.app_name}-load-balancer-${var.env}-logs"
}
resource "aws_s3_bucket_server_side_encryption_configuration" "encryption" {
  bucket = aws_s3_bucket.lb_logs.bucket

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}
resource "aws_s3_bucket_versioning" "versioning" {
  bucket = aws_s3_bucket.lb_logs.bucket

  versioning_configuration {
    status = "Enabled"
  }
}
resource "aws_s3_bucket_acl" "acl" {
  bucket = aws_s3_bucket.lb_logs.bucket
  acl = "private"
}



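# Allow the eu-west-1 ELB service account to write access logs into the bucket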
data "aws_iam_policy_document" "lb_logs_s3_put_object" {
  statement {
    effect = "Allow"
    principals {
      type = "AWS"
      identifiers = ["arn:aws:iam::156460612806:root"]
    }
    actions = ["s3:PutObject"]
    resources = ["${aws_s3_bucket.lb_logs.arn}/*"]
  }
}
resource "aws_s3_bucket_policy" "lb_logs_s3_put_object" {
  bucket = aws_s3_bucket.lb_logs.id
  policy = data.aws_iam_policy_document.lb_logs_s3_put_object.json
}

# Create load balancer
resource "aws_lb" "load_balancer" {
  name = "${var.app_name}-load-balancer-${var.env}"
  subnets = data.aws_subnets.subnets.ids
  security_groups = data.aws_security_groups.security_groups.ids
  load_balancer_type = "application"
  access_logs {
    bucket = aws_s3_bucket.lb_logs.bucket
    enabled = true
  }
  tags = {
    Environment = "${var.env}"
  }
}

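# Blue/green target groups used by the CodeDeploy deployment group below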
resource "aws_lb_target_group" "blue_target" {
  name     = "${var.app_name}-blue-target-${var.env}"
  protocol = "HTTPS"
  port     = var.port
  target_type = "ip"
  vpc_id   = var.vpc_id
  health_check {
    healthy_threshold = 5
    interval = 30
    matcher = 200
    path = "${var.health_check_path}"
    protocol = "HTTPS"
    timeout = 10
    unhealthy_threshold = 2
  }
}
resource "aws_lb_target_group" "green_target" {
  name     = "${var.app_name}-green-target-${var.env}"
  protocol = "HTTPS"
  port     = var.port
  target_type = "ip"
  vpc_id   = var.vpc_id
  health_check {
    healthy_threshold = 5
    interval = 30
    matcher = 200
    path = "${var.health_check_path}"
    protocol = "HTTPS"
    timeout = 10
    unhealthy_threshold = 2
  }
}

data "aws_acm_certificate" "cert" {
    domain      = var.domain
    statuses = ["ISSUED"]
    most_recent = true
}

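# HTTPS listener; TLS is terminated at the load balancer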
resource "aws_lb_listener" "listener" {
  load_balancer_arn = aws_lb.load_balancer.arn
  port              = var.port
  protocol          = "HTTPS"
  ssl_policy        = "ELBSecurityPolicy-2016-08"
  certificate_arn   = data.aws_acm_certificate.cert.arn

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.blue_target.arn
  }
}

# ECS

resource "aws_ecs_cluster" "cluster" {
  name = "${var.app_name}-cluster-${var.env}"
}

data "aws_ecr_repository" "ecr_repository" {
  name = var.image_repo_name
}

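# Task role: permissions the application itself uses at runtime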
resource "aws_iam_role" "ecs_task_role" {
  name = "EcsTaskRole"
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Sid    = ""
        Principal = {
          Service = "ecs-tasks.amazonaws.com"
        }
      },
    ]
  })
}
resource "aws_iam_policy" "secrets_manager_read_policy" {
  name        = "SecretsManagerRead"
  description = "Read only access to secrets manager"

  policy = jsonencode({
    Version = "2012-10-17",
    Statement = [
      {
        Sid = "",
        Effect = "Allow",
        Action = [
          "secretsmanager:GetRandomPassword",
          "secretsmanager:GetResourcePolicy",
          "secretsmanager:GetSecretValue",
          "secretsmanager:DescribeSecret",
          "secretsmanager:ListSecretVersionIds",
          "secretsmanager:ListSecrets"
        ],
        Resource = "*"
      }
    ]
  })
}
resource "aws_iam_role_policy_attachment" "attach_secrets_manager_read_to_task_role" {
  role       = aws_iam_role.ecs_task_role.name
  policy_arn = aws_iam_policy.secrets_manager_read_policy.arn
}
resource "aws_iam_role_policy_attachment" "attach_s3_read_to_task_role" {
  role       = aws_iam_role.ecs_task_role.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
}
resource "aws_iam_role_policy_attachment" "attach_ses_to_task_role" {
  role       = aws_iam_role.ecs_task_role.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonSESFullAccess"
}
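
# Execution role: used by ECS to pull the image and write logs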
resource "aws_iam_role" "ecs_exec_role" {
  name = "EcsExecRole"
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Sid    = ""
        Principal = {
          Service = "ecs-tasks.amazonaws.com"
        }
      },
    ]
  })
}
resource "aws_iam_policy" "log_groups_write_policy" {
  name        = "LogGroupsWrite"
  description = "Read only access to secrets manager"

  policy = jsonencode({
    Version = "2012-10-17",
    Statement = [
      {
        Sid = "",
        Effect = "Allow",
        Action = [
          "logs:CreateLogGroup"
        ],
        Resource = "*"
      }
    ]
  })
}
resource "aws_iam_role_policy_attachment" "attach_secrets_manager_read_to_exec_role" {
  role       = aws_iam_role.ecs_exec_role.name
  policy_arn = aws_iam_policy.log_groups_write_policy.arn
}
resource "aws_iam_role_policy_attachment" "attach_ecs_task_exec_to_exec_role" {
  role       = aws_iam_role.ecs_exec_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
resource "aws_iam_role_policy_attachment" "attach_fault_injection_simulator_to_exec_role" {
  role       = aws_iam_role.ecs_exec_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorECSAccess"
}

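# Fargate task definition for the application container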
resource "aws_ecs_task_definition" "task_def" {
  family = "${var.app_name}-task-def-${var.env}"
  network_mode = "awsvpc"
  task_role_arn = aws_iam_role.ecs_task_role.arn
  execution_role_arn = aws_iam_role.ecs_exec_role.arn
  requires_compatibilities = ["FARGATE"]
  cpu = "256"
  memory = "512"
  runtime_platform {
    cpu_architecture = "X86_64"
    operating_system_family = "LINUX"
  }
  container_definitions = jsonencode([
    {
      name      = "${var.app_name}-container-${var.env}"
      image     = "${data.aws_ecr_repository.ecr_repository.repository_url}:latest"
      cpu       = 0
      essential = true
      portMappings = [
        {
          containerPort = var.port
          hostPort      = var.port
        },
      ]
      environment = [
        {
          name = "PORT",
          value = tostring("${var.port}")
        },
        {
          name = "NODE_ENV",
          value = var.env
        }
      ]
      logConfiguration = {
          logDriver = "awslogs"
          options = {
            "awslogs-create-group" = "true"
            "awslogs-group" = "${var.app_name}-task-def-${var.env}"
            "awslogs-region" = "${var.region}"
            "awslogs-stream-prefix" = "ecs"
          }
        }
    },
  ])
}

resource "aws_ecs_service" "service" {
  lifecycle {
    ignore_changes = [
      task_definition,
      load_balancer,
    ]
  }
  cluster = aws_ecs_cluster.cluster.arn
  name = "${var.app_name}-service-${var.env}"
  task_definition = aws_ecs_task_definition.task_def.arn
  load_balancer {
    target_group_arn = aws_lb_target_group.blue_target.arn
    container_name = "${var.app_name}-container-${var.env}"
    container_port = var.port
  }
  capacity_provider_strategy {
    capacity_provider = "FARGATE"
    base = 0
    weight = 1
  }
  scheduling_strategy = "REPLICA"
  deployment_controller {
    type = "CODE_DEPLOY"
  }
  platform_version = "1.4.0"
  network_configuration {
    assign_public_ip = true
    subnets = data.aws_subnets.subnets.ids
    security_groups = data.aws_security_groups.security_groups.ids
  }
  desired_count = 1
}

# DEPLOYMENT

resource "aws_codedeploy_app" "codedeploy_app" {
  name = "${var.app_name}-application-${var.env}"
  compute_platform = "ECS"
}

resource "aws_iam_role" "codedeploy_role" {
  name = "CodedeployRole"
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Sid    = ""
        Principal = {
          Service = "codedeploy.amazonaws.com"
        }
      },
    ]
  })
}
resource "aws_iam_role_policy_attachment" "attach_codedeploy_role" {
  role       = aws_iam_role.codedeploy_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSCodeDeployRole"
}
resource "aws_iam_role_policy_attachment" "attach_codedeploy_role_for_ecs" {
  role       = aws_iam_role.codedeploy_role.name
  policy_arn = "arn:aws:iam::aws:policy/AWSCodeDeployRoleForECS"
}

resource "aws_codedeploy_deployment_group" "deployment_group" {
  app_name = aws_codedeploy_app.codedeploy_app.name
  deployment_config_name = "CodeDeployDefault.ECSAllAtOnce"
  auto_rollback_configuration {
    enabled = true
    events = ["DEPLOYMENT_FAILURE"]
  }
  blue_green_deployment_config {
    deployment_ready_option {
      action_on_timeout = "CONTINUE_DEPLOYMENT"
      wait_time_in_minutes = 0
    }
    terminate_blue_instances_on_deployment_success {
      action                           = "TERMINATE"
      termination_wait_time_in_minutes = 5
    }
  }
  deployment_group_name = "${var.app_name}-deployment-group-${var.env}"
  deployment_style {
    deployment_option = "WITH_TRAFFIC_CONTROL"
    deployment_type = "BLUE_GREEN"
  }
  load_balancer_info {
    target_group_pair_info {
      prod_traffic_route {
        listener_arns = [aws_lb_listener.listener.arn]
      }

      target_group {
        name = aws_lb_target_group.blue_target.name
      }

      target_group {
        name = aws_lb_target_group.green_target.name
      }
    }
  }
  service_role_arn = aws_iam_role.codedeploy_role.arn
  ecs_service {
    service_name = aws_ecs_service.service.name
    cluster_name = aws_ecs_cluster.cluster.name
  }
}

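# Application Auto Scaling for the ECS service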
resource "aws_appautoscaling_target" "scalable_target" {
  service_namespace  = "ecs"
  resource_id        = "service/${aws_ecs_cluster.cluster.name}/${aws_ecs_service.service.name}"
  scalable_dimension = "ecs:service:DesiredCount"
  min_capacity       = 1
  max_capacity       = 5
}
resource "aws_appautoscaling_policy" "cpu_scaling_policy" {
  name = "${var.app_name}-cpu-scaling-policy-${var.env}"
  service_namespace = "ecs"
  resource_id = "service/${aws_ecs_cluster.cluster.name}/${aws_ecs_service.service.name}"
  scalable_dimension = "ecs:service:DesiredCount"
  policy_type = "TargetTrackingScaling"
  target_tracking_scaling_policy_configuration {
    target_value = 70
    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageCPUUtilization"
    }
    scale_out_cooldown = 300
    scale_in_cooldown = 300
    disable_scale_in = false
  }
}
resource "aws_appautoscaling_policy" "memory_scaling_policy" {
  name = "${var.app_name}-memory-scaling-policy-${var.env}"
  service_namespace = "ecs"
  resource_id = "service/${aws_ecs_cluster.cluster.name}/${aws_ecs_service.service.name}"
  scalable_dimension = "ecs:service:DesiredCount"
  policy_type = "TargetTrackingScaling"
  target_tracking_scaling_policy_configuration {
    target_value = 70
    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageMemoryUtilization"
    }
    scale_out_cooldown = 300
    scale_in_cooldown = 300
    disable_scale_in = false
  }
}

I first created the same project without HTTPS and without a custom domain (started small and built it up step by step, initially without auto-scaling, logging, and the other fancy stuff). It works fine: health checks pass, I can connect, and so on.

Then I decided to create the exact same thing, just with HTTPS, and to assign a custom domain instead of calling the API through the ALB's DNS name.

With this setup, tasks are constantly being created and destroyed because the load balancer's health checks are failing.

I did some research but couldn't find a way to debug why this is happening. All I can tell from the container logs is that the app starts up fine with no errors, yet the tasks are terminated because the health checks fail. I can't find any logs explaining why; I can only see that the targets are unhealthy.

Because everything runs inside a VPC, the tasks don't have static IP addresses, and the setup uses HTTPS, everything from the load balancer down to the containers feels like a black box that's impossible to debug.

Out of other ideas, I opened my security group to allow requests from anywhere on all ports to check whether I could call the health check endpoint myself.

It turns out I can, but it returns a 502. Here is a more detailed entry from the load balancer's access logs:

type https
time 2023-02-10T14:37:00.099726Z
elb app/myapp-load-balancer-staging/c6aabdb240600ca8
client:port myip:38255
target:port targetip:3000
request_processing_time -1
target_processing_time -1
response_processing_time -1
elb_status_code 502
target_status_code -
received_bytes 360
sent_bytes 277
"request" "GET https://api.myapp.com:3000/rest/health HTTP/1.1"
"user_agent" "PostmanRuntime/7.29.0"
ssl_cipher <some text>-SHA256
ssl_protocol TLSv1.2
target_group_arn arn:aws:elasticloadbalancing:eu-west-1:myaccountnumber:targetgroup/myapp-blue-target-staging/id
"trace_id" "Root=1-63e6568c-7c78be0f1e967e59370fbb80"
"domain_name" "api.myapp.com"
"chosen_cert_arn" "arn:aws:acm:eu-west-1:myaccountnumber:certificate/certid"
matched_rule_priority 0
request_creation_time 2023-02-10T14:37:00.096000Z
"actions_executed" "forward"
"redirect_url" "-"
"error_reason" "-"
"target:port_list" "172.31.2.112:3000"
"target_status_code_list" "-"
"classification" "Ambiguous"
"classification_reason" "UndefinedContentLengthSemantics"

All I could find was this guide on the topic, but it only explains the problem and doesn't show a solution.

Spotting what I'm doing wrong would already help a lot, but I'd really appreciate guidance on how to debug these things between the load balancer and the containers, since everything is locked down inside the VPC to the point that even the admins can't reach it.

Upvotes: 0

Views: 662

Answers (1)

Mark B

Reputation: 201008

This is because you are using var.port for all of the port settings: the load balancer listener, the target group traffic port, and the container port. You have also configured the target groups to use the HTTPS protocol. However, SSL traffic is terminated at the load balancer. Only the load balancer has an SSL certificate, so only the load balancer can handle HTTPS traffic; the traffic from the load balancer to the container is still plain HTTP.

You need to separate your port and protocol settings so that only the load balancer listener uses port 443 with HTTPS. The target groups (both traffic and health check) and the container should keep using HTTP on your application's port, just like they did before you enabled SSL, when everything was working.
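A minimal sketch of what that split might look like, reusing the variable and resource names from the question and assuming the application inside the container serves plain HTTP on var.port (e.g. 3000):

# Listener: the only hop that speaks HTTPS; TLS is terminated here
resource "aws_lb_listener" "listener" {
  load_balancer_arn = aws_lb.load_balancer.arn
  port              = 443
  protocol          = "HTTPS"
  ssl_policy        = "ELBSecurityPolicy-2016-08"
  certificate_arn   = data.aws_acm_certificate.cert.arn

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.blue_target.arn
  }
}

# Target group: plain HTTP between the load balancer and the tasks
# (the green target group needs the same protocol changes)
resource "aws_lb_target_group" "blue_target" {
  name        = "${var.app_name}-blue-target-${var.env}"
  protocol    = "HTTP"
  port        = var.port
  target_type = "ip"
  vpc_id      = var.vpc_id

  health_check {
    healthy_threshold   = 5
    interval            = 30
    matcher             = "200"
    path                = var.health_check_path
    protocol            = "HTTP"
    timeout             = 10
    unhealthy_threshold = 2
  }
}

The container definition and its port mapping can stay on var.port; the health check probes the traffic port by default, so it will reach the container over HTTP on that same port. Clients would then call https://api.myapp.com/rest/health on the default port 443 rather than on port 3000.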

Upvotes: 1
