Reputation: 3705
I'm quite new to DevOps, and I have been struggling to set up a test project for a couple of weeks now.
I have made a terraform file which is supposed to set up most of the project:
# Look up the IDs of every subnet that belongs to the VPC given by var.vpc_id.
# The resulting list is consumed by the load balancer and the ECS service.
data "aws_subnets" "subnets" {
  filter {
    values = [var.vpc_id]
    name   = "vpc-id"
  }
}
# Look up the IDs of every security group that belongs to the VPC given by
# var.vpc_id.
# NOTE(review): the whole list is attached to both the load balancer and the
# ECS service, so every group in the VPC applies to them - confirm that this
# is intended rather than a dedicated group per component.
data "aws_security_groups" "security_groups" {
  filter {
    values = [var.vpc_id]
    name   = "vpc-id"
  }
}
# Bucket that receives the load balancer access logs. The name is derived from
# the application name and the environment.
resource "aws_s3_bucket" "lb_logs" {
  bucket = "${var.app_name}-load-balancer-${var.env}-logs"
}

# Encrypt objects at rest with S3-managed keys (AES256) by default.
resource "aws_s3_bucket_server_side_encryption_configuration" "encryption" {
  bucket = aws_s3_bucket.lb_logs.bucket

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}

# Keep previous versions of every log object.
resource "aws_s3_bucket_versioning" "versioning" {
  bucket = aws_s3_bucket.lb_logs.bucket

  versioning_configuration {
    status = "Enabled"
  }
}

# Apply the canned "private" ACL to the log bucket.
resource "aws_s3_bucket_acl" "acl" {
  acl    = "private"
  bucket = aws_s3_bucket.lb_logs.bucket
}
# Account that Elastic Load Balancing uses to deliver access logs in the
# provider's current region. Resolving it here replaces the previously
# hard-coded account ID (156460612806), which is only valid for one region.
data "aws_elb_service_account" "elb_account" {}

# Policy document that lets the regional ELB account write log objects into
# the access-log bucket.
data "aws_iam_policy_document" "lb_logs_s3_put_object" {
  statement {
    effect    = "Allow"
    actions   = ["s3:PutObject"]
    resources = ["${aws_s3_bucket.lb_logs.arn}/*"]

    principals {
      type        = "AWS"
      identifiers = [data.aws_elb_service_account.elb_account.arn]
    }
  }
}

# Attach the policy document above to the access-log bucket.
resource "aws_s3_bucket_policy" "lb_logs_s3_put_object" {
  bucket = aws_s3_bucket.lb_logs.id
  policy = data.aws_iam_policy_document.lb_logs_s3_put_object.json
}
# Application load balancer placed in every subnet of the VPC, with access
# logging to the lb_logs bucket enabled.
resource "aws_lb" "load_balancer" {
  name               = "${var.app_name}-load-balancer-${var.env}"
  load_balancer_type = "application"
  subnets            = data.aws_subnets.subnets.ids
  security_groups    = data.aws_security_groups.security_groups.ids

  access_logs {
    bucket  = aws_s3_bucket.lb_logs.bucket
    enabled = true
  }

  tags = {
    Environment = var.env
  }

  # Enabling access logs requires that ELB can already write to the bucket.
  # Nothing in this resource references the bucket policy, so without an
  # explicit dependency Terraform may create the load balancer first and the
  # request can be rejected with an access-denied error for the bucket.
  depends_on = [aws_s3_bucket_policy.lb_logs_s3_put_object]
}
# "Blue" target group: IP targets in the VPC, reached on var.port.
# NOTE(review): protocol = "HTTPS" (here and in health_check) governs the hop
# from the load balancer to the targets, not the client-facing listener.
# Presumably the container would have to serve TLS itself on var.port for
# these checks to pass - confirm against what the application listens on.
resource "aws_lb_target_group" "blue_target" {
  name        = "${var.app_name}-blue-target-${var.env}"
  vpc_id      = var.vpc_id
  target_type = "ip"
  port        = var.port
  protocol    = "HTTPS"

  health_check {
    protocol            = "HTTPS"
    path                = var.health_check_path
    matcher             = 200
    interval            = 30
    timeout             = 10
    healthy_threshold   = 5
    unhealthy_threshold = 2
  }
}
# "Green" target group: same settings as the blue one; the second member of
# the pair handed to the CodeDeploy deployment group.
# NOTE(review): protocol = "HTTPS" (here and in health_check) governs the hop
# from the load balancer to the targets, not the client-facing listener.
# Presumably the container would have to serve TLS itself on var.port for
# these checks to pass - confirm against what the application listens on.
resource "aws_lb_target_group" "green_target" {
  name        = "${var.app_name}-green-target-${var.env}"
  vpc_id      = var.vpc_id
  target_type = "ip"
  port        = var.port
  protocol    = "HTTPS"

  health_check {
    protocol            = "HTTPS"
    path                = var.health_check_path
    matcher             = 200
    interval            = 30
    timeout             = 10
    healthy_threshold   = 5
    unhealthy_threshold = 2
  }
}
# Most recently issued ACM certificate for var.domain; used by the HTTPS
# listener.
data "aws_acm_certificate" "cert" {
  domain      = var.domain
  most_recent = true
  statuses    = ["ISSUED"]
}
# HTTPS listener on the load balancer. It presents the ACM certificate and
# forwards everything to the blue target group by default.
# Note that the listener port is var.port, the same variable the target groups
# and the container use.
resource "aws_lb_listener" "listener" {
  load_balancer_arn = aws_lb.load_balancer.arn
  protocol          = "HTTPS"
  port              = var.port
  certificate_arn   = data.aws_acm_certificate.cert.arn
  ssl_policy        = "ELBSecurityPolicy-2016-08"

  default_action {
    target_group_arn = aws_lb_target_group.blue_target.arn
    type             = "forward"
  }
}
# ECS
# Cluster that hosts the service, named after the application and environment.
resource "aws_ecs_cluster" "cluster" {
  name = "${var.app_name}-cluster-${var.env}"
}

# Existing ECR repository (not managed here) the container image is pulled
# from.
data "aws_ecr_repository" "ecr_repository" {
  name = var.image_repo_name
}
# Role referenced as task_role_arn by the task definition. The trust policy
# lets the ecs-tasks service principal assume it.
resource "aws_iam_role" "ecs_task_role" {
  name = "EcsTaskRole"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = ""
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = "ecs-tasks.amazonaws.com"
        }
      }
    ]
  })
}
# Customer-managed policy granting read-style Secrets Manager actions on all
# resources.
resource "aws_iam_policy" "secrets_manager_read_policy" {
  name        = "SecretsManagerRead"
  description = "Read only access to secrets manager"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid      = ""
        Effect   = "Allow"
        Resource = "*"
        Action = [
          "secretsmanager:GetRandomPassword",
          "secretsmanager:GetResourcePolicy",
          "secretsmanager:GetSecretValue",
          "secretsmanager:DescribeSecret",
          "secretsmanager:ListSecretVersionIds",
          "secretsmanager:ListSecrets"
        ]
      }
    ]
  })
}
# Policies attached to the task role.

# Secrets Manager read access (customer-managed policy defined in this file).
resource "aws_iam_role_policy_attachment" "attach_secrets_manager_read_to_task_role" {
  policy_arn = aws_iam_policy.secrets_manager_read_policy.arn
  role       = aws_iam_role.ecs_task_role.name
}

# AWS-managed S3 read-only access.
resource "aws_iam_role_policy_attachment" "attach_s3_read_to_task_role" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
  role       = aws_iam_role.ecs_task_role.name
}

# AWS-managed SES full access.
resource "aws_iam_role_policy_attachment" "attach_ses_to_task_role" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonSESFullAccess"
  role       = aws_iam_role.ecs_task_role.name
}
# Role referenced as execution_role_arn by the task definition. The trust
# policy lets the ecs-tasks service principal assume it.
resource "aws_iam_role" "ecs_exec_role" {
  name = "EcsExecRole"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = ""
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = "ecs-tasks.amazonaws.com"
        }
      }
    ]
  })
}
# Customer-managed policy that allows creating CloudWatch Logs log groups.
# The task definition sets "awslogs-create-group" = "true", which is what
# needs this permission.
resource "aws_iam_policy" "log_groups_write_policy" {
  name = "LogGroupsWrite"
  # The description used to be a copy of the Secrets Manager policy's text;
  # it now states what this policy actually grants.
  description = "Allows creating CloudWatch Logs log groups"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid      = ""
        Effect   = "Allow"
        Action   = ["logs:CreateLogGroup"]
        Resource = "*"
      }
    ]
  })
}
# Policies attached to the execution role.

# Despite its label, this attaches the log-group creation policy
# (log_groups_write_policy), not a Secrets Manager policy. The label is left
# unchanged because it is the resource's address in state.
resource "aws_iam_role_policy_attachment" "attach_secrets_manager_read_to_exec_role" {
  policy_arn = aws_iam_policy.log_groups_write_policy.arn
  role       = aws_iam_role.ecs_exec_role.name
}

# AWS-managed ECS task execution policy.
resource "aws_iam_role_policy_attachment" "attach_ecs_task_exec_to_exec_role" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
  role       = aws_iam_role.ecs_exec_role.name
}

# AWS-managed Fault Injection Simulator ECS access policy.
# NOTE(review): confirm the execution role really needs this one.
resource "aws_iam_role_policy_attachment" "attach_fault_injection_simulator_to_exec_role" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSFaultInjectionSimulatorECSAccess"
  role       = aws_iam_role.ecs_exec_role.name
}
# Fargate task definition with a single essential container built from the
# ":latest" tag of the ECR repository.
resource "aws_ecs_task_definition" "task_def" {
  family                   = "${var.app_name}-task-def-${var.env}"
  requires_compatibilities = ["FARGATE"]
  network_mode             = "awsvpc"
  cpu                      = "256"
  memory                   = "512"
  execution_role_arn       = aws_iam_role.ecs_exec_role.arn
  task_role_arn            = aws_iam_role.ecs_task_role.arn

  runtime_platform {
    operating_system_family = "LINUX"
    cpu_architecture        = "X86_64"
  }

  container_definitions = jsonencode([
    {
      # This name is what the ECS service's load_balancer block refers to.
      name      = "${var.app_name}-container-${var.env}"
      image     = "${data.aws_ecr_repository.ecr_repository.repository_url}:latest"
      essential = true
      cpu       = 0

      # The container listens on var.port; host and container port are equal.
      portMappings = [
        {
          containerPort = var.port
          hostPort      = var.port
        }
      ]

      # Environment variable values must be strings, hence tostring().
      environment = [
        {
          name  = "PORT"
          value = tostring(var.port)
        },
        {
          name  = "NODE_ENV"
          value = var.env
        }
      ]

      # Ship container output to CloudWatch Logs, creating the group on demand.
      logConfiguration = {
        logDriver = "awslogs"
        options = {
          "awslogs-create-group"  = "true"
          "awslogs-group"         = "${var.app_name}-task-def-${var.env}"
          "awslogs-region"        = var.region
          "awslogs-stream-prefix" = "ecs"
        }
      }
    }
  ])
}
# ECS service running the task definition on Fargate behind the blue target
# group. Deployments are handled by CodeDeploy (blue/green).
resource "aws_ecs_service" "service" {
  name                = "${var.app_name}-service-${var.env}"
  cluster             = aws_ecs_cluster.cluster.arn
  task_definition     = aws_ecs_task_definition.task_def.arn
  scheduling_strategy = "REPLICA"
  platform_version    = "1.4.0"
  desired_count       = 1

  deployment_controller {
    type = "CODE_DEPLOY"
  }

  capacity_provider_strategy {
    capacity_provider = "FARGATE"
    base              = 0
    weight            = 1
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.blue_target.arn
    container_name   = "${var.app_name}-container-${var.env}"
    container_port   = var.port
  }

  network_configuration {
    assign_public_ip = true
    subnets          = data.aws_subnets.subnets.ids
    security_groups  = data.aws_security_groups.security_groups.ids
  }

  lifecycle {
    ignore_changes = [
      # CodeDeploy swaps the task definition and the active target group.
      task_definition,
      load_balancer,
      # Application Auto Scaling adjusts the count between 1 and 5; without
      # this entry every apply would reset the service back to 1 task.
      desired_count,
    ]
  }

  # The service only references the target group, not the listener. ECS
  # rejects a target group that is not yet attached to a load balancer, so
  # make sure the listener exists before the service is created.
  depends_on = [aws_lb_listener.listener]
}
# DEPLOYMENT
# CodeDeploy application for the ECS compute platform.
resource "aws_codedeploy_app" "codedeploy_app" {
  compute_platform = "ECS"
  name             = "${var.app_name}-application-${var.env}"
}
# Service role for the deployment group. The trust policy lets the codedeploy
# service principal assume it.
resource "aws_iam_role" "codedeploy_role" {
  name = "CodedeployRole"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Sid    = ""
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = "codedeploy.amazonaws.com"
        }
      }
    ]
  })
}
# AWS-managed policies attached to the CodeDeploy service role.

resource "aws_iam_role_policy_attachment" "attach_codedeploy_role" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSCodeDeployRole"
  role       = aws_iam_role.codedeploy_role.name
}

resource "aws_iam_role_policy_attachment" "attach_codedeploy_role_for_ecs" {
  policy_arn = "arn:aws:iam::aws:policy/AWSCodeDeployRoleForECS"
  role       = aws_iam_role.codedeploy_role.name
}
# Blue/green deployment group for the ECS service. Traffic is switched on the
# load balancer listener between the blue and green target groups.
resource "aws_codedeploy_deployment_group" "deployment_group" {
  app_name               = aws_codedeploy_app.codedeploy_app.name
  deployment_group_name  = "${var.app_name}-deployment-group-${var.env}"
  service_role_arn       = aws_iam_role.codedeploy_role.arn
  deployment_config_name = "CodeDeployDefault.ECSAllAtOnce"

  ecs_service {
    cluster_name = aws_ecs_cluster.cluster.name
    service_name = aws_ecs_service.service.name
  }

  deployment_style {
    deployment_type   = "BLUE_GREEN"
    deployment_option = "WITH_TRAFFIC_CONTROL"
  }

  blue_green_deployment_config {
    # Shift traffic to the replacement task set without waiting.
    deployment_ready_option {
      action_on_timeout    = "CONTINUE_DEPLOYMENT"
      wait_time_in_minutes = 0
    }

    # Terminate the original task set 5 minutes after a successful deployment.
    terminate_blue_instances_on_deployment_success {
      action                           = "TERMINATE"
      termination_wait_time_in_minutes = 5
    }
  }

  load_balancer_info {
    target_group_pair_info {
      prod_traffic_route {
        listener_arns = [aws_lb_listener.listener.arn]
      }

      target_group {
        name = aws_lb_target_group.blue_target.name
      }

      target_group {
        name = aws_lb_target_group.green_target.name
      }
    }
  }

  # Roll back automatically when a deployment fails.
  auto_rollback_configuration {
    enabled = true
    events  = ["DEPLOYMENT_FAILURE"]
  }
}
# Register the ECS service's desired count as a scalable target, bounded to
# between 1 and 5 tasks.
resource "aws_appautoscaling_target" "scalable_target" {
  service_namespace  = "ecs"
  scalable_dimension = "ecs:service:DesiredCount"
  resource_id        = "service/${aws_ecs_cluster.cluster.name}/${aws_ecs_service.service.name}"
  max_capacity       = 5
  min_capacity       = 1
}
# Target-tracking policy that keeps the service's average CPU utilization
# around 70%.
resource "aws_appautoscaling_policy" "cpu_scaling_policy" {
  name        = "${var.app_name}-cpu-scaling-policy-${var.env}"
  policy_type = "TargetTrackingScaling"

  # Take the identifiers from the scalable target instead of rebuilding the
  # same strings. The values are identical, but the reference makes Terraform
  # register the scalable target before it creates this policy.
  service_namespace  = aws_appautoscaling_target.scalable_target.service_namespace
  resource_id        = aws_appautoscaling_target.scalable_target.resource_id
  scalable_dimension = aws_appautoscaling_target.scalable_target.scalable_dimension

  target_tracking_scaling_policy_configuration {
    target_value       = 70
    scale_out_cooldown = 300
    scale_in_cooldown  = 300
    disable_scale_in   = false

    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageCPUUtilization"
    }
  }
}
# Target-tracking policy that keeps the service's average memory utilization
# around 70%.
resource "aws_appautoscaling_policy" "memory_scaling_policy" {
  name        = "${var.app_name}-memory-scaling-policy-${var.env}"
  policy_type = "TargetTrackingScaling"

  # Take the identifiers from the scalable target instead of rebuilding the
  # same strings. The values are identical, but the reference makes Terraform
  # register the scalable target before it creates this policy.
  service_namespace  = aws_appautoscaling_target.scalable_target.service_namespace
  resource_id        = aws_appautoscaling_target.scalable_target.resource_id
  scalable_dimension = aws_appautoscaling_target.scalable_target.scalable_dimension

  target_tracking_scaling_policy_configuration {
    target_value       = 70
    scale_out_cooldown = 300
    scale_in_cooldown  = 300
    disable_scale_in   = false

    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageMemoryUtilization"
    }
  }
}
I've already created a project without HTTPS and without a custom domain (I started small and built it step by step, at first without auto-scaling, logging, and other fancy stuff). It works fine: health checks pass, I can connect, etc.
I've decided to create the exact same thing just with HTTPS, and instead of using the alb's dns to call the api, assign a custom domain.
The load balancer is constantly creating/destroying instances because health checks are failing.
I was doing some research, and I couldn't find a way to debug why this is happening. All I know from the container logs is that it starts, all good, no errors, but they are being terminated because health checks are failing. I cannot access any logs about why, I can only see that there are unhealthy targets.
Now because it is in a VPC, and they don't have a static ip address, and set to HTTPS, it seems like from the load balancer level down to containers it's a black box where it's impossible to debug.
As I couldn't think of anything else, I set my security group to allow all traffic on all ports to check whether I could call the health check endpoint myself.
Turns out I can, but it returns 502. More detailed logs from the load balancer:
type https
time 2023-02-10T14:37:00.099726Z
elb app/myapp-load-balancer-staging/c6aabdb240600ca8
client:port myip:38255
target:port targetip:3000
request_processing_time -1
target_processing_time -1
response_processing_time -1
elb_status_code 502
target_status_code -
received_bytes 360
sent_bytes 277
"request" "GET https://api.myapp.com:3000/rest/health HTTP/1.1"
"user_agent" "PostmanRuntime/7.29.0"
ssl_cipher <some text>-SHA256
ssl_protocol TLSv1.2
target_group_arn arn:aws:elasticloadbalancing:eu-west-1:myaccountnumber:targetgroup/myapp-blue-target-staging/id
"trace_id" "Root=1-63e6568c-7c78be0f1e967e59370fbb80"
"domain_name" "api.myapp.com"
"chosen_cert_arn" "arn:aws:acm:eu-west-1:myaccountnumber:certificate/certid"
matched_rule_priority 0
request_creation_time 2023-02-10T14:37:00.096000Z
"actions_executed" "forward"
"redirect_url" "-"
"error_reason" "-"
"target:port_list" "172.31.2.112:3000"
"target_status_code_list" "-"
"classification" "Ambiguous"
"classification_reason" "UndefinedContentLengthSemantics"
All I could find is this guide on the topic, but it just explained the problem, didn't show a solution.
Helping me to spot what I'm doing wrong would help a lot, but I'd really appreciate a guide on how to debug these things between the load balancer and containers as they are set so secure with vpcs and everything that even the admins cannot access them.
Upvotes: 0
Views: 662
Reputation: 201008
This is because you are using var.port for all the port settings: the load balancer listener, the target group traffic port, and the container port. You have also configured the target group to use the HTTPS protocol. However, the SSL traffic is terminated at the load balancer. Only the load balancer has an SSL certificate, so only the load balancer can handle HTTPS traffic. The traffic from the load balancer to the container is still HTTP.
You need to separate your port settings and traffic protocol settings so that only the load balancer listener uses port 443/HTTPS. The target group and the container should be configured to use HTTP, just as they were when everything was working for you, before you enabled SSL.
Upvotes: 1