Reputation: 957
I have provisioned a GKE private cluster using the below terraform script
resource "google_container_cluster" "cluster" {
name = var.cluster_name
project = var.project
location = var.zone
network = google_compute_network.network.self_link
subnetwork = google_compute_subnetwork.subnetwork.self_link
logging_service = "logging.googleapis.com/kubernetes"
monitoring_service = "monitoring.googleapis.com/kubernetes"
remove_default_node_pool = "true"
initial_node_count = 1
addons_config {
network_policy_config {
disabled = false
}
}
workload_identity_config {
identity_namespace = format("%s.svc.id.goog", var.project)
}
master_auth {
username = ""
password = ""
client_certificate_config {
issue_client_certificate = "false"
}
}
network_policy {
enabled = "true"
}
ip_allocation_policy {
cluster_secondary_range_name = google_compute_subnetwork.subnetwork.secondary_ip_range.0.range_name
services_secondary_range_name = google_compute_subnetwork.subnetwork.secondary_ip_range.1.range_name
}
master_authorized_networks_config {
cidr_blocks {
display_name = "bastion"
cidr_block = format("%s/32", google_compute_instance.bastion.network_interface.0.network_ip)
}
}
private_cluster_config {
enable_private_endpoint = "true"
enable_private_nodes = "true"
master_ipv4_cidr_block = "172.16.0.16/28"
}
timeouts {
create = "30m"
update = "30m"
delete = "30m"
}
depends_on = [
google_project_service.service,
google_project_iam_member.service-account,
google_project_iam_member.service-account-custom,
google_compute_router_nat.nat,
]
}
resource "google_container_node_pool" "private-np-1" {
name = "private-np-1"
location = var.zone
cluster = google_container_cluster.cluster.name
node_count = "3"
management {
auto_repair = "true"
auto_upgrade = "false"
}
node_config {
machine_type = "e2-micro"
disk_type = "pd-standard"
disk_size_gb = 100
image_type = "COS"
service_account = google_service_account.gke-sa.email
oauth_scopes = [
"https://www.googleapis.com/auth/devstorage.read_only",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring",
"https://www.googleapis.com/auth/servicecontrol",
"https://www.googleapis.com/auth/service.management.readonly",
"https://www.googleapis.com/auth/trace.append",
]
labels = {
cluster = var.cluster_name
}
workload_metadata_config {
node_metadata = "GKE_METADATA_SERVER"
}
metadata = {
google-compute-enable-virtio-rng = "true"
disable-legacy-endpoints = "true"
}
}
depends_on = [
google_container_cluster.cluster,
]
}
Here's my network terraform script:
resource "google_service_account" "gke-sa" {
account_id = format("%s-node-sa", var.cluster_name)
display_name = "GKE Security Service Account"
project = var.project
}
resource "google_project_iam_member" "service-account" {
count = length(var.service_account_iam_roles)
project = var.project
role = element(var.service_account_iam_roles, count.index)
member = format("serviceAccount:%s", google_service_account.gke-sa.email)
}
resource "google_project_service" "service" {
count = length(var.project_services)
project = var.project
service = element(var.project_services, count.index)
disable_on_destroy = false
}
resource "google_compute_network" "network" {
name = format("%s-network", var.cluster_name)
project = var.project
auto_create_subnetworks = false
depends_on = [
google_project_service.service,
]
}
resource "google_compute_subnetwork" "subnetwork" {
name = format("%s-subnet", var.cluster_name)
project = var.project
network = google_compute_network.network.self_link
region = var.region
ip_cidr_range = "10.0.0.0/24"
private_ip_google_access = true
secondary_ip_range {
range_name = format("%s-pod-range", var.cluster_name)
ip_cidr_range = "10.1.0.0/16"
}
secondary_ip_range {
range_name = format("%s-svc-range", var.cluster_name)
ip_cidr_range = "10.2.0.0/20"
}
}
resource "google_compute_address" "nat" {
name = format("%s-nat-ip", var.cluster_name)
project = var.project
region = var.region
depends_on = [
google_project_service.service,
]
}
resource "google_compute_router" "router" {
name = format("%s-cloud-router", var.cluster_name)
project = var.project
region = var.region
network = google_compute_network.network.self_link
bgp {
asn = 64514
}
}
resource "google_compute_router_nat" "nat" {
name = format("%s-cloud-nat", var.cluster_name)
project = var.project
router = google_compute_router.router.name
region = var.region
nat_ip_allocate_option = "MANUAL_ONLY"
nat_ips = [google_compute_address.nat.self_link]
source_subnetwork_ip_ranges_to_nat = "LIST_OF_SUBNETWORKS"
subnetwork {
name = google_compute_subnetwork.subnetwork.self_link
source_ip_ranges_to_nat = ["PRIMARY_IP_RANGE", "LIST_OF_SECONDARY_IP_RANGES"]
secondary_ip_range_names = [
google_compute_subnetwork.subnetwork.secondary_ip_range.0.range_name,
google_compute_subnetwork.subnetwork.secondary_ip_range.1.range_name,
]
}
}
locals {
hostname = format("%s-bastion", var.cluster_name)
}
resource "google_service_account" "bastion" {
account_id = format("%s-bastion-sa", var.cluster_name)
display_name = "GKE Bastion SA"
}
resource "google_compute_firewall" "bastion-ssh" {
name = format("%s-bastion-ssh", var.cluster_name)
network = google_compute_network.network.name
direction = "INGRESS"
project = var.project
source_ranges = ["0.0.0.0/0"]
allow {
protocol = "tcp"
ports = ["22"]
}
target_tags = ["bastion"]
}
data "template_file" "startup_script" {
template = <<-EOF
sudo apt-get update -y
sudo apt-get install -y tinyproxy
EOF
}
resource "google_compute_instance" "bastion" {
name = local.hostname
machine_type = "g1-small"
zone = var.zone
project = var.project
tags = ["bastion"]
boot_disk {
initialize_params {
image = "debian-cloud/debian-9"
}
}
metadata_startup_script = data.template_file.startup_script.rendered
network_interface {
subnetwork = google_compute_subnetwork.subnetwork.name
access_config {
// Ephemeral IP
}
}
allow_stopping_for_update = true
service_account {
email = google_service_account.bastion.email
scopes = ["cloud-platform"]
}
// This provider is used to block the subsequent providers until the instance
// is available.
provisioner "local-exec" {
command = <<EOF
READY=""
for i in $(seq 1 20); do
if gcloud compute ssh ${local.hostname} --project ${var.project} --zone ${var.region}-a --command uptime; then
READY="yes"
break;
fi
echo "Waiting for ${local.hostname} to initialize..."
sleep 10;
done
if [[ -z $READY ]]; then
echo "${local.hostname} failed to start in time."
echo "Please verify that the instance starts and then re-run `terraform apply`"
exit 1
fi
EOF
}
}
In summary, the above scripts create a private GKE cluster with its own node pool, a custom VPC network and subnetwork (with secondary ranges for pods and services), a Cloud Router and Cloud NAT for outbound traffic, and a bastion host running tinyproxy for reaching the private control plane.
All of these resources get created fine; however, the node statuses are very unstable. Only one of the nodes keeps switching (frequently) between Ready and Unknown, while the other two nodes are always in the Unknown status. Any deployment (even the ingress controller deployment) is failing.
I am not an expert with GKE.
What is the issue here? Why are the nodes in the "Unknown" status?
Upvotes: 0
Views: 574
Reputation: 863
If a node is marked as being in the Unknown state, it means it cannot communicate with the control plane's API server: the kubelet has stopped reporting the node's status.
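In a private cluster like the one in the question, that traffic goes over the VPC peering GKE sets up for the control plane, so anything that blocks egress from the node subnet to the master range (172.16.0.16/28 here) will produce exactly this symptom. As a minimal sketch, assuming some broader rule or policy is denying egress (the resource below is hypothetical and not part of your configuration), an explicit egress allow rule would look roughly like this:

resource "google_compute_firewall" "nodes-to-master" {
  # Hypothetical rule: only needed if another rule or policy denies egress
  # from the nodes to the control plane range.
  name      = format("%s-nodes-to-master", var.cluster_name)
  project   = var.project
  network   = google_compute_network.network.name
  direction = "EGRESS"
  priority  = 900

  # The control plane range configured in private_cluster_config above.
  destination_ranges = ["172.16.0.16/28"]

  allow {
    protocol = "tcp"
    ports    = ["443", "10250"]
  }
}

GKE normally creates the rules it needs by itself, so this is only worth adding if 'gcloud compute firewall-rules list' shows a higher-priority rule denying that egress.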
Additionally, there might be user-installed webhooks that interfere with the kube-system pods. Such webhooks can prevent the CNI plugins from starting and leave a node NotReady/Unknown.
Sometimes this issue occurs because of overloaded resources: a node can end up in an unhealthy state if it does not have sufficient resources to function.
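In this particular setup that is a strong suspect: e2-micro is a shared-core machine type with only 1 GB of memory, and once GKE reserves memory for the kubelet and kube-system pods there is very little left, which matches nodes flapping between Ready and Unknown. A sketch of the node pool from the question with a larger machine type (e2-medium is only an illustrative choice, not a requirement):

resource "google_container_node_pool" "private-np-1" {
  # ... name, location, cluster, node_count and management as in the question ...

  node_config {
    # e2-medium (2 vCPU, 4 GB) is an illustrative choice; the point is to move
    # off a shared-core micro instance so kube-system pods have headroom.
    machine_type = "e2-medium"
    disk_type    = "pd-standard"
    disk_size_gb = 100
    image_type   = "COS"

    # ... rest of node_config unchanged ...
  }
}

Note that changing machine_type forces the node pool to be recreated, so expect Terraform to replace private-np-1.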
The next time the issue occurs, the command 'kubectl describe node <node-name>' will greatly assist in diagnosing the situation.
Typically, a node enters the Unknown status because of one of the node conditions listed below:
- OutOfDisk
- MemoryPressure
- DiskPressure
Upvotes: 0