Reputation: 61
I have noticed that our GKE cluster's system pods (gke-metrics-agent) are running out of memory. I tried editing the DaemonSet YAML file to increase both the memory request and the memory limit to 200Mi. However, the change was not retained: the DaemonSet was recreated with its previous default of 50Mi. pod status image
Please help me to increase memory resource of gke-metrics-agent
Upvotes: 6
Views: 4754
Reputation: 21
Use the below command
# Replace NODENAME with the name of the node to label; --overwrite allows an existing value of the label to be replaced.
kubectl label node/NODENAME cloud.google.com/gke-metrics-agent-scaling-level=20 --overwrite
Upvotes: 2
Reputation: 216
You can add this label to your nodes; it will increase the memory limit of the gke-metrics-agent pods from 50Mi to 70Mi.
# Replace NODENAME with the name of the node to label; --overwrite allows an existing value of the label to be replaced.
kubectl label node/NODENAME cloud.google.com/gke-metrics-agent-scaling-level=20 --overwrite
Upvotes: 2
Reputation: 178
Generally CrashLoopBackOff
indicates that a container is repeatedly crashing after restarting. You can follow the documentation to troubleshoot CrashLoopBackOff
issues.
A possible workaround that would limit the OOM killing of gke-metrics-agent is to increase the memory limit for the gke-metrics-agent pods. This can be done by disabling GKE monitoring and using a custom metrics agent manifest to deploy gke-metrics-agent to the cluster. This will allow you to adjust the memory resources for gke-metrics-agent to stop it from being killed.
To do so, you can follow the steps below:
# Fill in these three values for your cluster before running the commands below.
# They are quoted so the <...> placeholders are not parsed as shell redirections.
CLUSTER="<cluster_name>"
PROJECT="<project>"
LOCATION="<location>"
# Disable the GKE-managed monitoring service while keeping logging enabled.
# NOTE(review): --zone is passed the cluster location; a regional cluster may need --region instead - confirm.
gcloud container clusters update "$CLUSTER" --zone="$LOCATION" --project="$PROJECT" --monitoring-service=none --logging-service=logging.googleapis.com/kubernetes
# Substitute the {{.ClusterName}} and {{.Location}} placeholders in the manifest
# (saved as metrics-agent.yaml) and apply the result to the cluster.
sed -u -e "s/{{.ClusterName}}/${CLUSTER}/g" -e "s/{{.Location}}/${LOCATION}/g" metrics-agent.yaml | kubectl apply -f -
---
# Configuration for the custom gke-metrics-agent. The DaemonSets in this file
# mount the key below as /conf/gke-metrics-agent-config.yaml.
# The {{.ClusterName}} and {{.Location}} placeholders are filled in by the sed
# command that pipes this manifest into kubectl; the $NAME references match the
# environment variables set on the agent containers.
apiVersion: v1
kind: ConfigMap
metadata:
  name: gke-metrics-agent-conf
  namespace: default
data:
  gke-metrics-agent-config: |
    receivers:
      prometheus:
        use_start_time_metric: true
        config:
          scrape_configs:
            - job_name: "kubelet"
              scrape_interval: 60s
              static_configs:
                - targets: ["$KUBELET_HOST:10255"]
              metric_relabel_configs:
                - source_labels: [ __name__ ]
                  target_label: gke_component_name
                  replacement: "nodes/kubelet"
            - job_name: "kubelet-prober"
              scrape_interval: 60s
              static_configs:
                - targets: ["$KUBELET_HOST:10255"]
              metrics_path: /metrics/probes
              metric_relabel_configs:
                - source_labels: [__name__]
                  regex: "prober_probe_total|process_start_time_seconds"
                  action: keep
                - source_labels: [ __name__ ]
                  target_label: gke_component_name
                  replacement: "nodes/kubelet"
            - job_name: "addons"
              scrape_interval: 60s
              kubernetes_sd_configs:
                - role: pod
                  namespaces:
                    names:
                      - kube-system
                  selectors:
                    - role: pod
                      field: "spec.nodeName=$NODE_NAME"
              relabel_configs:
                - source_labels: [ __meta_kubernetes_pod_container_port_name ]
                  regex: ".*metrics"
                  action: keep
                - source_labels: [ __meta_kubernetes_pod_annotationpresent_components_gke_io_component_name ]
                  regex: true
                  action: keep
                - source_labels: [ __meta_kubernetes_pod_annotationpresent_monitoring_gke_io_path, __meta_kubernetes_pod_annotation_monitoring_gke_io_path ]
                  regex: "true;(.*)"
                  target_label: __metrics_path__
                - source_labels: [ __meta_kubernetes_pod_name ]
                  target_label: pod
                - source_labels: [ __meta_kubernetes_pod_container_name ]
                  target_label: container
                - source_labels: [ __meta_kubernetes_namespace ]
                  target_label: namespace
                - source_labels: [ __meta_kubernetes_pod_annotation_components_gke_io_component_name ]
                  target_label: gke_component_name
                  replacement: "addons/${ARG1}"
                - source_labels: [ gke_component_name ]
                  target_label: gke_component_name
                  regex: "(.*)-(.*)"
                  replacement: "${ARG1}_${ARG2}"
                - source_labels: [ gke_component_name ]
                  target_label: gke_component_name
                  regex: "(.*)-(.*)"
                  replacement: "${ARG1}_${ARG2}"
            - job_name: "coredns"
              scrape_interval: 60s
              static_configs:
                - targets: ["$KUBELET_HOST:9253"]
              metric_relabel_configs:
                - source_labels: [ __name__ ]
                  target_label: gke_component_name
                  replacement: "nodes/coredns"
            - job_name: "coredns-nodecache"
              scrape_interval: 60s
              static_configs:
                - targets: ["$KUBELET_HOST:9353"]
              metric_relabel_configs:
                - source_labels: [ __name__ ]
                  target_label: gke_component_name
                  replacement: "nodes/coredns"
            - job_name: "node"
              scrape_interval: 60s
              static_configs:
                - targets: ["$KUBELET_HOST:10231"]
              metric_relabel_configs:
                - source_labels: [ __name__ ]
                  target_label: gke_component_name
                  replacement: "net/cluster/node"
      kubenode:
        endpoint: "http://$KUBELET_HOST:10255"
        scrape_interval: 60s
        cluster_name: {{.ClusterName}}
        location: {{.Location}}
        node_name: "$NODE_NAME"
        kubernetes_service_host: "$KUBERNETES_SERVICE_HOST"
    exporters:
      stackdriver:
        endpoint: monitoring.googleapis.com:443
        skip_create_metric_descriptor: true
    processors:
      resource:
        type: "host"
        labels:
          cloud.zone: {{.Location}}
          host.name: "$NODE_NAME"
          k8s.cluster.name: {{.ClusterName}}
      metrics_export:
        common_prefix: "kubernetes.io/internal"
        detect_container_metrics: true
        allowed_labels:
          - "project"
          - "location"
          - "cluster_name"
          - "node_name"
          - "namespace"
          - "pod"
          - "container"
        export_map:
          - "kubernetes.io/internal/nodes/kubelet/process_start_time_seconds":
              drop: true
          - "kubernetes.io/internal/nodes/kubelet/kubelet_docker_operations_total":
              allowed_labels:
                - "operation_type"
              export_name: "kubernetes.io/internal/nodes/kubelet/docker_operations_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/kubelet/kubelet_docker_operations_errors_total":
              allowed_labels:
                - "operation_type"
              export_name: "kubernetes.io/internal/nodes/kubelet/docker_operations_errors_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/kubelet/kubelet_runtime_operations_total":
              allowed_labels:
                - "operation_type"
              export_name: "kubernetes.io/internal/nodes/kubelet/runtime_operations_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/kubelet/kubelet_runtime_operations_errors_total":
              allowed_labels:
                - "operation_type"
              export_name: "kubernetes.io/internal/nodes/kubelet/runtime_operations_errors_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/kubelet/rest_client_requests_total":
              allowed_labels:
                - "code"
                - "method"
                - "host"
              export_as_int: true
          - "kubernetes.io/internal/nodes/kubelet/storage_operation_duration_seconds":
              allowed_labels:
                - "volume_plugin"
                - "operation_name"
          - "kubernetes.io/internal/nodes/kubelet/kubelet_network_plugin_operations_duration_seconds":
              allowed_labels:
                - "operation_type"
              export_name: "kubernetes.io/internal/nodes/kubelet/network_plugin_operations_duration_seconds"
          - "kubernetes.io/internal/nodes/kubelet/storage_operation_errors_total":
              allowed_labels:
                - "volume_plugin"
                - "operation_name"
              export_as_int: true
          - "kubernetes.io/internal/nodes/kubelet/storage_operation_status_count":
              allowed_labels:
                - "volume_plugin"
                - "operation_name"
                - "status"
              export_as_int: true
          - "kubernetes.io/internal/nodes/kubelet/prober_probe_total":
              allowed_labels:
                - "container"
                - "namespace"
                - "pod"
                - "pod_uid"
                - "result"
                - "probe_type"
              export_as_int: true
              is_container_metric: true
          - "kubernetes.io/internal/nodes/coredns/process_start_time_seconds":
              drop: true
          - "kubernetes.io/internal/nodes/coredns/coredns_cache_drops_total":
              allowed_labels:
                - "server"
              export_name: "kubernetes.io/internal/nodes/coredns/cache_drops_total"
          - "kubernetes.io/internal/nodes/coredns/coredns_cache_hits_total":
              allowed_labels:
                - "server"
                - "type"
              export_name: "kubernetes.io/internal/nodes/coredns/cache_hits_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/coredns/coredns_cache_misses_total":
              allowed_labels:
                - "server"
              export_name: "kubernetes.io/internal/nodes/coredns/cache_misses_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/coredns/coredns_cache_prefetch_total":
              allowed_labels:
                - "server"
              export_name: "kubernetes.io/internal/nodes/coredns/cache_prefetch_total"
          - "kubernetes.io/internal/nodes/coredns/coredns_cache_size":
              allowed_labels:
                - "server"
                - "type"
              export_name: "kubernetes.io/internal/nodes/coredns/cache_size"
              export_as_int: true
          - "kubernetes.io/internal/nodes/coredns/coredns_dns_request_count_total":
              allowed_labels:
                - "family"
                - "proto"
                - "server"
                - "zone"
              export_name: "kubernetes.io/internal/nodes/coredns/dns_request_count_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/coredns/coredns_dns_request_duration_seconds":
              allowed_labels:
                - "server"
                - "zone"
              export_name: "kubernetes.io/internal/nodes/coredns/dns_request_duration_seconds"
          - "kubernetes.io/internal/nodes/coredns/coredns_dns_request_type_count_total":
              allowed_labels:
                - "server"
                - "type"
                - "zone"
              export_name: "kubernetes.io/internal/nodes/coredns/dns_request_type_count_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/coredns/coredns_dns_response_rcode_count_total":
              allowed_labels:
                - "rcode"
                - "server"
                - "zone"
              export_name: "kubernetes.io/internal/nodes/coredns/dns_response_rcode_count_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/coredns/coredns_forward_healthcheck_failure_count_total":
              allowed_labels:
                - "to"
              export_name: "kubernetes.io/internal/nodes/coredns/forward_healthcheck_failure_count_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/coredns/coredns_forward_request_count_total":
              allowed_labels:
                - "to"
              export_name: "kubernetes.io/internal/nodes/coredns/forward_request_count_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/coredns/coredns_forward_request_duration_seconds":
              allowed_labels:
                - "to"
              export_name: "kubernetes.io/internal/nodes/coredns/forward_request_duration_seconds"
          - "kubernetes.io/internal/nodes/coredns/coredns_forward_response_rcode_count_total":
              allowed_labels:
                - "rcode"
                - "to"
              export_name: "kubernetes.io/internal/nodes/coredns/forward_response_rcode_count_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/coredns/coredns_forward_sockets_open":
              allowed_labels:
                - "to"
              export_name: "kubernetes.io/internal/nodes/coredns/forward_sockets_open"
              export_as_int: true
          - "kubernetes.io/internal/nodes/coredns/coredns_health_request_duration_seconds":
              allowed_labels: []
              export_name: "kubernetes.io/internal/nodes/coredns/health_request_duration_seconds"
              export_as_int: true
          - "kubernetes.io/internal/nodes/coredns/coredns_panic_count_total":
              allowed_labels: []
              export_name: "kubernetes.io/internal/nodes/coredns/dns_panic_count_total"
              export_as_int: true
          - "kubernetes.io/internal/nodes/coredns/nodecache_setup_errors_total":
              allowed_labels:
                - "errortype"
              export_name: "kubernetes.io/internal/nodes/coredns/nodecache_setup_errors_total"
          - "kubernetes.io/internal/net/cluster/node/process_start_time_seconds":
              drop: true
          - "kubernetes.io/internal/net/cluster/node/conntrack_entries":
              allowed_labels: []
              export_as_int: true
          - "kubernetes.io/internal/net/cluster/node/conntrack_error_count":
              allowed_labels:
                - "type"
              export_as_int: true
          - "kubernetes.io/internal/net/cluster/node/num_inuse_sockets":
              allowed_labels:
                - "protocol"
              export_as_int: true
          - "kubernetes.io/internal/net/cluster/node/num_tw_sockets":
              allowed_labels: []
              export_as_int: true
          - "kubernetes.io/internal/net/cluster/node/socket_memory":
              allowed_labels: []
              export_as_int: true
          - "kubernetes.io/internal/addons/kubedns/process_start_time_seconds":
              drop: true
          - "kubernetes.io/internal/addons/kubedns/skydns_skydns_dns_request_count_total":
              allowed_labels:
                - "system"
              export_name: "kubernetes.io/internal/addons/kubedns/skydns_dns_request_count_total"
              export_as_int: true
          - "kubernetes.io/internal/addons/kubedns/skydns_skydns_dns_request_duration_seconds":
              allowed_labels:
                - "system"
              export_name: "kubernetes.io/internal/addons/kubedns/skydns_dns_request_duration_seconds"
          - "kubernetes.io/internal/addons/kubedns/skydns_skydns_dns_response_size_bytes":
              allowed_labels:
                - "system"
              export_name: "kubernetes.io/internal/addons/kubedns/skydns_dns_response_size_bytes"
          - "kubernetes.io/internal/addons/kubedns/skydns_skydns_dns_error_count_total":
              allowed_labels:
                - "system"
                - "cause"
              export_name: "kubernetes.io/internal/addons/kubedns/skydns_dns_error_count_total"
              export_as_int: true
          - "kubernetes.io/internal/addons/kubedns/skydns_skydns_dns_cachemiss_count_total":
              allowed_labels:
                - "cache"
              export_name: "kubernetes.io/internal/addons/kubedns/skydns_dns_cachemiss_count_total"
              export_as_int: true
    extensions:
      observability:
        endpoint: monitoring.googleapis.com:443
        prefix: "kubernetes.io/internal/addons/gke_otelsvc"
        resource:
          type: "k8s_container"
          labels:
            location: {{.Location}}
            cluster_name: {{.ClusterName}}
            pod_name: "$POD_NAME"
            namespace_name: "$POD_NAMESPACE"
            container_name: "gke-metrics-agent"
    service:
      extensions:
        - observability
      pipelines:
        metrics/kube:
          receivers:
            - kubenode
          exporters:
            - stackdriver
        metrics/prom:
          receivers:
            - prometheus
          processors:
            - resource
            - metrics_export
          exporters:
            - stackdriver
---
# Identity used by the gke-metrics-agent pods; it is the subject of the
# gke-metrics-agent ClusterRoleBinding defined in this manifest.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: gke-metrics-agent
  namespace: default
---
# Pod security policy for the agent: unprivileged, host networking allowed,
# and hostPath volumes restricted to /etc/ssl/certs.
# NOTE(review): PodSecurityPolicy (policy/v1beta1) was removed in Kubernetes
# 1.25; on clusters at or above that version this document (and the matching
# "use" rule in the ClusterRole) has to be dropped before applying.
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
  annotations:
    apparmor.security.beta.kubernetes.io/allowedProfileNames: runtime/default
    apparmor.security.beta.kubernetes.io/defaultProfileName: runtime/default
    kubernetes.io/description: Policy used by the gke-metrics-agent addon.
    seccomp.security.alpha.kubernetes.io/allowedProfileNames: runtime/default,docker/default
    seccomp.security.alpha.kubernetes.io/defaultProfileName: docker/default
  name: gce.gke-metrics-agent
  labels:
    kubernetes.io/cluster-service: 'true'
spec:
  privileged: false
  allowPrivilegeEscalation: false
  volumes:
    - 'hostPath'
    - 'secret'
    - 'configMap'
  allowedHostPaths:
    - pathPrefix: /etc/ssl/certs
  hostNetwork: true
  hostIPC: false
  hostPID: false
  runAsUser:
    rule: 'RunAsAny'
  seLinux:
    rule: 'RunAsAny'
  supplementalGroups:
    rule: 'RunAsAny'
  fsGroup:
    rule: 'RunAsAny'
  readOnlyRootFilesystem: false
---
# Permissions for the agent: read nodes, list/watch pods, and use the
# gce.gke-metrics-agent PodSecurityPolicy.
# Uses rbac.authorization.k8s.io/v1 because the v1beta1 RBAC API was removed
# in Kubernetes 1.22; the schema of this object is the same in both versions.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: gke-metrics-agent
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - ""
    resources:
      - pods
    verbs:
      - list
      - watch
  - apiGroups:
      - policy
    resourceNames:
      - gce.gke-metrics-agent
    resources:
      - podsecuritypolicies
    verbs:
      - use
---
# Grants the gke-metrics-agent ClusterRole to the gke-metrics-agent
# ServiceAccount in the default namespace.
# Uses rbac.authorization.k8s.io/v1 because the v1beta1 RBAC API was removed
# in Kubernetes 1.22.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: gke-metrics-agent
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: gke-metrics-agent
subjects:
  - kind: ServiceAccount
    name: gke-metrics-agent
    namespace: default
---
# linux deployment
# Runs one gke-metrics-agent pod on every Linux node, on the host network,
# with the agent configuration from the gke-metrics-agent-conf ConfigMap
# mounted at /conf.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gke-metrics-agent
  namespace: default
  labels:
    k8s-app: gke-metrics-agent
    component: gke-metrics-agent
spec:
  selector:
    matchLabels:
      k8s-app: gke-metrics-agent
      component: gke-metrics-agent
  template:
    metadata:
      labels:
        k8s-app: gke-metrics-agent
        component: gke-metrics-agent
    spec:
      nodeSelector:
        kubernetes.io/os: linux
      # Tolerate every NoExecute/NoSchedule taint so the agent runs on all nodes.
      tolerations:
        - effect: NoExecute
          operator: Exists
        - effect: NoSchedule
          operator: Exists
      hostNetwork: true
      # serviceAccountName replaces the deprecated "serviceAccount" alias.
      serviceAccountName: gke-metrics-agent
      containers:
        - name: gke-metrics-agent
          image: "gcr.io/gke-release/gke-metrics-agent:0.1.3-gke.0"
          # Raise the memory request/limit here if the agent is being OOM-killed.
          resources:
            requests:
              memory: 50Mi
              cpu: 3m
            limits:
              memory: 70Mi
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: KUBELET_HOST
              value: "127.0.0.1"
            # ARG1/ARG2 are referenced as ${ARG1}/${ARG2} in the relabel
            # "replacement" fields of the agent config; presumably this yields
            # the literal ${1}/${2} regex group references - TODO confirm.
            - name: ARG1
              value: "${1}"
            - name: ARG2
              value: "${2}"
            - name: WINDOWS_JOB_ACTION
              value: "drop"
          command:
            - "/otelsvc"
            - "--config=/conf/gke-metrics-agent-config.yaml"
            - "--metrics-level=NONE"
          volumeMounts:
            - name: gke-metrics-agent-config-vol
              mountPath: /conf
            - name: ssl-certs
              mountPath: /etc/ssl/certs
              readOnly: true
      volumes:
        - configMap:
            name: gke-metrics-agent-conf
            items:
              - key: gke-metrics-agent-config
                path: gke-metrics-agent-config.yaml
          name: gke-metrics-agent-config-vol
        - name: ssl-certs
          hostPath:
            path: /etc/ssl/certs
---
# windows deployment
# Runs one gke-metrics-agent pod on every Windows node, with the agent
# configuration from the gke-metrics-agent-conf ConfigMap mounted at /conf.
# NOTE(review): the selector labels are identical to those of the Linux
# gke-metrics-agent DaemonSet; confirm that the overlap is intended.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gke-metrics-agent-windows
  namespace: default
  labels:
    k8s-app: gke-metrics-agent
    component: gke-metrics-agent
spec:
  selector:
    matchLabels:
      k8s-app: gke-metrics-agent
      component: gke-metrics-agent
  template:
    metadata:
      labels:
        k8s-app: gke-metrics-agent
        component: gke-metrics-agent
    spec:
      nodeSelector:
        kubernetes.io/os: windows
      tolerations:
        - effect: NoExecute
          key: node.kubernetes.io/not-ready
          operator: Exists
          tolerationSeconds: 300
        - effect: NoExecute
          key: node.kubernetes.io/unreachable
          operator: Exists
          tolerationSeconds: 300
        - effect: NoSchedule
          key: node.kubernetes.io/os
          operator: Equal
          value: windows
      # serviceAccountName replaces the deprecated "serviceAccount" alias.
      serviceAccountName: gke-metrics-agent
      containers:
        - name: gke-metrics-agent
          # NOTE(review): this registry host is gke.io while the Linux image
          # uses gcr.io - verify that the image reference is correct.
          image: "gke.io/gke-release/gke-metrics-agent-windows:0.3.1-gke.2"
          resources:
            requests:
              cpu: 5m
              memory: 200Mi
            limits:
              memory: 200Mi
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            # No hostNetwork here, so the kubelet is addressed via the node IP.
            - name: KUBELET_HOST
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP
            - name: KUBERNETES_SERVICE_HOST
              value: "kubernetes.default.svc.cluster.local"
            # ARG1/ARG2 are referenced as ${ARG1}/${ARG2} in the relabel
            # "replacement" fields of the agent config; presumably this yields
            # the literal ${1}/${2} regex group references - TODO confirm.
            - name: ARG1
              value: "${1}"
            - name: ARG2
              value: "${2}"
            - name: WINDOWS_JOB_ACTION
              value: "keep"
          command:
            - "c:\\otelsvc.exe"
            - "--config=/conf/gke-metrics-agent-config.yaml"
            - "--metrics-level=NONE"
          volumeMounts:
            - name: gke-metrics-agent-config-vol
              mountPath: /conf
      volumes:
        - configMap:
            name: gke-metrics-agent-conf
            items:
              - key: gke-metrics-agent-config
                path: gke-metrics-agent-config.yaml
          name: gke-metrics-agent-config-vol
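NOTE: You can edit the memory limit for the Linux DaemonSet as per your requirements.
To revert to the GKE-managed agent later, delete the custom agent using either of the alternatives below, and then re-enable GKE monitoring with the final gcloud command.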
# Render the manifest exactly as it was applied and delete every object it defines.
sed -u -e "s/{{.ClusterName}}/${CLUSTER}/g" -e "s/{{.Location}}/${LOCATION}/g" metrics-agent.yaml | kubectl delete -f -
OR
# Delete the namespaced objects the manifest created in the "default" namespace.
kubectl delete ds gke-metrics-agent -n default
kubectl delete ds gke-metrics-agent-windows -n default
kubectl delete cm gke-metrics-agent-conf -n default
kubectl delete sa gke-metrics-agent -n default
# The manifest also creates these cluster-scoped objects; remove them too so nothing is left behind.
kubectl delete clusterrolebinding gke-metrics-agent
kubectl delete clusterrole gke-metrics-agent
kubectl delete psp gce.gke-metrics-agent
# Re-enable the GKE-managed monitoring service (logging is left enabled, as before).
gcloud container clusters update "$CLUSTER" --zone="$LOCATION" --project="$PROJECT" --monitoring-service=monitoring.googleapis.com/kubernetes --logging-service=logging.googleapis.com/kubernetes
Upvotes: 5