Reputation: 171
What did you do?
I ran Prometheus 2.0.0 on Kubernetes v1.8.5.
What did you expect to see?
I expected everything to keep running smoothly.
What did you see instead? Under which circumstances?
Everything went well at the beginning, but several hours later the pods' statuses changed to "CrashLoopBackOff" and all Prometheus instances became unavailable. Nothing was changed after the Prometheus pods were created.
[root@k8s-1 prometheus]# kubectl get all -n monitoring
NAME                          DESIRED   CURRENT   AGE
statefulsets/prometheus-k8s   0         2         16h

NAME                  READY     STATUS             RESTARTS   AGE
po/prometheus-k8s-0   0/1       CrashLoopBackOff   81         16h
po/prometheus-k8s-1   0/1       CrashLoopBackOff   22         16h
Environment
[root@k8s-1 prometheus]# kubectl version --short
Client Version: v1.8.5
Server Version: v1.8.5
[root@k8s-1 prometheus]# docker images | grep -i prometheus
quay.io/prometheus/alertmanager    v0.12.0   f87cbd5f1360   5 weeks ago    31.2 MB
quay.io/prometheus/node_exporter   v0.15.2   ff5ecdcfc4a2   6 weeks ago    22.8 MB
quay.io/prometheus/prometheus      v2.0.0    67141fa03496   2 months ago   80.2 MB
System information:
[root@k8s-1 prometheus]# uname -srm
Linux 3.10.0-229.el7.x86_64 x86_64
Prometheus version:
v2.0.0
Prometheus configuration file:
[root@k8s-1 prometheus]# cat prometheus-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-config
  namespace: monitoring
data:
  prometheus.yaml: |
    global:
      scrape_interval: 10s
      scrape_timeout: 10s
      evaluation_interval: 10s
    rule_files:
    - "/etc/prometheus-rules/*.rules"
    scrape_configs:
    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    - job_name: 'kubernetes-nodes'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics
    - job_name: 'kubernetes-cadvisor'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
    - job_name: 'kubernetes-services'
      metrics_path: /probe
      params:
        module: [http_2xx]
      kubernetes_sd_configs:
      - role: service
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
        action: keep
        regex: true
      - source_labels: [__address__]
        target_label: __param_target
      - target_label: __address__
        replacement: blackbox-exporter.example.com:9115
      - source_labels: [__param_target]
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        target_label: kubernetes_name
    - job_name: 'kubernetes-ingresses'
      metrics_path: /probe
      params:
        module: [http_2xx]
      kubernetes_sd_configs:
      - role: ingress
      relabel_configs:
      - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
        regex: (.+);(.+);(.+)
        replacement: ${1}://${2}${3}
        target_label: __param_target
      - target_label: __address__
        replacement: blackbox-exporter.example.com:9115
      - source_labels: [__param_target]
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_ingress_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_ingress_name]
        target_label: kubernetes_name
    - job_name: 'kubernetes-pods'
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
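(As a sanity check on the configuration itself, rather than the storage, the file embedded in the ConfigMap can be validated with promtool; a minimal sketch, assuming a local copy of the file and the promtool binary shipped with the same 2.0.0 release:)

# Hypothetical local check; "prometheus.yaml" here is a local copy of the
# ConfigMap key above, not the file inside the running pod.
$ promtool check config prometheus.yaml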
Prometheus Kubernetes manifests (Service and StatefulSet):
[root@k8s-1 prometheus]# cat prometheus-all-together.yaml
apiVersion: v1
kind: Service
metadata:
  labels:
    prometheus: k8s
  name: prometheus-k8s
  namespace: monitoring
  annotations:
    prometheus.io/scrape: "true"
spec:
  ports:
  - name: web
    nodePort: 30900
    port: 9090
    protocol: TCP
    targetPort: web
  selector:
    prometheus: k8s
  sessionAffinity: None
  type: NodePort
---
apiVersion: apps/v1beta1
kind: StatefulSet
metadata:
  labels:
    prometheus: k8s
  name: prometheus-k8s
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: prometheus
      prometheus: k8s
  serviceName: prometheus-k8s
  replicas: 2
  template:
    metadata:
      labels:
        app: prometheus
        prometheus: k8s
    spec:
      securityContext:
        runAsUser: 65534
        fsGroup: 65534
        runAsNonRoot: true
      containers:
      - args:
        - --config.file=/etc/prometheus/config/prometheus.yaml
        - --storage.tsdb.path=/cephfs/prometheus/data
        - --storage.tsdb.retention=180d
        - --web.route-prefix=/
        - --web.enable-lifecycle
        - --web.enable-admin-api
        image: quay.io/prometheus/prometheus:v2.0.0
        imagePullPolicy: IfNotPresent
        livenessProbe:
          failureThreshold: 10
          httpGet:
            path: /status
            port: web
            scheme: HTTP
          initialDelaySeconds: 30
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 3
        name: prometheus
        ports:
        - containerPort: 9090
          name: web
          protocol: TCP
        readinessProbe:
          failureThreshold: 6
          httpGet:
            path: /status
            port: web
            scheme: HTTP
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 3
        resources:
          requests:
            cpu: 100m
            memory: 200Mi
          limits:
            cpu: 500m
            memory: 500Mi
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /etc/prometheus/config
          name: config
          readOnly: false
        - mountPath: /etc/prometheus/rules
          name: rules
          readOnly: false
        - mountPath: /cephfs/prometheus/data
          name: data
          subPath: prometheus-data
          readOnly: false
      serviceAccount: prometheus-k8s
      serviceAccountName: prometheus-k8s
      terminationGracePeriodSeconds: 60
      volumes:
      - configMap:
          defaultMode: 511
          name: prometheus-k8s-config
        name: config
      - configMap:
          defaultMode: 511
          name: prometheus-k8s-rules
        name: rules
      - name: data
        persistentVolumeClaim:
          claimName: cephfs-pvc
  updateStrategy:
    type: RollingUpdate
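(Both replicas above mount the same claim, cephfs-pvc, at the same subPath; this can be confirmed from the cluster with something like the sketch below, output omitted, names and namespace as above:)

# Inspect the shared claim and each pod's "data" volume (sketch only).
$ kubectl get pvc cephfs-pvc -n monitoring
$ kubectl describe po/prometheus-k8s-0 -n monitoring | grep -A 3 'data:'
$ kubectl describe po/prometheus-k8s-1 -n monitoring | grep -A 3 'data:'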
Logs:
[root@k8s-1 prometheus]# kubectl logs prometheus-k8s-0 -n monitoring
level=info ts=2018-01-20T03:16:32.966070249Z caller=main.go:215 msg="Starting Prometheus" version="(version=2.0.0, branch=HEAD, revision=0a74f98628a0463dddc90528220c94de5032d1a0)"
level=info ts=2018-01-20T03:16:32.966225361Z caller=main.go:216 build_context="(go=go1.9.2, user=root@615b82cb36b6, date=20171108-07:11:59)"
level=info ts=2018-01-20T03:16:32.966252185Z caller=main.go:217 host_details="(Linux 3.10.0-229.el7.x86_64 #1 SMP Fri Mar 6 11:36:42 UTC 2015 x86_64 prometheus-k8s-0 (none))"
level=info ts=2018-01-20T03:16:32.969789371Z caller=web.go:380 component=web msg="Start listening for connections" address=0.0.0.0:9090
level=info ts=2018-01-20T03:16:32.971388907Z caller=main.go:314 msg="Starting TSDB"
level=info ts=2018-01-20T03:16:32.971596811Z caller=targetmanager.go:71 component="target manager" msg="Starting target manager..."
level=error ts=2018-01-20T03:16:59.781338012Z caller=main.go:323 msg="Opening storage failed" err="invalid block sequence: block time ranges overlap (1516348800000, 1516356000000)"
[root@k8s-1 prometheus]#
[root@k8s-1 prometheus]# kubectl logs prometheus-k8s-1 -n monitoring
level=info ts=2018-01-20T03:15:22.701351679Z caller=main.go:215 msg="Starting Prometheus" version="(version=2.0.0, branch=HEAD, revision=0a74f98628a0463dddc90528220c94de5032d1a0)"
level=info ts=2018-01-20T03:15:22.70148418Z caller=main.go:216 build_context="(go=go1.9.2, user=root@615b82cb36b6, date=20171108-07:11:59)"
level=info ts=2018-01-20T03:15:22.701512333Z caller=main.go:217 host_details="(Linux 3.10.0-229.el7.x86_64 #1 SMP Fri Mar 6 11:36:42 UTC 2015 x86_64 prometheus-k8s-1 (none))"
level=info ts=2018-01-20T03:15:22.705824203Z caller=web.go:380 component=web msg="Start listening for connections" address=0.0.0.0:9090
level=info ts=2018-01-20T03:15:22.707629775Z caller=main.go:314 msg="Starting TSDB"
level=info ts=2018-01-20T03:15:22.707837323Z caller=targetmanager.go:71 component="target manager" msg="Starting target manager..."
level=error ts=2018-01-20T03:15:54.775639791Z caller=main.go:323 msg="Opening storage failed" err="invalid block sequence: block time ranges overlap (1516348800000, 1516356000000)"
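(The two numbers in the error are epoch milliseconds, so the overlapping TSDB blocks can be located by reading each block's meta.json under the shared data directory; a sketch assuming the mount path from the StatefulSet above:)

# Each TSDB block is a ULID-named directory whose meta.json records
# minTime/maxTime in milliseconds; print them to spot the overlap (sketch).
$ cd /cephfs/prometheus/data
$ for d in */; do [ -f "$d/meta.json" ] && echo "$d" && grep -E '"(minTime|maxTime)"' "$d/meta.json"; done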
[root@k8s-1 prometheus]# kubectl describe po/prometheus-k8s-0 -n monitoring
Name:           prometheus-k8s-0
Namespace:      monitoring
Node:           k8s-3/172.16.1.8
Start Time:     Fri, 19 Jan 2018 17:59:38 +0800
Labels:         app=prometheus
                controller-revision-hash=prometheus-k8s-7d86dfbd86
                prometheus=k8s
Annotations:    kubernetes.io/created-by={"kind":"SerializedReference","apiVersion":"v1","reference":{"kind":"StatefulSet","namespace":"monitoring","name":"prometheus-k8s","uid":"7593d8ac-fcff-11e7-9333-fa163e48f857"...
Status:         Running
IP:             10.244.2.54
Created By:     StatefulSet/prometheus-k8s
Controlled By:  StatefulSet/prometheus-k8s
Containers:
  prometheus:
    Container ID:  docker://98faabe55fb71050aacd776d349a6567c25c339117159356eedc10cbc19ef02a
    Image:         quay.io/prometheus/prometheus:v2.0.0
    Image ID:      docker-pullable://quay.io/prometheus/prometheus@sha256:53afe934a8d497bb703dbbf7db273681a56677775c462833da8d85015471f7a3
    Port:          9090/TCP
    Args:
      --config.file=/etc/prometheus/config/prometheus.yaml
      --storage.tsdb.path=/cephfs/prometheus/data
      --storage.tsdb.retention=180d
      --web.route-prefix=/
      --web.enable-lifecycle
      --web.enable-admin-api
    State:          Waiting
      Reason:       CrashLoopBackOff
    Last State:     Terminated
      Reason:       Error
      Exit Code:    1
      Started:      Sat, 20 Jan 2018 11:11:00 +0800
      Finished:     Sat, 20 Jan 2018 11:11:29 +0800
    Ready:          False
    Restart Count:  84
    Limits:
      cpu:     500m
      memory:  500Mi
    Requests:
      cpu:     100m
      memory:  200Mi
    Liveness:   http-get http://:web/status delay=30s timeout=3s period=5s #success=1 #failure=10
    Readiness:  http-get http://:web/status delay=0s timeout=3s period=5s #success=1 #failure=6
    Environment:  <none>
    Mounts:
      /cephfs/prometheus/data from data (rw)
      /etc/prometheus/config from config (rw)
      /etc/prometheus/rules from rules (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from prometheus-k8s-token-x8xzh (ro)
Conditions:
  Type           Status
  Initialized    True
  Ready          False
  PodScheduled   True
Volumes:
  config:
    Type:      ConfigMap (a volume populated by a ConfigMap)
    Name:      prometheus-k8s-config
    Optional:  false
  rules:
    Type:      ConfigMap (a volume populated by a ConfigMap)
    Name:      prometheus-k8s-rules
    Optional:  false
  data:
    Type:       PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)
    ClaimName:  cephfs-pvc
    ReadOnly:   false
  prometheus-k8s-token-x8xzh:
    Type:        Secret (a volume populated by a Secret)
    SecretName:  prometheus-k8s-token-x8xzh
    Optional:    false
QoS Class:       Burstable
Node-Selectors:  <none>
Tolerations:     node.alpha.kubernetes.io/notReady:NoExecute for 300s
                 node.alpha.kubernetes.io/unreachable:NoExecute for 300s
Events:
  Type     Reason      Age                  From            Message
  ----     ------      ----                 ----            -------
  Normal   Pulled      15m (x83 over 17h)   kubelet, k8s-3  Container image "quay.io/prometheus/prometheus:v2.0.0" already present on machine
  Warning  FailedSync  23s (x1801 over 7h)  kubelet, k8s-3  Error syncing pod
Logs on the Kubernetes node:
[root@k8s-3 01C48JAGH1QCGKGCG72E0B2Y8R]# journalctl -xeu kubelet --no-pager
Jan 20 11:21:54 k8s-3 kubelet[14306]: I0120 11:21:54.619924 14306 kuberuntime_manager.go:749] Back-off 5m0s restarting failed container=prometheus pod=prometheus-k8s-0_monitoring(7598959a-fcff-11e7-9333-fa163e48f857)
Jan 20 11:21:54 k8s-3 kubelet[14306]: E0120 11:21:54.620042 14306 pod_workers.go:182] Error syncing pod 7598959a-fcff-11e7-9333-fa163e48f857 ("prometheus-k8s-0_monitoring(7598959a-fcff-11e7-9333-fa163e48f857)"), skipping: failed to "StartContainer" for "prometheus" with CrashLoopBackOff: "Back-off 5m0s restarting failed container=prometheus pod=prometheus-k8s-0_monitoring(7598959a-fcff-11e7-9333-fa163e48f857)"
Jan 20 11:22:08 k8s-3 kubelet[14306]: I0120 11:22:08.615438 14306 kuberuntime_manager.go:500] Container {Name:prometheus Image:quay.io/prometheus/prometheus:v2.0.0 Command:[] Args:[--config.file=/etc/prometheus/config/prometheus.yaml --storage.tsdb.path=/cephfs/prometheus/data --storage.tsdb.retention=180d --web.route-prefix=/ --web.enable-lifecycle --web.enable-admin-api] WorkingDir: Ports:[{Name:web HostPort:0 ContainerPort:9090 Protocol:TCP HostIP:}] EnvFrom:[] Env:[] Resources:{Limits:map[cpu:{i:{value:500 scale:-3} d:{Dec:<nil>} s:500m Format:DecimalSI} memory:{i:{value:524288000 scale:0} d:{Dec:<nil>} s:500Mi Format:BinarySI}] Requests:map[cpu:{i:{value:100 scale:-3} d:{Dec:<nil>} s:100m Format:DecimalSI} memory:{i:{value:209715200 scale:0} d:{Dec:<nil>} s: Format:BinarySI}]} VolumeMounts:[{Name:config ReadOnly:false MountPath:/etc/prometheus/config SubPath: MountPropagation:<nil>} {Name:rules ReadOnly:false MountPath:/etc/prometheus/rules SubPath: MountPropagation:<nil>} {Name:data ReadOnly:false MountPath:/cephfs/prometheus/data SubPath:prometheus-data MountPropagation:<nil>} {Name:prometheus-k8s-token-x8xzh ReadOnly:true MountPath:/var/run/secrets/kubernetes.io/serviceaccount SubPath: MountPropagation:<nil>}] LivenessProbe:&Probe{Handler:Handler{Exec:nil,HTTPGet:&HTTPGetAction{Path:/status,Port:web,Host:,Scheme:HTTP,HTTPHeaders:[],},TCPSocket:nil,},InitialDelaySeconds:30,TimeoutSeconds:3,PeriodSeconds:5,SuccessThreshold:1,FailureThreshold:10,} ReadinessProbe:&Probe{Handler:Handler{Exec:nil,HTTPGet:&HTTPGetAction{Path:/status,Port:web,Host:,Scheme:HTTP,HTTPHeaders:[],},TCPSocket:nil,},InitialDelaySeconds:0,TimeoutSeconds:3,PeriodSeconds:5,SuccessThreshold:1,FailureThreshold:6,} Lifecycle:nil TerminationMessagePath:/dev/termination-log TerminationMessagePolicy:File ImagePullPolicy:IfNotPresent SecurityContext:nil Stdin:false StdinOnce:false TTY:false} is dead, but RestartPolicy says that we should restart it.
Jan 20 11:22:08 k8s-3 kubelet[14306]: I0120 11:22:08.615662 14306 kuberuntime_manager.go:739] checking backoff for container "prometheus" in pod "prometheus-k8s-0_monitoring(7598959a-fcff-11e7-9333-fa163e48f857)"
Any suggestions? Thanks.
Upvotes: 0
Views: 2187
Reputation: 34122
Two Prometheus servers cannot share the same storage directory; you should have gotten a locking error about this.
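If you want to keep two replicas, one option is to let the StatefulSet create a separate claim per pod with volumeClaimTemplates instead of pointing both pods at cephfs-pvc; a rough sketch only, the storageClassName and size below are placeholders, not values from your setup:

# Replaces the shared "data" volume in the StatefulSet spec above; each pod
# (prometheus-k8s-0, prometheus-k8s-1) then opens its own TSDB directory.
  volumeClaimTemplates:
  - metadata:
      name: data
    spec:
      accessModes: ["ReadWriteOnce"]
      storageClassName: your-storage-class   # placeholder
      resources:
        requests:
          storage: 50Gi                       # example size

The blocks the two servers have already written into the shared path are what produce the overlap error, so that directory will likely need to be cleaned up (or one copy moved aside) before either server starts cleanly again.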
Upvotes: 1