feat: initial alert rules
This commit is contained in:
@@ -0,0 +1,56 @@
|
||||
---
# Control-plane health rules: availability of the API server, etcd, the
# scheduler and the controller manager, plus an etcd commit-latency warning.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-cluster-rules
  namespace: monitoring
  labels:
    release: monitoring
spec:
  groups:
    - name: cluster.critical
      # Rules in this group are evaluated every 30 seconds.
      interval: 30s
      rules:
        # Every *Down rule below uses absent(up{job=...} == 1): the expression
        # yields a sample only when no series of that job has up == 1, so it
        # covers both "all targets failing" and "no target scraped at all".
        - alert: KubeAPIServerDown
          expr: 'absent(up{job="apiserver"} == 1)'
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: 'API Server inaccessible'
            description: "L'API server ne répond plus depuis 5 minutes."

        - alert: EtcdDown
          expr: 'absent(up{job="etcd"} == 1)'
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: 'etcd inaccessible'
            description: 'etcd ne répond plus.'

        - alert: KubeSchedulerDown
          expr: 'absent(up{job="kube-scheduler"} == 1)'
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: 'Scheduler inaccessible'
            description: 'Plus aucun pod ne peut être schedulé.'

        - alert: KubeControllerManagerDown
          expr: 'absent(up{job="kube-controller-manager"} == 1)'
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: 'Controller Manager inaccessible'
            description: 'Les deployments et replicasets ne fonctionnent plus.'

        # 99th percentile of the etcd backend commit duration histogram over
        # 5m windows; the 0.25 threshold is in seconds (250ms per the
        # description) and must hold for 10 minutes.
        - alert: etcdHighCommitDurations
          expr: >-
            histogram_quantile(0.99,
            rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: 'Latence etcd élevée'
            description: 'La latence de commit etcd dépasse 250ms.'
|
||||
@@ -0,0 +1,28 @@
|
||||
---
# Cluster DNS availability and API server request-latency rules.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-network-rules
  namespace: monitoring
  labels:
    release: monitoring
spec:
  groups:
    - name: network.critical
      rules:
        # Fires when no target of job "coredns" has reported up == 1 for 5m
        # (all replicas failing their scrape, or no target discovered).
        - alert: CoreDNSDown
          expr: absent(up{job="coredns"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "CoreDNS inaccessible"
            description: "CoreDNS est indisponible, la résolution DNS interne est cassée."

        # p99 request latency aggregated over every API server series, on 5m
        # windows, compared against a 1 second threshold.
        # Besides WATCH and LIST, the other long-running verbs (WATCHLIST,
        # PROXY, CONNECT) are excluded: such requests stay open for the whole
        # session (exec, port-forward, proxy), so counting their duration
        # would push the p99 over the threshold on a healthy API server.
        - alert: KubeAPIServerLatencyHigh
          expr: |-
            histogram_quantile(0.99,
              sum by (le) (
                rate(apiserver_request_duration_seconds_bucket{verb!~"WATCH|WATCHLIST|LIST|PROXY|CONNECT"}[5m])
              )
            ) > 1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Latence API Server élevée"
            description: "La latence p99 de l'API server dépasse 1 seconde."
|
||||
@@ -0,0 +1,55 @@
|
||||
---
# Node health rules: kubelet-reported conditions (from kube-state-metrics)
# and CPU / memory saturation (from node_exporter).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-node-rules
  namespace: monitoring
  labels:
    release: monitoring
spec:
  groups:
    - name: nodes.critical
      rules:
        # The status="true" series of the Ready condition is 0 while the node
        # is not Ready.
        - alert: NodeNotReady
          expr: kube_node_status_condition{condition="Ready",status="true"} == 0
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: "Node NotReady"
            description: "Le node {{ $labels.node }} est NotReady depuis 15 minutes."

        - alert: NodeMemoryPressure
          expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Node en MemoryPressure"
            description: "Le node {{ $labels.node }} est en MemoryPressure."

        - alert: NodeDiskPressure
          expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Node en DiskPressure"
            description: "Le node {{ $labels.node }} est en DiskPressure."

        # CPU busy percentage = 100 - average idle share across all cores.
        # node_exporter series always carry `instance`; a `node` label exists
        # only when scrape relabeling adds it (NOTE(review): confirm the
        # ServiceMonitor relabelings). Grouping by both keeps one result per
        # machine in either setup instead of collapsing every node into a
        # single cluster-wide average when `node` is missing.
        - alert: NodeHighCPU
          expr: 100 - (avg by (instance, node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "CPU node élevé"
            # `or` returns the first non-empty value, so the message falls
            # back to the scrape instance when no `node` label is present.
            description: "CPU du node {{ or $labels.node $labels.instance }} > 85%."

        # Used-memory percentage derived from MemAvailable / MemTotal.
        - alert: NodeHighMemory
          expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "RAM node élevée"
            # Same fallback as NodeHighCPU: never render an empty node name.
            description: "RAM du node {{ or $labels.node $labels.instance }} > 90%."
|
||||
@@ -0,0 +1,55 @@
|
||||
---
# Workload health rules: crash loops, pods stuck outside Running/Succeeded,
# deployment replica mismatches, OOM kills and saturated HPAs.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-pod-rules
  namespace: monitoring
  labels:
    release: monitoring
spec:
  groups:
    - name: pods.critical
      rules:
        # Per-second restart rate over 15m scaled back to a 15m count; any
        # restart inside the window makes the value positive.
        - alert: KubePodCrashLooping
          expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 0
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: "Pod en CrashLoop"
            description: "Le pod {{ $labels.namespace }}/{{ $labels.pod }} crashe en boucle."

        # Matches any pod whose active phase is neither Running nor Succeeded.
        - alert: KubePodNotReady
          expr: sum by(namespace, pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: "Pod non Ready"
            description: "Le pod {{ $labels.namespace }}/{{ $labels.pod }} n'est pas Ready depuis 15 minutes."

        # Desired replica count differs from the available replica count.
        - alert: KubeDeploymentReplicasMismatch
          expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: "Replicas manquants"
            description: "Le deployment {{ $labels.namespace }}/{{ $labels.deployment }} n'a pas le bon nombre de replicas."

        # The last_terminated_reason gauge stays at 1 for as long as OOMKilled
        # remains the container's most recent termination reason, so testing
        # it alone keeps the alert firing indefinitely after a single OOM.
        # Requiring a restart within the last 10 minutes limits the alert to
        # recent OOM kills and lets it resolve on its own afterwards.
        - alert: KubeContainerOOMKilled
          expr: |-
            increase(kube_pod_container_status_restarts_total[10m]) > 0
            and ignoring (reason)
            kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "Conteneur OOMKilled"
            description: "Le conteneur {{ $labels.container }} du pod {{ $labels.pod }} a été tué par OOMKiller."

        # Current replica count equals the HPA's configured maximum.
        - alert: KubeHpaMaxedOut
          expr: kube_horizontalpodautoscaler_status_current_replicas == kube_horizontalpodautoscaler_spec_max_replicas
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "HPA au maximum"
            description: "L'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} est à son maximum."
|
||||
@@ -0,0 +1,28 @@
|
||||
---
# Persistent-volume capacity rules built on the kubelet volume statistics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-storage-rules
  namespace: monitoring
  labels:
    release: monitoring
spec:
  groups:
    - name: storage.critical
      rules:
        # Less than 3% of the volume's capacity is still available.
        - alert: KubePersistentVolumeFull
          expr: >-
            kubelet_volume_stats_available_bytes
            / kubelet_volume_stats_capacity_bytes < 0.03
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: 'PVC plein'
            description: >-
              Le PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}
              est plein.

        # Linear extrapolation of the last 6h of available bytes, projected
        # 4*3600 seconds (4h) ahead, falls below zero; must hold for 1h.
        - alert: KubePersistentVolumeFillingUp
          expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4*3600) < 0'
          for: 1h
          labels:
            severity: critical
          annotations:
            summary: 'PVC bientôt plein'
            description: >-
              Le PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}
              sera plein dans moins de 4h.
|
||||
Reference in New Issue
Block a user