feat: initial alert rules

This commit is contained in:
2026-06-15 17:17:06 +02:00
parent eab99f7a00
commit b48ff0d8a1
7 changed files with 288 additions and 0 deletions
+56
View File
@@ -0,0 +1,56 @@
---
# Alert rules for the control-plane components: API server, etcd,
# scheduler and controller manager.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-cluster-rules
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label the Prometheus ruleSelector
    # matches on - confirm against the Prometheus resource.
    release: monitoring
spec:
  groups:
    - name: cluster.critical
      # Evaluation interval for every rule of this group.
      interval: 30s
      rules:
        # The four "*Down" rules share one pattern: absent(up{job=...} == 1)
        # yields a sample only when no series of that job has up == 1,
        # which covers both "all targets down" and "no target at all".

        - alert: KubeAPIServerDown
          expr: absent(up{job="apiserver"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API Server inaccessible"
            description: "L'API server ne répond plus depuis 5 minutes."

        - alert: EtcdDown
          expr: absent(up{job="etcd"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "etcd inaccessible"
            description: "etcd ne répond plus."

        - alert: KubeSchedulerDown
          expr: absent(up{job="kube-scheduler"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Scheduler inaccessible"
            description: "Plus aucun pod ne peut être schedulé."

        - alert: KubeControllerManagerDown
          expr: absent(up{job="kube-controller-manager"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Controller Manager inaccessible"
            description: "Les deployments et replicasets ne fonctionnent plus."

        # p99 of the etcd backend commit duration over a 5m rate window,
        # compared against 0.25 seconds; must hold for 10 minutes.
        - alert: etcdHighCommitDurations
          expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Latence etcd élevée"
            description: "La latence de commit etcd dépasse 250ms."
+28
View File
@@ -0,0 +1,28 @@
---
# Alert rules for cluster DNS availability and API server request latency.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-network-rules
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label the Prometheus ruleSelector
    # matches on - confirm against the Prometheus resource.
    release: monitoring
spec:
  groups:
    - name: network.critical
      rules:
        # absent(... == 1) yields a sample only when no series of the
        # coredns job has up == 1.
        - alert: CoreDNSDown
          expr: absent(up{job="coredns"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "CoreDNS inaccessible"
            description: "CoreDNS est indisponible, la résolution DNS interne est cassée."

        # p99 request latency across all API server series, aggregated on
        # the histogram bucket label only.
        # WATCH, LIST and CONNECT are excluded: CONNECT (exec, attach,
        # port-forward) is long-running by nature, like WATCH, and would
        # otherwise inflate the quantile.
        - alert: KubeAPIServerLatencyHigh
          expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"WATCH|LIST|CONNECT"}[5m])) by (le)) > 1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Latence API Server élevée"
            description: "La latence p99 de l'API server dépasse 1 seconde."
+55
View File
@@ -0,0 +1,55 @@
---
# Alert rules for node health: readiness, kubelet pressure conditions,
# CPU usage and memory usage.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-node-rules
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label the Prometheus ruleSelector
    # matches on - confirm against the Prometheus resource.
    release: monitoring
spec:
  groups:
    - name: nodes.critical
      rules:
        # The Ready/status="true" series is 0 while the node is not Ready.
        - alert: NodeNotReady
          expr: kube_node_status_condition{condition="Ready",status="true"} == 0
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: "Node NotReady"
            description: "Le node {{ $labels.node }} est NotReady depuis 15 minutes."

        - alert: NodeMemoryPressure
          expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Node en MemoryPressure"
            description: "Le node {{ $labels.node }} est en MemoryPressure."

        - alert: NodeDiskPressure
          expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Node en DiskPressure"
            description: "Le node {{ $labels.node }} est en DiskPressure."

        # CPU usage in percent = 100 - average idle fraction * 100.
        # The average is taken per `instance`: node-exporter series do not
        # carry a `node` label unless a relabeling adds one, and grouping
        # by a missing label would merge every node into a single
        # cluster-wide series. `instance` is set by Prometheus on every
        # scraped target, so this grouping is always per machine.
        - alert: NodeHighCPU
          expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "CPU node élevé"
            description: "CPU du node {{ $labels.instance }} > 85%."

        # Memory usage in percent = (1 - available / total) * 100.
        # The annotation uses `instance` for the same reason as above:
        # these are node-exporter series.
        - alert: NodeHighMemory
          expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "RAM node élevée"
            description: "RAM du node {{ $labels.instance }} > 90%."
+55
View File
@@ -0,0 +1,55 @@
---
# Alert rules for workloads: crash loops, pods stuck outside
# Running/Succeeded, deployment replica mismatch, OOM kills, HPA at max.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-pod-rules
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label the Prometheus ruleSelector
    # matches on - confirm against the Prometheus resource.
    release: monitoring
spec:
  groups:
    - name: pods.critical
      rules:
        # Per-second restart rate over 15m scaled back to a 15-minute
        # count; any restart in the window makes the expression true.
        - alert: KubePodCrashLooping
          expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 0
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: "Pod en CrashLoop"
            description: "Le pod {{ $labels.namespace }}/{{ $labels.pod }} crashe en boucle."

        # Matches every phase other than Running and Succeeded.
        - alert: KubePodNotReady
          expr: sum by(namespace, pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: "Pod non Ready"
            description: "Le pod {{ $labels.namespace }}/{{ $labels.pod }} n'est pas Ready depuis 15 minutes."

        - alert: KubeDeploymentReplicasMismatch
          expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: "Replicas manquants"
            description: "Le deployment {{ $labels.namespace }}/{{ $labels.deployment }} n'a pas le bon nombre de replicas."

        # The last_terminated_reason gauge stays at 1 until the container
        # next terminates for a different reason, so testing it alone keeps
        # the alert firing long after a single OOM kill. It is therefore
        # combined with "at least one restart in the last 10 minutes":
        # the alert fires on a recent OOM kill and resolves afterwards.
        # `ignoring (reason)` is needed because only the right-hand side
        # carries the `reason` label.
        - alert: KubeContainerOOMKilled
          expr: >-
            (kube_pod_container_status_restarts_total
            - kube_pod_container_status_restarts_total offset 10m >= 1)
            and ignoring (reason)
            min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "Conteneur OOMKilled"
            description: "Le conteneur {{ $labels.container }} du pod {{ $labels.pod }} a été tué par OOMKiller."

        # True while the HPA's current replica count equals its maximum.
        - alert: KubeHpaMaxedOut
          expr: kube_horizontalpodautoscaler_status_current_replicas == kube_horizontalpodautoscaler_spec_max_replicas
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "HPA au maximum"
            description: "L'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} est à son maximum."
+28
View File
@@ -0,0 +1,28 @@
---
# Alert rules for persistent volume capacity, based on kubelet volume stats.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-storage-rules
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label the Prometheus ruleSelector
    # matches on - confirm against the Prometheus resource.
    release: monitoring
spec:
  groups:
    - name: storage.critical
      rules:
        # Available / capacity ratio below 0.03, i.e. less than 3% free.
        - alert: KubePersistentVolumeFull
          expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes < 0.03
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "PVC plein"
            description: "Le PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} est plein."

        # Linear extrapolation of the last 6h of available bytes, projected
        # 4*3600 seconds (4 hours) ahead; a negative projection means the
        # volume is predicted to run out of space within that horizon.
        - alert: KubePersistentVolumeFillingUp
          expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4*3600) < 0
          for: 1h
          labels:
            severity: critical
          annotations:
            summary: "PVC bientôt plein"
            description: "Le PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} sera plein dans moins de 4h."