# File: alerts-rules/rules/cluster.yaml (last modified 2026-06-15 17:17:06 +02:00)

---
# PrometheusRule custom resource (monitoring.coreos.com/v1) declaring alerts
# for the Kubernetes control-plane components and etcd.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-cluster-rules
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label the Prometheus ruleSelector matches
    # on to load this rule file -- confirm against the Prometheus resource.
    release: monitoring
spec:
  groups:
    # NOTE(review): the group is named "critical" but its last rule carries
    # severity "warning" -- confirm whether that rule belongs in this group.
    - name: cluster.critical
      # Evaluation interval applied to every rule in this group.
      interval: 30s
      rules:
        # The four "*Down" rules share one pattern: absent(up{job=...} == 1)
        # yields a result only when no series of that job has up == 1, and
        # the alert fires once that has held for the full `for` duration.

        # No apiserver target has reported up == 1 for 5 minutes.
        - alert: KubeAPIServerDown
          expr: absent(up{job="apiserver"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API Server inaccessible"
            description: "L'API server ne répond plus depuis 5 minutes."

        # No etcd target has reported up == 1 for 5 minutes.
        - alert: EtcdDown
          expr: absent(up{job="etcd"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "etcd inaccessible"
            description: "etcd ne répond plus."

        # No kube-scheduler target has reported up == 1 for 5 minutes.
        - alert: KubeSchedulerDown
          expr: absent(up{job="kube-scheduler"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Scheduler inaccessible"
            description: "Plus aucun pod ne peut être schedulé."

        # No kube-controller-manager target has reported up == 1 for 5 minutes.
        - alert: KubeControllerManagerDown
          expr: absent(up{job="kube-controller-manager"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Controller Manager inaccessible"
            description: "Les deployments et replicasets ne fonctionnent plus."

        # 99th-percentile etcd backend commit duration, computed from the
        # 5-minute rate of the histogram buckets, stays above 0.25 (the
        # metric is in seconds, i.e. 250 ms) for 10 minutes.
        - alert: etcdHighCommitDurations
          expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Latence etcd élevée"
            description: "La latence de commit etcd dépasse 250ms."