feat: initial alert rules
This commit is contained in:
@@ -0,0 +1,56 @@
|
||||
---
# PrometheusRule declaring control-plane availability alerts and an etcd
# commit-latency alert, all in a single rule group.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-cluster-rules
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label the Prometheus ruleSelector matches
    # on; confirm against the Prometheus custom resource.
    release: monitoring
spec:
  groups:
    - name: cluster.critical
      # Evaluation interval for every rule in this group.
      interval: 30s
      rules:
        # absent(...) yields a result only when no series matches, so each
        # "*Down" rule below fires when no target of the given job has
        # reported up == 1 for the whole `for` window.
        - alert: KubeAPIServerDown
          expr: absent(up{job="apiserver"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API Server inaccessible"
            description: "L'API server ne répond plus depuis 5 minutes."

        - alert: EtcdDown
          expr: absent(up{job="etcd"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "etcd inaccessible"
            description: "etcd ne répond plus."

        - alert: KubeSchedulerDown
          expr: absent(up{job="kube-scheduler"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Scheduler inaccessible"
            description: "Plus aucun pod ne peut être schedulé."

        - alert: KubeControllerManagerDown
          expr: absent(up{job="kube-controller-manager"} == 1)
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Controller Manager inaccessible"
            description: "Les deployments et replicasets ne fonctionnent plus."

        # 99th-percentile etcd backend commit duration, computed from the
        # 5-minute bucket rate, staying above 0.25 for 10 minutes.
        # NOTE(review): severity is "warning" although the group is named
        # cluster.critical; confirm this placement is intended.
        - alert: etcdHighCommitDurations
          # Folded scalar: the line breaks below become single spaces, so the
          # expression is evaluated as one line.
          expr: >-
            histogram_quantile(0.99,
            rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
            > 0.25
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Latence etcd élevée"
            description: "La latence de commit etcd dépasse 250ms."
|
||||
Reference in New Issue
Block a user