feat: initial alert rules
This commit is contained in:
@@ -0,0 +1,33 @@
|
|||||||
|
---
# Applies every manifest under rules/ to the `monitoring` namespace whenever
# a push to main touches that directory.
name: Deploy alert rules

on:
  push:
    branches: [main]
    paths:
      - 'rules/**'

# Least privilege: the job only needs to read the repository for checkout.
permissions:
  contents: read

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Setup kubectl
        uses: azure/setup-kubectl@v3
        with:
          version: 'v1.29.0'

      - name: Configure kubeconfig
        # The secret (a base64-encoded kubeconfig, decoded below) is handed to
        # the script through the environment rather than being expanded into
        # the script text, so its content is never parsed by the shell.
        env:
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG }}
        run: |
          mkdir -p "$HOME/.kube"
          printf '%s\n' "$KUBECONFIG_B64" | base64 -d > "$HOME/.kube/config"
          chmod 600 "$HOME/.kube/config"

      - name: Deploy rules
        run: kubectl apply -f rules/ -n monitoring

      - name: Verify
        # Lists the PrometheusRule objects after a short pause; this shows
        # what exists in the namespace but does not assert on the result.
        run: |
          sleep 10
          kubectl get prometheusrule -n monitoring
|
||||||
@@ -0,0 +1,33 @@
|
|||||||
|
---
# Runs promtool against every rules/*.yaml file when a pull request touches
# the rules/ directory.
name: Validate alert rules

on:
  pull_request:
    paths:
      - 'rules/**'

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install promtool
        run: |
          PROM_VERSION="2.51.0"
          wget -q "https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz"
          tar xf prometheus-*.tar.gz
          sudo mv prometheus-*/promtool /usr/local/bin/

      - name: Validate rules
        run: |
          # Python helper: load one manifest and print its rule groups as a
          # flat `groups:` document (taken from `spec.groups`, or from the
          # top-level `groups` key when there is no `spec`).
          # The file path arrives via argv instead of being spliced into the
          # Python source, so unusual file names cannot alter the program.
          flatten='
          import sys, yaml
          with open(sys.argv[1]) as fh:
              doc = yaml.safe_load(fh)
          groups = doc.get("spec", doc).get("groups", [])
          print(yaml.dump({"groups": groups}))
          '
          for f in rules/*.yaml; do
            echo "→ Validation de $f"
            python3 -c "$flatten" "$f" > /tmp/rules-flat.yaml
            promtool check rules /tmp/rules-flat.yaml
          done
|
||||||
@@ -0,0 +1,56 @@
|
|||||||
|
---
# Control-plane alerts: availability of the API server, etcd, scheduler and
# controller manager, plus etcd backend commit latency.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-cluster-rules
  namespace: monitoring
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches
    # on — confirm against the Prometheus resource.
    release: monitoring
spec:
  groups:
    - name: cluster.critical
      interval: 30s
      rules:
        # Each *Down alert fires when no `up` series for the job has had the
        # value 1 for 5 minutes (this also covers the job being absent).
        - alert: KubeAPIServerDown
          expr: 'absent(up{job="apiserver"} == 1)'
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "API Server inaccessible"
            description: "L'API server ne répond plus depuis 5 minutes."

        - alert: EtcdDown
          expr: 'absent(up{job="etcd"} == 1)'
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "etcd inaccessible"
            description: "etcd ne répond plus."

        - alert: KubeSchedulerDown
          expr: 'absent(up{job="kube-scheduler"} == 1)'
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Scheduler inaccessible"
            description: "Plus aucun pod ne peut être schedulé."

        - alert: KubeControllerManagerDown
          expr: 'absent(up{job="kube-controller-manager"} == 1)'
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Controller Manager inaccessible"
            description: "Les deployments et replicasets ne fonctionnent plus."

        # p99 of the etcd backend commit duration (5m rate) above 0.25 s for
        # 10 minutes.
        - alert: etcdHighCommitDurations
          expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25'
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Latence etcd élevée"
            description: "La latence de commit etcd dépasse 250ms."
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
---
# Alerts on CoreDNS availability and API server request latency.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-network-rules
  namespace: monitoring
  labels:
    release: monitoring
spec:
  groups:
    - name: network.critical
      rules:
        # Fires when no `up{job="coredns"}` series has had the value 1 for
        # 5 minutes.
        - alert: CoreDNSDown
          expr: 'absent(up{job="coredns"} == 1)'
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "CoreDNS inaccessible"
            description: "CoreDNS est indisponible, la résolution DNS interne est cassée."

        # p99 request duration across all API server requests except the
        # WATCH and LIST verbs, above 1 s for 5 minutes.
        - alert: KubeAPIServerLatencyHigh
          expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"WATCH|LIST"}[5m])) by (le)) > 1'
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Latence API Server élevée"
            description: "La latence p99 de l'API server dépasse 1 seconde."
|
||||||
@@ -0,0 +1,55 @@
|
|||||||
|
---
# Node-level alerts: readiness and pressure conditions (kube_node_* series)
# and CPU / memory saturation (node_* series).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-node-rules
  namespace: monitoring
  labels:
    release: monitoring
spec:
  groups:
    - name: nodes.critical
      rules:
        # The Ready condition's status="true" series has been 0 for 15 minutes.
        - alert: NodeNotReady
          expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: "Node NotReady"
            description: "Le node {{ $labels.node }} est NotReady depuis 15 minutes."

        - alert: NodeMemoryPressure
          expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Node en MemoryPressure"
            description: "Le node {{ $labels.node }} est en MemoryPressure."

        - alert: NodeDiskPressure
          expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Node en DiskPressure"
            description: "Le node {{ $labels.node }} est en DiskPressure."

        # The node_* series below are grouped and labelled by `instance`,
        # which Prometheus attaches to every scraped target. Grouping by a
        # label the series do not carry would collapse all nodes into one
        # series and render an empty name in the annotation.
        # NOTE(review): if relabeling adds a `node` label to these series,
        # `instance` still works; switch back only if `node` is guaranteed.
        - alert: NodeHighCPU
          expr: '100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85'
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "CPU node élevé"
            description: "CPU du node {{ $labels.instance }} > 85%."

        - alert: NodeHighMemory
          expr: '(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90'
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "RAM node élevée"
            description: "RAM du node {{ $labels.instance }} > 90%."
|
||||||
@@ -0,0 +1,55 @@
|
|||||||
|
---
# Workload alerts: restarting containers, pods stuck outside Running/Succeeded,
# deployments short of replicas, OOM-killed containers and saturated HPAs.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-pod-rules
  namespace: monitoring
  labels:
    release: monitoring
spec:
  groups:
    - name: pods.critical
      rules:
        # Per-second restart rate over 15m, scaled to a 15-minute window
        # (60 * 15 seconds); any positive value sustained for 15m fires.
        - alert: KubePodCrashLooping
          expr: 'rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 0'
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: "Pod en CrashLoop"
            description: "Le pod {{ $labels.namespace }}/{{ $labels.pod }} crashe en boucle."

        # Pod reports a phase other than Running or Succeeded for 15 minutes.
        - alert: KubePodNotReady
          expr: 'sum by(namespace, pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0'
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: "Pod non Ready"
            description: "Le pod {{ $labels.namespace }}/{{ $labels.pod }} n'est pas Ready depuis 15 minutes."

        - alert: KubeDeploymentReplicasMismatch
          expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: "Replicas manquants"
            description: "Le deployment {{ $labels.namespace }}/{{ $labels.deployment }} n'a pas le bon nombre de replicas."

        # Fires immediately (for: 0m) while the container's last termination
        # reason is OOMKilled. The description names the pod as
        # namespace/pod, like the other pod alerts in this group, so the pod
        # is unambiguous across namespaces.
        - alert: KubeContainerOOMKilled
          expr: 'kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1'
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: "Conteneur OOMKilled"
            description: "Le conteneur {{ $labels.container }} du pod {{ $labels.namespace }}/{{ $labels.pod }} a été tué par OOMKiller."

        # Current replica count has equalled the HPA's configured maximum
        # for 15 minutes.
        - alert: KubeHpaMaxedOut
          expr: 'kube_horizontalpodautoscaler_status_current_replicas == kube_horizontalpodautoscaler_spec_max_replicas'
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "HPA au maximum"
            description: "L'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} est à son maximum."
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
---
# Persistent-volume capacity alerts based on kubelet volume statistics.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: k8s-storage-rules
  namespace: monitoring
  labels:
    release: monitoring
spec:
  groups:
    - name: storage.critical
      rules:
        # Available/capacity ratio below 0.03 (under 3 % free) for 2 minutes.
        - alert: KubePersistentVolumeFull
          expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes < 0.03'
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "PVC plein"
            description: "Le PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} est plein."

        # Linear extrapolation of the last 6h of available bytes drops below
        # zero within 4*3600 seconds (4 hours); must hold for 1 hour.
        - alert: KubePersistentVolumeFillingUp
          expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4*3600) < 0'
          for: 1h
          labels:
            severity: critical
          annotations:
            summary: "PVC bientôt plein"
            description: "Le PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} sera plein dans moins de 4h."
|
||||||
Reference in New Issue
Block a user