From b48ff0d8a13cb484eeec2551a890da300a09e4a2 Mon Sep 17 00:00:00 2001
From: mbelfares
Date: Mon, 15 Jun 2026 17:17:06 +0200
Subject: [PATCH] feat: initial alert rules

---
Notes (this section is not part of the commit message):

* Adds two GitHub Actions workflows and five PrometheusRule manifests.
  - validate.yml runs on pull requests touching rules/**; it extracts
    spec.groups from each manifest into a plain rule file and runs
    `promtool check rules` on it. The manifest path is handed to Python
    as an argument (sys.argv[1]) instead of being spliced into the
    Python source text.
  - deploy.yml runs on pushes to main touching rules/**; it writes the
    base64-decoded KUBECONFIG secret to ~/.kube/config and applies
    rules/ into the monitoring namespace.
* NodeHighCPU and NodeHighMemory aggregate and report on the `instance`
  label. NOTE(review): node_exporter series normally carry `instance`,
  not `node`; switch back to `node` only if a relabeling rule in this
  cluster adds that label.
* KubeContainerOOMKilled also requires a container restart within the
  last 10 minutes. The last-terminated-reason series keeps reporting the
  previous termination, so matching on it alone would leave the alert
  firing long after the OOM kill.
* Every added file ends with a newline.
* The blob ids on the `index` lines come from the original export and do
  not describe the edited content; regenerate the patch with
  `git format-patch` after applying if exact ids matter.

 .github/workflows/deploy.yml   | 33 ++++++++++++++++++++
 .github/workflows/validate.yml | 33 ++++++++++++++++++++
 rules/cluster.yaml             | 56 ++++++++++++++++++++++++++++++++++
 rules/network.yaml             | 28 +++++++++++++++++
 rules/nodes.yaml               | 55 +++++++++++++++++++++++++++++++++
 rules/pods.yaml                | 55 +++++++++++++++++++++++++++++++++
 rules/storage.yaml             | 28 +++++++++++++++++
 7 files changed, 288 insertions(+)
 create mode 100644 .github/workflows/deploy.yml
 create mode 100644 .github/workflows/validate.yml
 create mode 100644 rules/cluster.yaml
 create mode 100644 rules/network.yaml
 create mode 100644 rules/nodes.yaml
 create mode 100644 rules/pods.yaml
 create mode 100644 rules/storage.yaml

diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
new file mode 100644
index 0000000..28e0010
--- /dev/null
+++ b/.github/workflows/deploy.yml
@@ -0,0 +1,33 @@
+name: Deploy alert rules
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'rules/**'
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Setup kubectl
+        uses: azure/setup-kubectl@v3
+        with:
+          version: 'v1.29.0'
+
+      - name: Configure kubeconfig
+        run: |
+          mkdir -p $HOME/.kube
+          echo "${{ secrets.KUBECONFIG }}" | base64 -d > $HOME/.kube/config
+          chmod 600 $HOME/.kube/config
+
+      - name: Deploy rules
+        run: kubectl apply -f rules/ -n monitoring
+
+      - name: Verify
+        run: |
+          sleep 10
+          kubectl get prometheusrule -n monitoring
diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml
new file mode 100644
index 0000000..804d8de
--- /dev/null
+++ b/.github/workflows/validate.yml
@@ -0,0 +1,33 @@
+name: Validate alert rules
+
+on:
+  pull_request:
+    paths:
+      - 'rules/**'
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install promtool
+        run: |
+          PROM_VERSION="2.51.0"
+          wget -q https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz
+          tar xf prometheus-*.tar.gz
+          sudo mv prometheus-*/promtool /usr/local/bin/
+
+      - name: Validate rules
+        run: |
+          for f in rules/*.yaml; do
+            echo "→ Validation de $f"
+            python3 -c "
+          import yaml, sys
+          with open(sys.argv[1]) as fh:
+              doc = yaml.safe_load(fh)
+          groups = doc.get('spec', doc).get('groups', [])
+          print(yaml.dump({'groups': groups}))
+          " "$f" > /tmp/rules-flat.yaml
+            promtool check rules /tmp/rules-flat.yaml
+          done
diff --git a/rules/cluster.yaml b/rules/cluster.yaml
new file mode 100644
index 0000000..c1c8187
--- /dev/null
+++ b/rules/cluster.yaml
@@ -0,0 +1,56 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: k8s-cluster-rules
+  namespace: monitoring
+  labels:
+    release: monitoring
+spec:
+  groups:
+    - name: cluster.critical
+      interval: 30s
+      rules:
+        - alert: KubeAPIServerDown
+          expr: absent(up{job="apiserver"} == 1)
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "API Server inaccessible"
+            description: "L'API server ne répond plus depuis 5 minutes."
+
+        - alert: EtcdDown
+          expr: absent(up{job="etcd"} == 1)
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "etcd inaccessible"
+            description: "etcd ne répond plus."
+
+        - alert: KubeSchedulerDown
+          expr: absent(up{job="kube-scheduler"} == 1)
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Scheduler inaccessible"
+            description: "Plus aucun pod ne peut être schedulé."
+
+        - alert: KubeControllerManagerDown
+          expr: absent(up{job="kube-controller-manager"} == 1)
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Controller Manager inaccessible"
+            description: "Les deployments et replicasets ne fonctionnent plus."
+
+        - alert: etcdHighCommitDurations
+          expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Latence etcd élevée"
+            description: "La latence de commit etcd dépasse 250ms."
diff --git a/rules/network.yaml b/rules/network.yaml
new file mode 100644
index 0000000..71a89f1
--- /dev/null
+++ b/rules/network.yaml
@@ -0,0 +1,28 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: k8s-network-rules
+  namespace: monitoring
+  labels:
+    release: monitoring
+spec:
+  groups:
+    - name: network.critical
+      rules:
+        - alert: CoreDNSDown
+          expr: absent(up{job="coredns"} == 1)
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "CoreDNS inaccessible"
+            description: "CoreDNS est indisponible, la résolution DNS interne est cassée."
+
+        - alert: KubeAPIServerLatencyHigh
+          expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"WATCH|LIST"}[5m])) by (le)) > 1
+          for: 5m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Latence API Server élevée"
+            description: "La latence p99 de l'API server dépasse 1 seconde."
diff --git a/rules/nodes.yaml b/rules/nodes.yaml
new file mode 100644
index 0000000..bdc2219
--- /dev/null
+++ b/rules/nodes.yaml
@@ -0,0 +1,55 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: k8s-node-rules
+  namespace: monitoring
+  labels:
+    release: monitoring
+spec:
+  groups:
+    - name: nodes.critical
+      rules:
+        - alert: NodeNotReady
+          expr: kube_node_status_condition{condition="Ready",status="true"} == 0
+          for: 15m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Node NotReady"
+            description: "Le node {{ $labels.node }} est NotReady depuis 15 minutes."
+
+        - alert: NodeMemoryPressure
+          expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Node en MemoryPressure"
+            description: "Le node {{ $labels.node }} est en MemoryPressure."
+
+        - alert: NodeDiskPressure
+          expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Node en DiskPressure"
+            description: "Le node {{ $labels.node }} est en DiskPressure."
+
+        - alert: NodeHighCPU
+          expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
+          for: 10m
+          labels:
+            severity: warning
+          annotations:
+            summary: "CPU node élevé"
+            description: "CPU du node {{ $labels.instance }} > 85%."
+
+        - alert: NodeHighMemory
+          expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "RAM node élevée"
+            description: "RAM du node {{ $labels.instance }} > 90%."
diff --git a/rules/pods.yaml b/rules/pods.yaml
new file mode 100644
index 0000000..d723603
--- /dev/null
+++ b/rules/pods.yaml
@@ -0,0 +1,55 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: k8s-pod-rules
+  namespace: monitoring
+  labels:
+    release: monitoring
+spec:
+  groups:
+    - name: pods.critical
+      rules:
+        - alert: KubePodCrashLooping
+          expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 0
+          for: 15m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Pod en CrashLoop"
+            description: "Le pod {{ $labels.namespace }}/{{ $labels.pod }} crashe en boucle."
+
+        - alert: KubePodNotReady
+          expr: sum by(namespace, pod) (kube_pod_status_phase{phase!~"Running|Succeeded"}) > 0
+          for: 15m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Pod non Ready"
+            description: "Le pod {{ $labels.namespace }}/{{ $labels.pod }} n'est pas Ready depuis 15 minutes."
+
+        - alert: KubeDeploymentReplicasMismatch
+          expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
+          for: 15m
+          labels:
+            severity: critical
+          annotations:
+            summary: "Replicas manquants"
+            description: "Le deployment {{ $labels.namespace }}/{{ $labels.deployment }} n'a pas le bon nombre de replicas."
+
+        - alert: KubeContainerOOMKilled
+          expr: (increase(kube_pod_container_status_restarts_total[10m]) > 0) and on(namespace, pod, container) (kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1)
+          for: 0m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Conteneur OOMKilled"
+            description: "Le conteneur {{ $labels.container }} du pod {{ $labels.pod }} a été tué par OOMKiller."
+
+        - alert: KubeHpaMaxedOut
+          expr: kube_horizontalpodautoscaler_status_current_replicas == kube_horizontalpodautoscaler_spec_max_replicas
+          for: 15m
+          labels:
+            severity: warning
+          annotations:
+            summary: "HPA au maximum"
+            description: "L'HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} est à son maximum."
diff --git a/rules/storage.yaml b/rules/storage.yaml
new file mode 100644
index 0000000..b8c8e95
--- /dev/null
+++ b/rules/storage.yaml
@@ -0,0 +1,28 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: k8s-storage-rules
+  namespace: monitoring
+  labels:
+    release: monitoring
+spec:
+  groups:
+    - name: storage.critical
+      rules:
+        - alert: KubePersistentVolumeFull
+          expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes < 0.03
+          for: 2m
+          labels:
+            severity: critical
+          annotations:
+            summary: "PVC plein"
+            description: "Le PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} est plein."
+
+        - alert: KubePersistentVolumeFillingUp
+          expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4*3600) < 0
+          for: 1h
+          labels:
+            severity: critical
+          annotations:
+            summary: "PVC bientôt plein"
+            description: "Le PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} sera plein dans moins de 4h."