chore: add an alert based on the metric prometheus_sd_kubernetes_failures_total

that was introduced in https://github.com/prometheus/prometheus/pull/13554. The same motivation for adding the metric applies: to avoid silent SD failures, as existing logs may not be regularly checked and can be missed. Signed-off-by: machine424 <ayoubmrini424@gmail.com> Co-authored-by: Simon Pasquier <spasquie@redhat.com>
2025-03-05 20:59:13 -08:00 · 2024-06-18 13:38:20 +02:00 · 2024-06-18 13:38:20 +02:00 · f9ca6c4ae6
parent 5c417684f8
commit f9ca6c4ae6
1 changed files with 14 additions and 0 deletions
--- a/documentation/prometheus-mixin/alerts.libsonnet
+++ b/documentation/prometheus-mixin/alerts.libsonnet
@ -34,6 +34,20 @@
              description: 'Prometheus %(prometheusName)s has failed to refresh SD with mechanism {{$labels.mechanism}}.' % $._config,
            },
          },
+          {  // Alert rule: warns when the Kubernetes SD failure counter keeps increasing.
+            alert: 'PrometheusKubernetesListWatchFailures',
+            expr: |||
+              increase(prometheus_sd_kubernetes_failures_total{%(prometheusSelector)s}[5m]) > 0
+            ||| % $._config,  // substitutes %(prometheusSelector)s from $._config
+            'for': '15m',  // key is quoted because `for` is a Jsonnet keyword; expr must stay true for 15m
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Requests in Kubernetes SD are failing.',
+              description: 'Kubernetes service discovery of Prometheus %(prometheusName)s is experiencing {{ printf "%%.0f" $value }} failures with LIST/WATCH requests to the Kubernetes API in the last 5 minutes.' % $._config,  // `%%` escapes to a literal `%`, so the rendered template contains printf "%.0f"
+            },
+          },
          {
            alert: 'PrometheusNotificationQueueRunningFull',
            expr: |||