From f9ca6c4ae613ec997297843f32cae3f6c4f0f20a Mon Sep 17 00:00:00 2001 From: machine424 Date: Tue, 18 Jun 2024 13:38:20 +0200 Subject: [PATCH] chore: add an alert based on the metric prometheus_sd_kubernetes_failures_total that was introcued in https://github.com/prometheus/prometheus/pull/13554 The same motivation for adding the metric applies: To avoid silent SD failures, as existing logs may not be regularly checked and can be missed. Signed-off-by: machine424 Co-authored-by: Simon Pasquier --- documentation/prometheus-mixin/alerts.libsonnet | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 508d89c24..563daab80 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -34,6 +34,20 @@ description: 'Prometheus %(prometheusName)s has failed to refresh SD with mechanism {{$labels.mechanism}}.' % $._config, }, }, + { + alert: 'PrometheusKubernetesListWatchFailures', + expr: ||| + increase(prometheus_sd_kubernetes_failures_total{%(prometheusSelector)s}[5m]) > 0 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Requests in Kubernetes SD are failing.', + description: 'Kubernetes service discovery of Prometheus %(prometheusName)s is experiencing {{ printf "%%.0f" $value }} failures with LIST/WATCH requests to the Kubernetes API in the last 5 minutes.' % $._config, + }, + }, { alert: 'PrometheusNotificationQueueRunningFull', expr: |||