chore: add an alert based on the metric prometheus_sd_kubernetes_failures_total

that was introcued in https://github.com/prometheus/prometheus/pull/13554

The same motivation for adding the metric applies: To avoid silent SD failures,
as existing logs may not be regularly checked and can be missed.

Signed-off-by: machine424 <ayoubmrini424@gmail.com>
Co-authored-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
machine424 2024-06-18 13:38:20 +02:00 committed by Ayoub Mrini
parent 5c417684f8
commit f9ca6c4ae6

View file

@ -34,6 +34,20 @@
description: 'Prometheus %(prometheusName)s has failed to refresh SD with mechanism {{$labels.mechanism}}.' % $._config,
},
},
{
alert: 'PrometheusKubernetesListWatchFailures',
expr: |||
increase(prometheus_sd_kubernetes_failures_total{%(prometheusSelector)s}[5m]) > 0
||| % $._config,
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Requests in Kubernetes SD are failing.',
description: 'Kubernetes service discovery of Prometheus %(prometheusName)s is experiencing {{ printf "%%.0f" $value }} failures with LIST/WATCH requests to the Kubernetes API in the last 5 minutes.' % $._config,
},
},
{
alert: 'PrometheusNotificationQueueRunningFull',
expr: |||