From 90f6c1faba7765af21f4acabb58c16bd3e7f303e Mon Sep 17 00:00:00 2001 From: Leo Q Date: Wed, 7 Jun 2023 20:28:13 +0800 Subject: [PATCH] add alert for sd refresh failure (#12410) * add alert for sd refresh failure Due to config error or sd service down, prometheus may fail to refresh sd resource, which may lead to scrape fail or irrelavant metrics. Signed-off-by: Leo Q * apply suggestions Signed-off-by: Leo Q --------- Signed-off-by: Leo Q --- documentation/prometheus-mixin/alerts.libsonnet | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 0ee5d83c7a..3efb0f27d1 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -20,6 +20,20 @@ description: 'Prometheus %(prometheusName)s has failed to reload its configuration.' % $._config, }, }, + { + alert: 'PrometheusSDRefreshFailure', + expr: ||| + increase(prometheus_sd_refresh_failures_total{%(prometheusSelector)s}[10m]) > 0 + ||| % $._config, + 'for': '20m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Failed Prometheus SD refresh.', + description: 'Prometheus %(prometheusName)s has failed to refresh SD with mechanism {{$labels.mechanism}}.' % $._config, + }, + }, { alert: 'PrometheusNotificationQueueRunningFull', expr: |||