From 8f42192e527bf140c3151ea2fd5e4e66eb0d6867 Mon Sep 17 00:00:00 2001
From: Tom Wilkie
Date: Mon, 19 Nov 2018 11:22:55 +0000
Subject: [PATCH] Add Prometheus alerts from kube-prometheus, remove the
 alertmanager alerts.

Signed-off-by: Tom Wilkie
---
 .../prometheus-mixin/alerts.libsonnet | 123 ++++++++++++++++--
 1 file changed, 110 insertions(+), 13 deletions(-)

diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet
index cf4503729..97c92cf58 100644
--- a/documentation/prometheus-mixin/alerts.libsonnet
+++ b/documentation/prometheus-mixin/alerts.libsonnet
@@ -5,7 +5,7 @@
         name: 'prometheus',
         rules: [
           {
-            alert: 'PromBadConfig',
+            alert: 'PrometheusBadConfig',
             expr: |||
               prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
             ||| % $._config,
@@ -14,37 +14,134 @@
               severity: 'critical',
             },
             annotations: {
-              mesage: 'Prometheus failed to reload config, see container logs',
+              message: 'Prometheus failed to reload config, see container logs',
             },
           },
           {
-            alert: 'PromAlertmanagerBadConfig',
+            alert: 'PrometheusNotificationQueueRunningFull',
             expr: |||
-              alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0
+              predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30)
+              >
+              prometheus_notifications_queue_capacity{%(prometheusSelector)s}
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}",
+            },
+          },
+          {
+            alert: 'PrometheusErrorSendingAlerts',
+            expr: |||
+              100 * rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
+              /
+              rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 1
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.alertmanager}}',
+            },
+          },
+          {
+            alert: 'PrometheusErrorSendingAlerts',
+            expr: |||
+              100 * rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
+              /
+              rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 3
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.alertmanager}}',
+            },
+          },
+          {
+            alert: 'PrometheusNotConnectedToAlertmanagers',
+            expr: |||
+              prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1
             ||| % $._config,
             'for': '10m',
             labels: {
-              severity: 'critical',
+              severity: 'warning',
             },
             annotations: {
-              message: 'Alertmanager failed to reload config, see container logs',
+              message: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers',
             },
           },
           {
-            alert: 'PromAlertsFailed',
+            alert: 'PrometheusTSDBReloadsFailing',
             expr: |||
-              100 * rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m]) / rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m]) > 1
+              increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0
             ||| % $._config,
-            'for': '5m',
+            'for': '12h',
             labels: {
-              severity: 'critical',
+              severity: 'warning',
             },
             annotations: {
-              message: 'Alertmanager failed to send {{ printf "%.1f" $value }}% alerts to {{ $labels.integration }}.',
+              message: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.',
             },
           },
           {
-            alert: 'PromRemoteStorageFailures',
+            alert: 'PrometheusTSDBCompactionsFailing',
+            expr: |||
+              increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0
+            ||| % $._config,
+            'for': '12h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.',
+            },
+          },
+          {
+            alert: 'PrometheusTSDBWALCorruptions',
+            expr: |||
+              tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0
+            ||| % $._config,
+            'for': '4h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).',
+            },
+          },
+          {
+            alert: 'PrometheusNotIngestingSamples',
+            expr: |||
+              rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.",
+            },
+          },
+          {
+            alert: 'PrometheusTargetScrapesDuplicate',
+            expr: |||
+              increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values',
+            },
+          },
+          {
+            alert: 'PrometheusRemoteStorageFailures',
             expr: |||
               (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) * 100)
               /
@@ -60,7 +157,7 @@
             },
           },
           {
-            alert: 'PromRuleFailures',
+            alert: 'PrometheusRuleFailures',
             'for': '15m',
             expr: |||
              rate(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[1m]) > 0
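
Note (not part of the patch): below is a minimal sketch of how the rules above could be rendered for review. It assumes the mixin exposes its rule groups under a prometheusAlerts field and reads the label selector from $._config, as is conventional for monitoring mixins; the file name alerts-preview.jsonnet, the import path, and the selector value job="prometheus" are illustrative assumptions, not part of this change.

// alerts-preview.jsonnet -- hypothetical helper, not part of this patch.
// It fills in the %(prometheusSelector)s placeholders used by the rules
// above and emits a Prometheus rule file as JSON (which is also valid YAML).
local alerts = (import 'alerts.libsonnet') + {
  // Assumed: prometheusSelector is the only config key these rules need.
  _config:: {
    prometheusSelector: 'job="prometheus"',  // assumed selector value
  },
};

// Evaluate with `jsonnet alerts-preview.jsonnet > rules.json`; the output
// should be checkable with `promtool check rules rules.json`.
alerts.prometheusAlerts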