From ee1427faad521d942b2a8a703cdef81266d47d67 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Wed, 9 May 2018 19:02:33 +0200 Subject: [PATCH 01/26] Prometheus monitoring mixin for Prometheus itself. Signed-off-by: Tom Wilkie --- .../prometheus-mixin/alerts.libsonnet | 92 +++++++++++++++++++ .../prometheus-mixin/config.libsonnet | 7 ++ .../prometheus-mixin/mixin.libsonnet | 2 + 3 files changed, 101 insertions(+) create mode 100644 documentation/prometheus-mixin/alerts.libsonnet create mode 100644 documentation/prometheus-mixin/config.libsonnet create mode 100644 documentation/prometheus-mixin/mixin.libsonnet diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet new file mode 100644 index 0000000000..1246e10523 --- /dev/null +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -0,0 +1,92 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'prometheus', + rules: [ + { + alert: 'PromScrapeFailed', + expr: ||| + up != 1 + |||, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Prometheus failed to scrape a target {{ $labels.job }} / {{ $labels.instance }}', + }, + }, + { + alert: 'PromBadConfig', + expr: ||| + prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + mesage: 'Prometheus failed to reload config, see container logs', + }, + }, + { + alert: 'PromAlertmanagerBadConfig', + expr: ||| + alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Alertmanager failed to reload config, see container logs', + }, + }, + { + alert: 'PromAlertsFailed', + expr: ||| + sum(increase(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])) by (namespace) > 0 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + }, + 
annotations: { + message: 'Alertmanager failed to send an alert.', + }, + }, + { + alert: 'PromRemoteStorageFailures', + expr: ||| + (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) * 100) + / + (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) + rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[1m])) + > 1 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Prometheus failed to send {{ printf "%.1f" $value }}% samples', + }, + }, + { + alert: 'PromRuleFailures', + 'for': '15m', + expr: ||| + rate(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[1m]) > 0 + ||| % $._config, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Prometheus failed to evaluate {{ printf "%.1f" $value }} rules / s', + }, + }, + ], + }, + ], + }, +} diff --git a/documentation/prometheus-mixin/config.libsonnet b/documentation/prometheus-mixin/config.libsonnet new file mode 100644 index 0000000000..0e9daa7c00 --- /dev/null +++ b/documentation/prometheus-mixin/config.libsonnet @@ -0,0 +1,7 @@ +{ + _config+:: { + // Selectors are inserted between {} in Prometheus queries. + prometheusSelector: 'job="prometheus"', + alertmanagerSelector: 'job="alertmanager"', + }, +} diff --git a/documentation/prometheus-mixin/mixin.libsonnet b/documentation/prometheus-mixin/mixin.libsonnet new file mode 100644 index 0000000000..95efe331f7 --- /dev/null +++ b/documentation/prometheus-mixin/mixin.libsonnet @@ -0,0 +1,2 @@ +(import 'config.libsonnet') + +(import 'alerts.libsonnet') From e8a8ce5654161c861984c17c2e7d5a7e171127ae Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 7 Aug 2018 14:14:00 +0200 Subject: [PATCH 02/26] Basic Prometheus dashboard. 
Signed-off-by: Tom Wilkie --- .../prometheus-mixin/dashboards.libsonnet | 100 ++++++++++++++++++ .../prometheus-mixin/jsonnetfile.json | 14 +++ .../prometheus-mixin/mixin.libsonnet | 1 + 3 files changed, 115 insertions(+) create mode 100644 documentation/prometheus-mixin/dashboards.libsonnet create mode 100644 documentation/prometheus-mixin/jsonnetfile.json diff --git a/documentation/prometheus-mixin/dashboards.libsonnet b/documentation/prometheus-mixin/dashboards.libsonnet new file mode 100644 index 0000000000..8d80ff7877 --- /dev/null +++ b/documentation/prometheus-mixin/dashboards.libsonnet @@ -0,0 +1,100 @@ +local g = import 'grafana-builder/grafana.libsonnet'; + +{ + _config+:: { + storage_backend: error 'must specify storage backend (cassandra, gcp)', + }, + + dashboards+: { + 'prometheus.json': + g.dashboard('Prometheus') + .addMultiTemplate('job', 'prometheus_build_info', 'job') + .addMultiTemplate('instance', 'prometheus_build_info', 'instance') + # Prometheus is quite commonly configured with honor_labels set to true; + # therefor job and instance is not the prometheus server in many queries!. 
+ .addRow( + g.row('Prometheus Stats') + .addPanel( + g.panel('Prometheus Stats') + + g.tablePanel([ + 'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})', + 'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})', + ], { + job: { alias: 'Job' }, + instance: { alias: 'Instance' }, + verstion: { alias: 'Version' }, + 'Value #A': { alias: 'Count', type: 'hidden' }, + 'Value #B': { alias: 'Uptime' }, + }) + ) + ) + .addRow( + g.row('Discovery') + .addPanel( + g.panel('Target Sync') + + g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[2m])) by (scrape_job) * 1e3', '{{scrape_job}}') + + { yaxes: g.yaxes('ms') } + ) + .addPanel( + g.panel('Targets') + + g.queryPanel('count(up{})', 'Targets') + + g.stack + ) + ) + .addRow( + g.row('Retrieval') + .addPanel( + g.panel('Target Scrape Duration') + + g.queryPanel('1e3 * sum(scrape_duration_seconds) / count(scrape_duration_seconds)', 'Average') + + { yaxes: g.yaxes('ms') } + ) + .addPanel( + g.panel('Scrape failures') + + g.queryPanel([ + 'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))', + 'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))', + 'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))', + 'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))', + ], [ + 'exceeded sample limit: {{job}}', + 'duplicate timestamp: {{job}}', + 'out of bounds: {{job}}', + 'out of order: {{job}}', + ]) + + g.stack + ) + .addPanel( + g.panel('Appended Samples') + + g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[1m])', '{{job}} {{instance}}') + + g.stack + ) + ) + .addRow( + g.row('Storage') + .addPanel( + g.panel('Head Series') + + g.queryPanel('prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head 
series') + + g.stack + ) + .addPanel( + g.panel('Head Chunks') + + g.queryPanel('prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head chunks') + + g.stack + ) + ) + .addRow( + g.row('Query') + .addPanel( + g.panel('Query Rate') + + g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[1m])', '{{job}} {{instance}}') + + g.stack, + ) + .addPanel( + g.panel('Stage Duration') + + g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', '{{slice}}') + + { yaxes: g.yaxes('ms') } + + g.stack, + ) + ) + }, +} diff --git a/documentation/prometheus-mixin/jsonnetfile.json b/documentation/prometheus-mixin/jsonnetfile.json new file mode 100644 index 0000000000..c0261c367d --- /dev/null +++ b/documentation/prometheus-mixin/jsonnetfile.json @@ -0,0 +1,14 @@ +{ + "dependencies": [ + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/kausalco/public", + "subdir": "grafana-builder" + } + }, + "version": "master" + } + ] +} diff --git a/documentation/prometheus-mixin/mixin.libsonnet b/documentation/prometheus-mixin/mixin.libsonnet index 95efe331f7..3c983a3001 100644 --- a/documentation/prometheus-mixin/mixin.libsonnet +++ b/documentation/prometheus-mixin/mixin.libsonnet @@ -1,2 +1,3 @@ (import 'config.libsonnet') + +(import 'dashboards.libsonnet') + (import 'alerts.libsonnet') From 266ba185feded8664916506b932e24cfabc04334 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 7 Aug 2018 14:15:21 +0200 Subject: [PATCH 03/26] Remove PromScrapeFailed alert. 
Signed-off-by: Tom Wilkie --- documentation/prometheus-mixin/alerts.libsonnet | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 1246e10523..40cf06d66a 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -4,19 +4,6 @@ { name: 'prometheus', rules: [ - { - alert: 'PromScrapeFailed', - expr: ||| - up != 1 - |||, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: 'Prometheus failed to scrape a target {{ $labels.job }} / {{ $labels.instance }}', - }, - }, { alert: 'PromBadConfig', expr: ||| From 50861d586a794204718f2c8a96b1bf5989764ebb Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 7 Aug 2018 14:18:33 +0200 Subject: [PATCH 04/26] Alert if more than 1% of alerts fail for a given integration. Signed-off-by: Tom Wilkie --- documentation/prometheus-mixin/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 40cf06d66a..cf45037292 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -33,14 +33,14 @@ { alert: 'PromAlertsFailed', expr: ||| - sum(increase(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])) by (namespace) > 0 + 100 * rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m]) / rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m]) > 1 ||| % $._config, 'for': '5m', labels: { severity: 'critical', }, annotations: { - message: 'Alertmanager failed to send an alert.', + message: 'Alertmanager failed to send {{ printf "%.1f" $value }}% alerts to {{ $labels.integration }}.', }, }, { From 5fd712b210310a1d6acf9f8daa3a5bc73d01c61a Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 7 Aug 2018 14:25:12 
+0200 Subject: [PATCH 05/26] copypasta. Signed-off-by: Tom Wilkie --- documentation/prometheus-mixin/dashboards.libsonnet | 4 ---- 1 file changed, 4 deletions(-) diff --git a/documentation/prometheus-mixin/dashboards.libsonnet b/documentation/prometheus-mixin/dashboards.libsonnet index 8d80ff7877..1e64af4ca0 100644 --- a/documentation/prometheus-mixin/dashboards.libsonnet +++ b/documentation/prometheus-mixin/dashboards.libsonnet @@ -1,10 +1,6 @@ local g = import 'grafana-builder/grafana.libsonnet'; { - _config+:: { - storage_backend: error 'must specify storage backend (cassandra, gcp)', - }, - dashboards+: { 'prometheus.json': g.dashboard('Prometheus') From dfbdf8d3bb1a697ab2d9e7b09689d461072e0fb7 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Fri, 16 Nov 2018 17:23:14 +0000 Subject: [PATCH 06/26] Add a basic readme with link to the mixin docs. Signed-off-by: Tom Wilkie --- documentation/prometheus-mixin/README.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 documentation/prometheus-mixin/README.md diff --git a/documentation/prometheus-mixin/README.md b/documentation/prometheus-mixin/README.md new file mode 100644 index 0000000000..9596f381f8 --- /dev/null +++ b/documentation/prometheus-mixin/README.md @@ -0,0 +1,6 @@ +# Prometheus Mixin + +The Prometheus Mixin is a set of configurable, reusable and extensible alerts +and dashboards for Prometheus. + +For instructions on how to use mixins, see https://github.com/monitoring-mixins/docs. From 8f42192e527bf140c3151ea2fd5e4e66eb0d6867 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Mon, 19 Nov 2018 11:22:55 +0000 Subject: [PATCH 07/26] Add Prometheus alerts from kube-prometheus, remove the alertmanager alerts. 
Signed-off-by: Tom Wilkie --- .../prometheus-mixin/alerts.libsonnet | 123 ++++++++++++++++-- 1 file changed, 110 insertions(+), 13 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index cf45037292..97c92cf584 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -5,7 +5,7 @@ name: 'prometheus', rules: [ { - alert: 'PromBadConfig', + alert: 'PrometheusBadConfig', expr: ||| prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0 ||| % $._config, @@ -14,37 +14,134 @@ severity: 'critical', }, annotations: { - mesage: 'Prometheus failed to reload config, see container logs', + message: 'Prometheus failed to reload config, see container logs', }, }, { - alert: 'PromAlertmanagerBadConfig', + alert: 'PrometheusNotificationQueueRunningFull', expr: ||| - alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0 + predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) + > + prometheus_notifications_queue_capacity{%(prometheusSelector)s} + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}", + }, + }, + { + alert: 'PrometheusErrorSendingAlerts', + expr: ||| + 100 * rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) + / + rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 1 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}', + }, + }, + { + alert: 'PrometheusErrorSendingAlerts', + expr: ||| + 100 * rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) + / + 
rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 3 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}', + }, + }, + { + alert: 'PrometheusNotConnectedToAlertmanagers', + expr: ||| + prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1 ||| % $._config, 'for': '10m', labels: { - severity: 'critical', + severity: 'warning', }, annotations: { - message: 'Alertmanager failed to reload config, see container logs', + message: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers', }, }, { - alert: 'PromAlertsFailed', + alert: 'PrometheusTSDBReloadsFailing', expr: ||| - 100 * rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m]) / rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m]) > 1 + increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0 ||| % $._config, - 'for': '5m', + 'for': '12h', labels: { - severity: 'critical', + severity: 'warning', }, annotations: { - message: 'Alertmanager failed to send {{ printf "%.1f" $value }}% alerts to {{ $labels.integration }}.', + message: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.', }, }, { - alert: 'PromRemoteStorageFailures', + alert: 'PrometheusTSDBCompactionsFailing', + expr: ||| + increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0 + ||| % $._config, + 'for': '12h', + labels: { + severity: 'warning', + }, + annotations: { + message: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.', + }, + }, + { + alert: 'PrometheusTSDBWALCorruptions', + expr: ||| + tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0 + ||| % 
$._config, + 'for': '4h', + labels: { + severity: 'warning', + }, + annotations: { + message: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).', + }, + }, + { + alert: 'PrometheusNotIngestingSamples', + expr: ||| + rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + message: { + description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.", + }, + }, + { + alert: 'PrometheusTargetScrapesDuplicate', + expr: ||| + increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + message: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values', + }, + }, + { + alert: 'PrometheusRemoteStorageFailures', expr: ||| (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) * 100) / @@ -60,7 +157,7 @@ }, }, { - alert: 'PromRuleFailures', + alert: 'PrometheusRuleFailures', 'for': '15m', expr: ||| rate(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[1m]) > 0 From 638204c7756ed2e7cc23480a71a7623ebe22552f Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Mon, 19 Nov 2018 12:23:42 +0000 Subject: [PATCH 08/26] Typo Signed-off-by: Tom Wilkie --- documentation/prometheus-mixin/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 97c92cf584..4c66c4a9ba 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -123,8 +123,8 @@ labels: { severity: 'warning', }, - message: { - description: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.", + annotations: { + message: 
"Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.", }, }, { From e248ffb220d3a3371877188eba0687a2da841e61 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 12 Feb 2019 15:22:58 +0000 Subject: [PATCH 09/26] Add alert for WAL remote write falling behind. Signed-off-by: Tom Wilkie --- documentation/prometheus-mixin/alerts.libsonnet | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 4c66c4a9ba..08b43b4ab3 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -156,6 +156,22 @@ message: 'Prometheus failed to send {{ printf "%.1f" $value }}% samples', }, }, + { + alert: 'PrometheusRemoteWriteBehind', + expr: ||| + prometheus_remote_storage_highest_timestamp_in{%(prometheusSelector)s} + - on(job, instance) group_right + prometheus_remote_storage_queue_highest_sent_timestamp{%(prometheusSelector)s} + > 60 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Prometheus remote write is {{ printf "%.1f" $value }}s behind.', + }, + }, { alert: 'PrometheusRuleFailures', 'for': '15m', From b6150692897b76bbe4ea4707930bb26aa68b12be Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Fri, 1 Mar 2019 07:39:48 -0800 Subject: [PATCH 10/26] Update metric names. 
Signed-off-by: Tom Wilkie --- documentation/prometheus-mixin/alerts.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 08b43b4ab3..5a1fb8de03 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -159,9 +159,9 @@ { alert: 'PrometheusRemoteWriteBehind', expr: ||| - prometheus_remote_storage_highest_timestamp_in{%(prometheusSelector)s} + prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s} - on(job, instance) group_right - prometheus_remote_storage_queue_highest_sent_timestamp{%(prometheusSelector)s} + prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s} > 60 ||| % $._config, 'for': '15m', From 38a9bbbec2a10d924fd8d7b4b07422779a78a3ea Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Mon, 4 Mar 2019 12:47:24 +0000 Subject: [PATCH 11/26] Loosen off PrometheusRemoteWriteBehind alert. Signed-off-by: Tom Wilkie --- documentation/prometheus-mixin/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 5a1fb8de03..86def1e80a 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -162,7 +162,7 @@ prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s} - on(job, instance) group_right prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s} - > 60 + > 120 ||| % $._config, 'for': '15m', labels: { From a5762f3681385093d520555cd96a97370cf6147e Mon Sep 17 00:00:00 2001 From: Callum Styan Date: Mon, 17 Jun 2019 15:02:42 -0700 Subject: [PATCH 12/26] Add dashboard for remote write to prometheus-mixin. 
Signed-off-by: Callum Styan --- .../prometheus-mixin/dashboards.libsonnet | 58 ++++++++++++++++++- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/documentation/prometheus-mixin/dashboards.libsonnet b/documentation/prometheus-mixin/dashboards.libsonnet index 1e64af4ca0..b3a2ba2f70 100644 --- a/documentation/prometheus-mixin/dashboards.libsonnet +++ b/documentation/prometheus-mixin/dashboards.libsonnet @@ -91,6 +91,60 @@ local g = import 'grafana-builder/grafana.libsonnet'; { yaxes: g.yaxes('ms') } + g.stack, ) - ) - }, + ), + # Remote write specific dashboard. + 'prometheus-remote-write.json': + g.dashboard('Prometheus Remote Write') + .addMultiTemplate('instance', 'prometheus_build_info', 'instance') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*prometheus.*"}', 'cluster') + .addRow( + g.row('Timestamps') + .addPanel( + g.panel('Highest Timestamp In vs. Highest Timestamp Sent') + + g.queryPanel('prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} - ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}') + + { yaxes: g.yaxes('s') } + ) + .addPanel( + g.panel('Rate[5m]') + + g.queryPanel('rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m]) - ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}') + ) + ) + .addRow( + g.row('Samples') + .addPanel( + g.panel('Rate, in vs. 
succeeded or dropped [5m]') + + g.queryPanel('rate(prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])- ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) - rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}') + ) + ) + .addRow( + g.row('Shards') + .addPanel( + g.panel("Num. Shards") + + g.queryPanel('prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}','{{cluster}}:{{instance}}-{{queue}}') + ) + .addPanel( + g.panel("Capacity") + + g.queryPanel('prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}','{{cluster}}:{{instance}}-{{queue}}') + ) + ) + .addRow( + g.row('Misc Rates.') + .addPanel( + g.panel("Dropped Samples") + + g.queryPanel('rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])','{{cluster}}:{{instance}}-{{queue}}') + ) + .addPanel( + g.panel("Failed Samples") + + g.queryPanel('rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])','{{cluster}}:{{instance}}-{{queue}}') + ) + .addPanel( + g.panel("Retried Samples") + + g.queryPanel('rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])','{{cluster}}:{{instance}}-{{queue}}') + ) + .addPanel( + g.panel("Enqueue Retries") + + g.queryPanel('rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])','{{cluster}}:{{instance}}-{{queue}}') + ) + ) + } } From e943803a3c36f928ae30b5310eaca007517ed757 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 26 Jun 2019 15:22:23 +0200 Subject: [PATCH 13/26] Add .gitignore file Signed-off-by: beorn7 --- documentation/prometheus-mixin/.gitignore | 4 ++++ documentation/prometheus-mixin/README.md | 9 ++++++++- 2 files 
changed, 12 insertions(+), 1 deletion(-) create mode 100644 documentation/prometheus-mixin/.gitignore diff --git a/documentation/prometheus-mixin/.gitignore b/documentation/prometheus-mixin/.gitignore new file mode 100644 index 0000000000..b23a75c9bd --- /dev/null +++ b/documentation/prometheus-mixin/.gitignore @@ -0,0 +1,4 @@ +*.yaml +dashboards_out +vendor +jsonnetfile.lock.json diff --git a/documentation/prometheus-mixin/README.md b/documentation/prometheus-mixin/README.md index 9596f381f8..01a7aadeb9 100644 --- a/documentation/prometheus-mixin/README.md +++ b/documentation/prometheus-mixin/README.md @@ -1,6 +1,13 @@ # Prometheus Mixin -The Prometheus Mixin is a set of configurable, reusable and extensible alerts +The Prometheus Mixin is a set of configurable, reusable, and extensible alerts and dashboards for Prometheus. For instructions on how to use mixins, see https://github.com/monitoring-mixins/docs. + + +TODO: need jsonnet v0.10+ +TODO: add MAkefile, explain things. + +go get github.com/google/go-jsonnet/cmd/jsonnet +go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb From ddfabda152ce6b2e68e02c03589045d90d24c3ca Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 26 Jun 2019 15:30:55 +0200 Subject: [PATCH 14/26] Add Makefile and suitable jsonnet files This makes the mixins usable as advertised. 
Signed-off-by: beorn7 --- documentation/prometheus-mixin/Makefile | 25 +++++++++++++++++++ documentation/prometheus-mixin/alerts.jsonnet | 1 + .../prometheus-mixin/dashboards.jsonnet | 6 +++++ 3 files changed, 32 insertions(+) create mode 100644 documentation/prometheus-mixin/Makefile create mode 100644 documentation/prometheus-mixin/alerts.jsonnet create mode 100644 documentation/prometheus-mixin/dashboards.jsonnet diff --git a/documentation/prometheus-mixin/Makefile b/documentation/prometheus-mixin/Makefile new file mode 100644 index 0000000000..8319b5edf9 --- /dev/null +++ b/documentation/prometheus-mixin/Makefile @@ -0,0 +1,25 @@ +JSONNET_FMT := jsonnet fmt -n 2 --max-blank-lines 2 --string-style s --comment-style s + +all: fmt prometheus_alerts.yaml dashboards_out lint + +fmt: + find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + xargs -n 1 -- $(JSONNET_FMT) -i + +prometheus_alerts.yaml: mixin.libsonnet config.libsonnet alerts.libsonnet + jsonnet -S alerts.jsonnet > $@ + +dashboards_out: mixin.libsonnet config.libsonnet dashboards.libsonnet + @mkdir -p dashboards_out + jsonnet -J vendor -m dashboards_out dashboards.jsonnet + +lint: prometheus_alerts.yaml + find . 
-name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + while read f; do \ + $(JSONNET_FMT) "$$f" | diff -u "$$f" -; \ + done + + promtool check rules prometheus_alerts.yaml + +clean: + rm -rf dashboards_out prometheus_alerts.yaml diff --git a/documentation/prometheus-mixin/alerts.jsonnet b/documentation/prometheus-mixin/alerts.jsonnet new file mode 100644 index 0000000000..75e7c1b297 --- /dev/null +++ b/documentation/prometheus-mixin/alerts.jsonnet @@ -0,0 +1 @@ +std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts) diff --git a/documentation/prometheus-mixin/dashboards.jsonnet b/documentation/prometheus-mixin/dashboards.jsonnet new file mode 100644 index 0000000000..fb102817cd --- /dev/null +++ b/documentation/prometheus-mixin/dashboards.jsonnet @@ -0,0 +1,6 @@ +local dashboards = (import 'mixin.libsonnet').dashboards; + +{ + [name]: dashboards[name] + for name in std.objectFields(dashboards) +} From 5c04ef3935a8843d6ec8ca9ad52fb1d82e99ddc4 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 26 Jun 2019 15:36:49 +0200 Subject: [PATCH 15/26] Make README.md immediately useful Signed-off-by: beorn7 --- documentation/prometheus-mixin/README.md | 28 +++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/documentation/prometheus-mixin/README.md b/documentation/prometheus-mixin/README.md index 01a7aadeb9..1cb009f683 100644 --- a/documentation/prometheus-mixin/README.md +++ b/documentation/prometheus-mixin/README.md @@ -3,11 +3,29 @@ The Prometheus Mixin is a set of configurable, reusable, and extensible alerts and dashboards for Prometheus. -For instructions on how to use mixins, see https://github.com/monitoring-mixins/docs. +To use them, you need to have `jsonnet` (v0.10+) and `jb` installed. 
If you +have a working Go development environment, it's easiest to run the following: +```bash +$ go get github.com/google/go-jsonnet/cmd/jsonnet +$ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb +``` +_Note: The make targets `lint` and `fmt` currently don't work with the Go +implementation of `jsonnet`. For the time being, you have to install the [C++ +version of jsonnet](https://github.com/google/jsonnet) if you want to use them._ -TODO: need jsonnet v0.10+ -TODO: add MAkefile, explain things. +Next, install the dependencies by running the following command in this +directory: +```bash +$ jb install +``` + +You can then build a `prometheus_alerts.yaml` with the alerts and a directory +`dashboards_out` with the Grafana dashboard JSON files: +```bash +$ make prometheus_alerts.yaml +$ make dashboards_out +``` + +For more advanced uses of mixins, see https://github.com/monitoring-mixins/docs. -go get github.com/google/go-jsonnet/cmd/jsonnet -go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb From d45e8a0f61c0425f064f727185b0df03525b9b92 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 26 Jun 2019 16:22:21 +0200 Subject: [PATCH 16/26] Adjust to jsonnet v0.13 Signed-off-by: beorn7 --- documentation/prometheus-mixin/Makefile | 2 +- documentation/prometheus-mixin/README.md | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/documentation/prometheus-mixin/Makefile b/documentation/prometheus-mixin/Makefile index 8319b5edf9..9ade5aa2b6 100644 --- a/documentation/prometheus-mixin/Makefile +++ b/documentation/prometheus-mixin/Makefile @@ -1,4 +1,4 @@ -JSONNET_FMT := jsonnet fmt -n 2 --max-blank-lines 2 --string-style s --comment-style s +JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s all: fmt prometheus_alerts.yaml dashboards_out lint diff --git a/documentation/prometheus-mixin/README.md b/documentation/prometheus-mixin/README.md index 1cb009f683..7f211b2553 100644 --- 
a/documentation/prometheus-mixin/README.md +++ b/documentation/prometheus-mixin/README.md @@ -3,16 +3,18 @@ The Prometheus Mixin is a set of configurable, reusable, and extensible alerts and dashboards for Prometheus. -To use them, you need to have `jsonnet` (v0.10+) and `jb` installed. If you +To use them, you need to have `jsonnet` (v0.13+) and `jb` installed. If you have a working Go development environment, it's easiest to run the following: ```bash $ go get github.com/google/go-jsonnet/cmd/jsonnet $ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb ``` -_Note: The make targets `lint` and `fmt` currently don't work with the Go -implementation of `jsonnet`. For the time being, you have to install the [C++ -version of jsonnet](https://github.com/google/jsonnet) if you want to use them._ +_Note: The make targets `lint` and `fmt` need the `jsonnetfmt` binary, which is +currently not included in the Go implementation of `jsonnet`. For the time +being, you have to install the [C++ version of +jsonnetfmt](https://github.com/google/jsonnet) if you want to use `make lint` +or `make fmt`._ Next, install the dependencies by running the following command in this directory: From d5845ad05b1e1599d06ab412d5ab34621ec5ade5 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 26 Jun 2019 16:23:09 +0200 Subject: [PATCH 17/26] Fix formatting This is the outcome of `make fmt`. Signed-off-by: beorn7 --- .../prometheus-mixin/dashboards.libsonnet | 106 +++++++++--------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/documentation/prometheus-mixin/dashboards.libsonnet b/documentation/prometheus-mixin/dashboards.libsonnet index b3a2ba2f70..8cc00f6d67 100644 --- a/documentation/prometheus-mixin/dashboards.libsonnet +++ b/documentation/prometheus-mixin/dashboards.libsonnet @@ -92,59 +92,59 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.stack, ) ), - # Remote write specific dashboard. + // Remote write specific dashboard. 
'prometheus-remote-write.json': g.dashboard('Prometheus Remote Write') - .addMultiTemplate('instance', 'prometheus_build_info', 'instance') - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*prometheus.*"}', 'cluster') - .addRow( - g.row('Timestamps') - .addPanel( - g.panel('Highest Timestamp In vs. Highest Timestamp Sent') + - g.queryPanel('prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} - ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}') + - { yaxes: g.yaxes('s') } - ) - .addPanel( - g.panel('Rate[5m]') + - g.queryPanel('rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m]) - ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}') - ) - ) - .addRow( - g.row('Samples') - .addPanel( - g.panel('Rate, in vs. succeeded or dropped [5m]') + - g.queryPanel('rate(prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])- ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) - rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}') - ) - ) - .addRow( - g.row('Shards') - .addPanel( - g.panel("Num. 
Shards") + - g.queryPanel('prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}','{{cluster}}:{{instance}}-{{queue}}') - ) - .addPanel( - g.panel("Capacity") + - g.queryPanel('prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}','{{cluster}}:{{instance}}-{{queue}}') - ) - ) - .addRow( - g.row('Misc Rates.') - .addPanel( - g.panel("Dropped Samples") + - g.queryPanel('rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])','{{cluster}}:{{instance}}-{{queue}}') - ) - .addPanel( - g.panel("Failed Samples") + - g.queryPanel('rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])','{{cluster}}:{{instance}}-{{queue}}') - ) - .addPanel( - g.panel("Retried Samples") + - g.queryPanel('rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])','{{cluster}}:{{instance}}-{{queue}}') - ) - .addPanel( - g.panel("Enqueue Retries") + - g.queryPanel('rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])','{{cluster}}:{{instance}}-{{queue}}') - ) - ) - } + .addMultiTemplate('instance', 'prometheus_build_info', 'instance') + .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*prometheus.*"}', 'cluster') + .addRow( + g.row('Timestamps') + .addPanel( + g.panel('Highest Timestamp In vs. 
Highest Timestamp Sent') + + g.queryPanel('prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} - ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}') + + { yaxes: g.yaxes('s') } + ) + .addPanel( + g.panel('Rate[5m]') + + g.queryPanel('rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m]) - ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}') + ) + ) + .addRow( + g.row('Samples') + .addPanel( + g.panel('Rate, in vs. succeeded or dropped [5m]') + + g.queryPanel('rate(prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])- ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) - rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}') + ) + ) + .addRow( + g.row('Shards') + .addPanel( + g.panel('Num. 
Shards') + + g.queryPanel('prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}') + ) + .addPanel( + g.panel('Capacity') + + g.queryPanel('prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}') + ) + ) + .addRow( + g.row('Misc Rates.') + .addPanel( + g.panel('Dropped Samples') + + g.queryPanel('rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}') + ) + .addPanel( + g.panel('Failed Samples') + + g.queryPanel('rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}') + ) + .addPanel( + g.panel('Retried Samples') + + g.queryPanel('rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}') + ) + .addPanel( + g.panel('Enqueue Retries') + + g.queryPanel('rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}') + ) + ), + }, } From 23c03207e966dc9d2797fc170786eaaef283dddf Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 26 Jun 2019 20:31:05 +0200 Subject: [PATCH 18/26] Fixed indentation Signed-off-by: beorn7 --- .../prometheus-mixin/alerts.libsonnet | 56 ++++++++++++------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 86def1e80a..5ec1f4f03c 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -20,46 +20,55 @@ { alert: 'PrometheusNotificationQueueRunningFull', expr: ||| - predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) + ( + 
predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) > - prometheus_notifications_queue_capacity{%(prometheusSelector)s} + prometheus_notifications_queue_capacity{%(prometheusSelector)s} + ) ||| % $._config, 'for': '15m', labels: { severity: 'warning', }, annotations: { - message: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}", + message: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{$labels.pod}}", }, }, { alert: 'PrometheusErrorSendingAlerts', expr: ||| - 100 * rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) + ( + rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / - rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 1 + rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 1 + ) + * 100 ||| % $._config, 'for': '15m', labels: { severity: 'warning', }, annotations: { - message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}', + message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.Alertmanager}}', }, }, { alert: 'PrometheusErrorSendingAlerts', expr: ||| - 100 * rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) + ( + rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / - rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 3 + rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) + ) + * 100 + > 3 ||| % $._config, 'for': '15m', labels: { severity: 'critical', }, annotations: { - message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.Alertmanager}}', + message: '{{ printf "%.1f" $value }}% errors 
while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.Alertmanager}}', }, }, { @@ -72,7 +81,7 @@ severity: 'warning', }, annotations: { - message: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers', + message: 'Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers', }, }, { @@ -124,7 +133,7 @@ severity: 'warning', }, annotations: { - message: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.", + message: "Prometheus {{$labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.", }, }, { @@ -143,10 +152,17 @@ { alert: 'PrometheusRemoteStorageFailures', expr: ||| - (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) * 100) - / - (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) + rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[1m])) - > 1 + ( + rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) + / + ( + rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) + + + rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[1m]) + ) + ) + * 100 + > 1 ||| % $._config, 'for': '15m', labels: { @@ -159,10 +175,12 @@ { alert: 'PrometheusRemoteWriteBehind', expr: ||| - prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s} - - on(job, instance) group_right - prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s} - > 120 + ( + prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s} + - on(job, instance) group_right + prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s} + ) + > 120 ||| % $._config, 'for': '15m', labels: { From e34af6d4d3b580a2cf279d9295824fdf676d1a92 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 26 Jun 2019 23:22:16 +0200 Subject: 
[PATCH 19/26] Address various comments from the review Signed-off-by: beorn7 --- .../prometheus-mixin/alerts.libsonnet | 20 +++++++++---------- .../prometheus-mixin/dashboards.libsonnet | 16 +++++++-------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 5ec1f4f03c..ef604a159e 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -87,9 +87,9 @@ { alert: 'PrometheusTSDBReloadsFailing', expr: ||| - increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0 + increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[3h]) > 0 ||| % $._config, - 'for': '12h', + 'for': '4h', labels: { severity: 'warning', }, @@ -100,9 +100,9 @@ { alert: 'PrometheusTSDBCompactionsFailing', expr: ||| - increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0 + increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[3h]) > 0 ||| % $._config, - 'for': '12h', + 'for': '4h', labels: { severity: 'warning', }, @@ -113,7 +113,7 @@ { alert: 'PrometheusTSDBWALCorruptions', expr: ||| - tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0 + increase(tsdb_wal_corruptions_total{%(prometheusSelector)s}[3h]) > 0 ||| % $._config, 'for': '4h', labels: { @@ -153,12 +153,12 @@ alert: 'PrometheusRemoteStorageFailures', expr: ||| ( - rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) + rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m]) / ( - rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) + rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m]) + - rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[1m]) + rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[5m]) ) ) * 100 @@ 
-192,10 +192,10 @@ }, { alert: 'PrometheusRuleFailures', - 'for': '15m', expr: ||| - rate(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[1m]) > 0 + rate(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[5m]) > 0 ||| % $._config, + 'for': '15m', labels: { severity: 'critical', }, diff --git a/documentation/prometheus-mixin/dashboards.libsonnet b/documentation/prometheus-mixin/dashboards.libsonnet index 8cc00f6d67..c7df870132 100644 --- a/documentation/prometheus-mixin/dashboards.libsonnet +++ b/documentation/prometheus-mixin/dashboards.libsonnet @@ -7,7 +7,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addMultiTemplate('job', 'prometheus_build_info', 'job') .addMultiTemplate('instance', 'prometheus_build_info', 'instance') # Prometheus is quite commonly configured with honor_labels set to true; - # therefor job and instance is not the prometheus server in many queries!. + # therefore job and instance is not the prometheus server in many queries! 
.addRow( g.row('Prometheus Stats') .addPanel( @@ -18,7 +18,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; ], { job: { alias: 'Job' }, instance: { alias: 'Instance' }, - verstion: { alias: 'Version' }, + version: { alias: 'Version' }, 'Value #A': { alias: 'Count', type: 'hidden' }, 'Value #B': { alias: 'Uptime' }, }) @@ -28,20 +28,20 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Discovery') .addPanel( g.panel('Target Sync') + - g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[2m])) by (scrape_job) * 1e3', '{{scrape_job}}') + + g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', '{{scrape_job}}') + { yaxes: g.yaxes('ms') } ) .addPanel( g.panel('Targets') + - g.queryPanel('count(up{})', 'Targets') + + g.queryPanel('sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', 'Targets') + g.stack ) ) .addRow( g.row('Retrieval') .addPanel( - g.panel('Target Scrape Duration') + - g.queryPanel('1e3 * sum(scrape_duration_seconds) / count(scrape_duration_seconds)', 'Average') + + g.panel('Average Scrape Interval Duration') + + g.queryPanel('rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', '{{interval}} configured') + { yaxes: g.yaxes('ms') } ) .addPanel( @@ -61,7 +61,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; ) .addPanel( g.panel('Appended Samples') + - g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[1m])', '{{job}} {{instance}}') + + g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', '{{job}} {{instance}}') + g.stack ) ) @@ -82,7 +82,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Query') .addPanel( g.panel('Query Rate') + - 
g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[1m])', '{{job}} {{instance}}') + + g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])', '{{job}} {{instance}}') + g.stack, ) .addPanel( From 613cb5430c05e470671ffe6145f3d20c95664791 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 26 Jun 2019 23:24:22 +0200 Subject: [PATCH 20/26] Add a "work in progress" disclaimer. Signed-off-by: beorn7 --- documentation/prometheus-mixin/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/documentation/prometheus-mixin/README.md b/documentation/prometheus-mixin/README.md index 7f211b2553..c44e70ca0c 100644 --- a/documentation/prometheus-mixin/README.md +++ b/documentation/prometheus-mixin/README.md @@ -1,5 +1,8 @@ # Prometheus Mixin +_This is work in progress. We aim for it to become a good role model for alerts +and dashboards eventually, but it is not quite there yet._ + The Prometheus Mixin is a set of configurable, reusable, and extensible alerts and dashboards for Prometheus. 
From 1336a2884830bd12d4d2fcfc2a7547d32bad6928 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 27 Jun 2019 14:34:11 +0200 Subject: [PATCH 21/26] Use a config variable for the Prometheus name Signed-off-by: beorn7 --- .../prometheus-mixin/alerts.libsonnet | 26 +++++++++---------- .../prometheus-mixin/config.libsonnet | 8 ++++++ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index ef604a159e..5394a7fed3 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -14,7 +14,7 @@ severity: 'critical', }, annotations: { - message: 'Prometheus failed to reload config, see container logs', + message: 'Prometheus %(prometheusName)s failed to reload config, see container logs' % $._config, }, }, { @@ -31,7 +31,7 @@ severity: 'warning', }, annotations: { - message: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{$labels.pod}}", + message: "Prometheus's alert notification queue is running full for %(prometheusName)s" % $._config, }, }, { @@ -49,7 +49,7 @@ severity: 'warning', }, annotations: { - message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.Alertmanager}}', + message: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.Alertmanager}}' % $._config, }, }, { @@ -68,7 +68,7 @@ severity: 'critical', }, annotations: { - message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.Alertmanager}}', + message: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.Alertmanager}}' % $._config, }, }, { @@ -81,7 +81,7 @@ severity: 'warning', }, annotations: { - message: 
'Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers', + message: 'Prometheus %(prometheusName)s is not connected to any Alertmanagers' % $._config, }, }, { @@ -94,7 +94,7 @@ severity: 'warning', }, annotations: { - message: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last four hours.', + message: 'Prometheus %(prometheusName)s had {{$value | humanize}} reload failures over the last four hours.' % $._config, }, }, { @@ -107,7 +107,7 @@ severity: 'warning', }, annotations: { - message: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last four hours.', + message: 'Prometheus %(prometheusName)s had {{$value | humanize}} compaction failures over the last four hours.' % $._config, }, }, { @@ -120,7 +120,7 @@ severity: 'warning', }, annotations: { - message: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).', + message: 'Prometheus %(prometheusName)s has a corrupted write-ahead log (WAL).' % $._config, }, }, { @@ -133,7 +133,7 @@ severity: 'warning', }, annotations: { - message: "Prometheus {{$labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.", + message: "Prometheus %(prometheusName)s isn't ingesting samples." 
% $._config, }, }, { @@ -146,7 +146,7 @@ severity: 'warning', }, annotations: { - message: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values', + message: 'Prometheus %(prometheusName)s has many samples rejected due to duplicate timestamps but different values' % $._config, }, }, { @@ -169,7 +169,7 @@ severity: 'critical', }, annotations: { - message: 'Prometheus failed to send {{ printf "%.1f" $value }}% samples', + message: 'Prometheus %(prometheusName)s failed to send {{ printf "%%.1f" $value }}%% samples' % $._config, }, }, { @@ -187,7 +187,7 @@ severity: 'critical', }, annotations: { - message: 'Prometheus remote write is {{ printf "%.1f" $value }}s behind.', + message: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind.' % $._config, }, }, { @@ -200,7 +200,7 @@ severity: 'critical', }, annotations: { - message: 'Prometheus failed to evaluate {{ printf "%.1f" $value }} rules / s', + message: 'Prometheus %(prometheusName)s failed to evaluate {{ printf "%%.1f" $value }} rules / s' % $._config, }, }, ], diff --git a/documentation/prometheus-mixin/config.libsonnet b/documentation/prometheus-mixin/config.libsonnet index 0e9daa7c00..bb4099132a 100644 --- a/documentation/prometheus-mixin/config.libsonnet +++ b/documentation/prometheus-mixin/config.libsonnet @@ -3,5 +3,13 @@ // Selectors are inserted between {} in Prometheus queries. prometheusSelector: 'job="prometheus"', alertmanagerSelector: 'job="alertmanager"', + + // prometheusName is inserted into annotations to name the Prometheus + // instance affected by the alert. 
+ prometheusName: '{{$labels.instance}}', + // If you run Prometheus on Kubernetes with the Prometheus + // Operator, you can make use of the configured target labels for + // nicer naming: + // prometheusName: '{{$labels.namespace}}/{{$labels.pod}}' }, } From ded0705bdcf522acde25e1c6f202441be38f39b4 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 27 Jun 2019 14:39:38 +0200 Subject: [PATCH 22/26] Update remote repo for grafana-builder dependency Signed-off-by: beorn7 --- documentation/prometheus-mixin/jsonnetfile.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/prometheus-mixin/jsonnetfile.json b/documentation/prometheus-mixin/jsonnetfile.json index c0261c367d..b5d0ad347a 100644 --- a/documentation/prometheus-mixin/jsonnetfile.json +++ b/documentation/prometheus-mixin/jsonnetfile.json @@ -4,7 +4,7 @@ "name": "grafana-builder", "source": { "git": { - "remote": "https://github.com/kausalco/public", + "remote": "https://github.com/grafana/jsonnet-libs", "subdir": "grafana-builder" } }, From 7a25a2586d0ed6e09ff6a21ac7708fb6143fb423 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Thu, 27 Jun 2019 23:50:26 +0200 Subject: [PATCH 23/26] Sync with alerts from kube-prometheus While doing so, re-introduce the summary/description annotations. Also, add a few more rules and tweak a few of the existing ones.
Signed-off-by: beorn7 --- .../prometheus-mixin/alerts.libsonnet | 82 ++++++++++++++----- 1 file changed, 62 insertions(+), 20 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 5394a7fed3..654f74539e 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -14,7 +14,8 @@ severity: 'critical', }, annotations: { - message: 'Prometheus %(prometheusName)s failed to reload config, see container logs' % $._config, + summary: 'Failed Prometheus configuration reload.', + description: 'Prometheus %(prometheusName)s has failed to reload its configuration.' % $._config, }, }, { @@ -31,31 +32,34 @@ severity: 'warning', }, annotations: { - message: "Prometheus's alert notification queue is running full for %(prometheusName)s" % $._config, + summary: 'Prometheus alert notification queue predicted to run full in less than 30m.', + description: 'Alert notification queue of Prometheus %(prometheusName)s is running full.' % $._config, }, }, { - alert: 'PrometheusErrorSendingAlerts', + alert: 'PrometheusErrorSendingAlertsToSomeAlertmanagers', expr: ||| ( rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / - rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 1 + rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) ) * 100 + > 1 ||| % $._config, 'for': '15m', labels: { severity: 'warning', }, annotations: { - message: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.Alertmanager}}' % $._config, + summary: 'Prometheus encounters more than 1% errors sending alerts to a specific Alertmanager.', + description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' 
% $._config, }, }, { - alert: 'PrometheusErrorSendingAlerts', + alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager', expr: ||| - ( + min without(alertmanager) ( rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m]) / rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) @@ -68,7 +72,8 @@ severity: 'critical', }, annotations: { - message: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.Alertmanager}}' % $._config, + summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.', + description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config, }, }, { @@ -81,7 +86,8 @@ severity: 'warning', }, annotations: { - message: 'Prometheus %(prometheusName)s is not connected to any Alertmanagers' % $._config, + summary: 'Prometheus is not connected to any Alertmanagers.', + description: 'Prometheus %(prometheusName)s is not connected to any Alertmanagers.' % $._config, }, }, { @@ -94,7 +100,8 @@ severity: 'warning', }, annotations: { - message: 'Prometheus %(prometheusName)s had {{$value | humanize}} reload failures over the last four hours.' % $._config, + summary: 'Prometheus has issues reloading blocks from disk.', + description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} reload failures over the last 3h.' % $._config, }, }, { @@ -107,7 +114,8 @@ severity: 'warning', }, annotations: { - message: 'Prometheus %(prometheusName)s had {{$value | humanize}} compaction failures over the last four hours.' % $._config, + summary: 'Prometheus has issues compacting blocks.', + description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} compaction failures over the last 3h.' 
% $._config, }, }, { @@ -120,7 +128,8 @@ severity: 'warning', }, annotations: { - message: 'Prometheus %(prometheusName)s has a corrupted write-ahead log (WAL).' % $._config, + summary: 'Prometheus is detecting WAL corruptions.', + description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} corruptions of the write-ahead log (WAL) over the last 3h.' % $._config, }, }, { @@ -133,20 +142,36 @@ severity: 'warning', }, annotations: { - message: "Prometheus %(prometheusName)s isn't ingesting samples." % $._config, + summary: 'Prometheus is not ingesting samples.', + description: 'Prometheus %(prometheusName)s is not ingesting samples.' % $._config, }, }, { - alert: 'PrometheusTargetScrapesDuplicate', + alert: 'PrometheusDuplicateTimestamps', expr: ||| - increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0 + rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0 ||| % $._config, 'for': '10m', labels: { severity: 'warning', }, annotations: { - message: 'Prometheus %(prometheusName)s has many samples rejected due to duplicate timestamps but different values' % $._config, + summary: 'Prometheus drops samples with duplicate timestamps.', + description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with different values but duplicated timestamp.' % $._config, + }, + }, + { + alert: 'PrometheusOutOfOrderTimestamps', + expr: ||| + rate(prometheus_target_scrapes_sample_out_of_order_total{%(prometheusSelector)s}[5m]) > 0 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Prometheus drops samples with out-of-order timestamps.', + description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with timestamps arriving out of order.' 
% $._config, }, }, { @@ -169,7 +194,8 @@ severity: 'critical', }, annotations: { - message: 'Prometheus %(prometheusName)s failed to send {{ printf "%%.1f" $value }}%% samples' % $._config, + summary: 'Prometheus fails to send samples to remote storage.', + description: 'Prometheus %(prometheusName)s failed to send {{ printf "%%.1f" $value }}%% of the samples to queue {{$labels.queue}}.' % $._config, }, }, { @@ -187,20 +213,36 @@ severity: 'critical', }, annotations: { - message: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind.' % $._config, + summary: 'Prometheus remote write is behind.', + description: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind for queue {{$labels.queue}}.' % $._config, }, }, { alert: 'PrometheusRuleFailures', expr: ||| - rate(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[5m]) > 0 + increase(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[5m]) > 0 ||| % $._config, 'for': '15m', labels: { severity: 'critical', }, annotations: { - message: 'Prometheus %(prometheusName)s failed to evaluate {{ printf "%%.1f" $value }} rules / s' % $._config, + summary: 'Prometheus fails to evaluate rules.', + description: 'Prometheus %(prometheusName)s has failed to evaluate {{ printf "%%.0f" $value }} rules in the last 5m.' % $._config, + }, + }, + { + alert: 'PrometheusMissingRuleEvaluations', + expr: ||| + increase(prometheus_rule_group_iterations_missed_total{%(prometheusSelector)s}[5m]) > 0 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'Prometheus misses rule evaluations due to slow rule group evaluation.', + description: 'Prometheus %(prometheusName)s has missed {{ printf "%%.0f" $value }} rule group evaluations in the last 5m.' 
% $._config, }, }, ], From 52707535b8e1359e9cb30bef474015622834e33a Mon Sep 17 00:00:00 2001 From: beorn7 Date: Fri, 28 Jun 2019 15:41:31 +0200 Subject: [PATCH 24/26] Remove/improve unused variables and weird doc comments Signed-off-by: beorn7 --- documentation/prometheus-mixin/config.libsonnet | 5 +++-- documentation/prometheus-mixin/dashboards.libsonnet | 2 -- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/documentation/prometheus-mixin/config.libsonnet b/documentation/prometheus-mixin/config.libsonnet index bb4099132a..27614289ab 100644 --- a/documentation/prometheus-mixin/config.libsonnet +++ b/documentation/prometheus-mixin/config.libsonnet @@ -1,8 +1,9 @@ { _config+:: { - // Selectors are inserted between {} in Prometheus queries. + // prometheusSelector is inserted as part of the label selector in + // PromQL queries to identify metrics collected from Prometheus + // servers. prometheusSelector: 'job="prometheus"', - alertmanagerSelector: 'job="alertmanager"', // prometheusName is inserted into annotations to name the Prometheus // instance affected by the alert. diff --git a/documentation/prometheus-mixin/dashboards.libsonnet b/documentation/prometheus-mixin/dashboards.libsonnet index c7df870132..cc4dba2b80 100644 --- a/documentation/prometheus-mixin/dashboards.libsonnet +++ b/documentation/prometheus-mixin/dashboards.libsonnet @@ -6,8 +6,6 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.dashboard('Prometheus') .addMultiTemplate('job', 'prometheus_build_info', 'job') .addMultiTemplate('instance', 'prometheus_build_info', 'instance') - # Prometheus is quite commonly configured with honor_labels set to true; - # therefore job and instance is not the prometheus server in many queries! 
.addRow( g.row('Prometheus Stats') .addPanel( From 9a2177949d936de7233af6149985dae4b3376c81 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Fri, 28 Jun 2019 16:46:19 +0200 Subject: [PATCH 25/26] Protect gauge-based alerts against failed scrapes Signed-off-by: beorn7 --- .../prometheus-mixin/alerts.libsonnet | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 654f74539e..1d2b68d62a 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -7,9 +7,11 @@ { alert: 'PrometheusBadConfig', expr: ||| - prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0 + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_config_last_reload_successful{%(prometheusSelector)s}[5m]) == 0 ||| % $._config, - 'for': '15m', + 'for': '10m', labels: { severity: 'critical', }, @@ -21,10 +23,12 @@ { alert: 'PrometheusNotificationQueueRunningFull', expr: ||| + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) > - prometheus_notifications_queue_capacity{%(prometheusSelector)s} + min_over_time(prometheus_notifications_queue_capacity{%(prometheusSelector)s}[5m]) ) ||| % $._config, 'for': '15m', @@ -79,7 +83,9 @@ { alert: 'PrometheusNotConnectedToAlertmanagers', expr: ||| - prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1 + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
+ max_over_time(prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s}[5m]) < 1 ||| % $._config, 'for': '10m', labels: { @@ -201,10 +207,12 @@ { alert: 'PrometheusRemoteWriteBehind', expr: ||| + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( - prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s} + max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s}[5m]) - on(job, instance) group_right - prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s} + max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s}[5m]) ) > 120 ||| % $._config, From 4825585834f10f60f55a64a05a03a682f031ab91 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Fri, 28 Jun 2019 17:37:49 +0200 Subject: [PATCH 26/26] Tweak tenses Signed-off-by: beorn7 --- documentation/prometheus-mixin/alerts.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet index 1d2b68d62a..06c5274579 100644 --- a/documentation/prometheus-mixin/alerts.libsonnet +++ b/documentation/prometheus-mixin/alerts.libsonnet @@ -56,7 +56,7 @@ severity: 'warning', }, annotations: { - summary: 'Prometheus encounters more than 1% errors sending alerts to a specific Alertmanager.', + summary: 'Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.', description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' 
% $._config, }, }, @@ -162,7 +162,7 @@ severity: 'warning', }, annotations: { - summary: 'Prometheus drops samples with duplicate timestamps.', + summary: 'Prometheus is dropping samples with duplicate timestamps.', description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with different values but duplicated timestamp.' % $._config, }, }, @@ -235,7 +235,7 @@ severity: 'critical', }, annotations: { - summary: 'Prometheus fails to evaluate rules.', + summary: 'Prometheus is failing rule evaluations.', description: 'Prometheus %(prometheusName)s has failed to evaluate {{ printf "%%.0f" $value }} rules in the last 5m.' % $._config, }, }, @@ -249,7 +249,7 @@ severity: 'warning', }, annotations: { - summary: 'Prometheus misses rule evaluations due to slow rule group evaluation.', + summary: 'Prometheus is missing rule evaluations due to slow rule group evaluation.', description: 'Prometheus %(prometheusName)s has missed {{ printf "%%.0f" $value }} rule group evaluations in the last 5m.' % $._config, }, },