Prometheus monitoring mixin for Prometheus itself.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>
This commit is contained in:
Tom Wilkie 2018-05-09 19:02:33 +02:00
parent 11b336e3ca
commit ee1427faad
3 changed files with 101 additions and 0 deletions

View file

@ -0,0 +1,92 @@
// Alerting rules for Prometheus and Alertmanager.
//
// The rules are appended to the hidden `prometheusAlerts` field as a single
// rule group named 'prometheus'. Expressions containing %(...)s placeholders
// are formatted with `$._config`, so `prometheusSelector` and
// `alertmanagerSelector` must be present on the final composed object.
//
// Every rule carries its human-readable text under the `message` annotation.
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'prometheus',
        rules: [
          // Any scrape target whose `up` series is not 1 for 15 minutes.
          {
            alert: 'PromScrapeFailed',
            expr: |||
              up != 1
            |||,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Prometheus failed to scrape a target {{ $labels.job }} / {{ $labels.instance }}',
            },
          },
          // The last Prometheus configuration reload was not successful.
          {
            alert: 'PromBadConfig',
            expr: |||
              prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              // Key must be `message`, the same key every other rule in this group uses.
              message: 'Prometheus failed to reload config, see container logs',
            },
          },
          // The last Alertmanager configuration reload was not successful.
          {
            alert: 'PromAlertmanagerBadConfig',
            expr: |||
              alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Alertmanager failed to reload config, see container logs',
            },
          },
          // Failed Alertmanager notifications in the last 5 minutes, summed per namespace.
          {
            alert: 'PromAlertsFailed',
            expr: |||
              sum(increase(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])) by (namespace) > 0
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Alertmanager failed to send an alert.',
            },
          },
          // Failed remote-storage samples as a percentage of failed + succeeded
          // samples; fires when that percentage stays above 1.
          {
            alert: 'PromRemoteStorageFailures',
            expr: |||
              (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) * 100)
              /
              (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) + rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[1m]))
              > 1
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              // This string is not passed through Jsonnet's % operator, so the
              // literal "%" and the Go-template printf verb are emitted as written.
              message: 'Prometheus failed to send {{ printf "%.1f" $value }}% samples',
            },
          },
          // Non-zero rate of rule evaluation failures.
          {
            alert: 'PromRuleFailures',
            expr: |||
              rate(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[1m]) > 0
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Prometheus failed to evaluate {{ printf "%.1f" $value }} rules / s',
            },
          },
        ],
      },
    ],
  },
}

View file

@ -0,0 +1,7 @@
// Mixin-wide configuration, merged into the hidden `_config` field.
{
  _config+:: {
    // Each selector is a label-matcher list that gets spliced between the
    // braces of a metric selector in a Prometheus query: metric{<selector>}.

    // Label matchers used for Alertmanager metrics.
    alertmanagerSelector: 'job="alertmanager"',

    // Label matchers used for Prometheus metrics.
    prometheusSelector: 'job="prometheus"',
  },
}

View file

@ -0,0 +1,2 @@
// Mixin entry point: combines config.libsonnet and alerts.libsonnet into a
// single object with Jsonnet's object-composition operator (+).
(import 'config.libsonnet') +
(import 'alerts.libsonnet')