diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet
new file mode 100644
index 000000000..1246e1052
--- /dev/null
+++ b/documentation/prometheus-mixin/alerts.libsonnet
@@ -0,0 +1,92 @@
+{
+  prometheusAlerts+:: {  // Hidden field; '+' merges it with any prometheusAlerts already present on the object this is mixed into.
+    groups+: [  // '+' appends this rule group to any inherited groups.
+      {
+        name: 'prometheus',
+        rules: [
+          {
+            alert: 'PromScrapeFailed',  // Any series whose `up` is not 1 for 15m (no job selector is applied).
+            expr: |||
+              up != 1
+            |||,
+            'for': '15m',  // Quoted because `for` is a Jsonnet keyword.
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'Prometheus failed to scrape a target {{ $labels.job }} / {{ $labels.instance }}',
+            },
+          },
+          {
+            alert: 'PromBadConfig',  // Last Prometheus config reload reported as unsuccessful for 15m.
+            expr: |||
+              prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: 'Prometheus failed to reload config, see container logs',
+            },
+          },
+          {
+            alert: 'PromAlertmanagerBadConfig',  // Last Alertmanager config reload reported as unsuccessful for 10m.
+            expr: |||
+              alertmanager_config_last_reload_successful{%(alertmanagerSelector)s} == 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: 'Alertmanager failed to send an alert.',
+            },
+          },
+          {
+            alert: 'PromAlertsFailed',  // Failed-notification counter increased within 5m windows, summed per namespace, for 5m.
+            expr: |||
+              sum(increase(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])) by (namespace) > 0
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: 'Alertmanager failed to send an alert.',
+            },
+          },
+          {
+            alert: 'PromRemoteStorageFailures',  // failed / (failed + succeeded) sample rate, as a percentage, above 1 for 15m.
+            expr: |||
+              (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) * 100)
+                /
+              (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) + rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[1m]))
+                > 1
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: 'Prometheus failed to send {{ printf "%.1f" $value }}% samples',
+            },
+          },
+          {
+            alert: 'PromRuleFailures',  // Non-zero rate of rule evaluation failures for 15m.
+            expr: |||
+              rate(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[1m]) > 0
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: 'Prometheus failed to evaluate {{ printf "%.1f" $value }} rules / s',
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/documentation/prometheus-mixin/config.libsonnet b/documentation/prometheus-mixin/config.libsonnet
new file mode 100644
index 000000000..0e9daa7c0
--- /dev/null
+++ b/documentation/prometheus-mixin/config.libsonnet
@@ -0,0 +1,7 @@
+{
+  _config+:: {
+    // Selectors are inserted between {} in Prometheus queries.
+    prometheusSelector: 'job="prometheus"',
+    alertmanagerSelector: 'job="alertmanager"',
+  },
+}
diff --git a/documentation/prometheus-mixin/mixin.libsonnet b/documentation/prometheus-mixin/mixin.libsonnet
new file mode 100644
index 000000000..95efe331f
--- /dev/null
+++ b/documentation/prometheus-mixin/mixin.libsonnet
@@ -0,0 +1,2 @@
+(import 'config.libsonnet') +
+(import 'alerts.libsonnet')