2018-05-09 10:02:33 -07:00
|
|
|
// Alerting rules for monitoring Prometheus itself.
//
// This object extends the hidden `prometheusAlerts` field with one rule group
// named 'prometheus'. Every PromQL expression is a text block formatted with
// `% $._config`, which substitutes `%(prometheusSelector)s`; `$._config` is
// expected to be supplied by the object this file is mixed into.
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'prometheus',
        rules: [
          // The last configuration reload was reported as unsuccessful.
          {
            alert: 'PrometheusBadConfig',
            expr: |||
              prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Prometheus failed to reload config, see container logs',
            },
          },
          // Linear extrapolation of the notification queue length over the
          // next 60 * 30 seconds exceeds the queue capacity.
          {
            alert: 'PrometheusNotificationQueueRunningFull',
            expr: |||
              predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30)
              >
              prometheus_notifications_queue_capacity{%(prometheusSelector)s}
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: "Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ $labels.pod}}",
            },
          },
          // More than 1% of notifications sent to an Alertmanager fail (warning).
          {
            alert: 'PrometheusErrorSendingAlerts',
            expr: |||
              100 * rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
              /
              rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 1
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              // Label names are case-sensitive in templates: the label is `alertmanager`.
              message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.alertmanager}}',
            },
          },
          // Same ratio as above with a 3% threshold (critical).
          {
            alert: 'PrometheusErrorSendingAlerts',
            expr: |||
              100 * rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
              /
              rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m]) > 3
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              // Label names are case-sensitive in templates: the label is `alertmanager`.
              message: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{ $labels.pod}} to Alertmanager {{$labels.alertmanager}}',
            },
          },
          // No Alertmanager has been discovered.
          {
            alert: 'PrometheusNotConnectedToAlertmanagers',
            expr: |||
              prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: 'Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected to any Alertmanagers',
            },
          },
          // At least one TSDB reload failure within the 2h range window.
          {
            alert: 'PrometheusTSDBReloadsFailing',
            expr: |||
              increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[2h]) > 0
            ||| % $._config,
            'for': '12h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              // The stated window matches the [2h] range used in `expr`.
              message: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} reload failures over the last two hours.',
            },
          },
          // At least one TSDB compaction failure within the 2h range window.
          {
            alert: 'PrometheusTSDBCompactionsFailing',
            expr: |||
              increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[2h]) > 0
            ||| % $._config,
            'for': '12h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              // The stated window matches the [2h] range used in `expr`.
              message: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} compaction failures over the last two hours.',
            },
          },
          // The WAL corruption counter is non-zero.
          // NOTE(review): unlike the other TSDB metrics in this group, this
          // metric name has no `prometheus_` prefix - confirm it matches the
          // name exported by the Prometheus version being monitored.
          {
            alert: 'PrometheusTSDBWALCorruptions',
            expr: |||
              tsdb_wal_corruptions_total{%(prometheusSelector)s} > 0
            ||| % $._config,
            'for': '4h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead log (WAL).',
            },
          },
          // The rate of samples appended to the TSDB head is zero (or lower).
          {
            alert: 'PrometheusNotIngestingSamples',
            expr: |||
              rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: "Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting samples.",
            },
          },
          // Scraped samples were counted as having duplicate timestamps.
          {
            alert: 'PrometheusTargetScrapesDuplicate',
            expr: |||
              increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected due to duplicate timestamps but different values',
            },
          },
          // Failed remote-storage samples as a percentage of failed + succeeded
          // samples exceeds 1.
          {
            alert: 'PrometheusRemoteStorageFailures',
            expr: |||
              (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) * 100)
              /
              (rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[1m]) + rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[1m]))
              > 1
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Prometheus failed to send {{ printf "%.1f" $value }}% samples',
            },
          },
          // The highest timestamp sent by a remote-write queue trails the
          // highest timestamp seen by more than 120 (seconds, per the metric
          // names). Series are matched on (job, instance); group_right allows
          // several right-hand series per left-hand series.
          {
            alert: 'PrometheusRemoteWriteBehind',
            expr: |||
              prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s}
              - on(job, instance) group_right
              prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s}
              > 120
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Prometheus remote write is {{ printf "%.1f" $value }}s behind.',
            },
          },
          // Rule evaluations are failing at a non-zero rate.
          {
            alert: 'PrometheusRuleFailures',
            'for': '15m',
            expr: |||
              rate(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[1m]) > 0
            ||| % $._config,
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: 'Prometheus failed to evaluate {{ printf "%.1f" $value }} rules / s',
            },
          },
        ],
      },
    ],
  },
}
|