mirror of
https://github.com/prometheus/prometheus.git
synced 2024-12-26 14:09:41 -08:00
dd174963a2
The counter is only increased when tsdb.Open() is called, which Prometheus does only once in its lifetime (when it initializes). If the corruption can't be recovered, tsdb.Open() returns an error and Prometheus exits. Hence the metric is either 0 (no corruption) or 1 (corruption detected and repaired). In the latter case, the alert isn't actionable, and the only way to resolve it is to restart Prometheus, which would reset the counter. Signed-off-by: Simon Pasquier <spasquie@redhat.com>
247 lines
10 KiB
Plaintext
247 lines
10 KiB
Plaintext
{
|
|
prometheusAlerts+:: {
|
|
groups+: [
|
|
{
|
|
name: 'prometheus',
|
|
rules: [
|
|
{
  // Critical: the maximum of prometheus_config_last_reload_successful over
  // the trailing 5m is 0, sustained for 10m.
  local cfg = $._config,
  alert: 'PrometheusBadConfig',
  'for': '10m',
  labels: { severity: 'critical' },
  expr: |||
    # Without max_over_time, failed scrapes could create false negatives, see
    # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
    max_over_time(prometheus_config_last_reload_successful{%(prometheusSelector)s}[5m]) == 0
  ||| % cfg,
  annotations: {
    summary: 'Failed Prometheus configuration reload.',
    description: 'Prometheus %(prometheusName)s has failed to reload its configuration.' % cfg,
  },
},
|
|
{
  // Warning: a linear extrapolation of the notification queue length 30
  // minutes ahead (60 * 30 seconds) exceeds the lowest queue capacity seen
  // in the trailing 5m, sustained for 15m.
  local cfg = $._config,
  alert: 'PrometheusNotificationQueueRunningFull',
  'for': '15m',
  labels: { severity: 'warning' },
  expr: |||
    # Without min_over_time, failed scrapes could create false negatives, see
    # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
    (
    predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30)
    >
    min_over_time(prometheus_notifications_queue_capacity{%(prometheusSelector)s}[5m])
    )
  ||| % cfg,
  annotations: {
    summary: 'Prometheus alert notification queue predicted to run full in less than 30m.',
    description: 'Alert notification queue of Prometheus %(prometheusName)s is running full.' % cfg,
  },
},
|
|
{
  // Warning: the ratio of notification errors to notifications sent, as a
  // percentage, is above 1 for 15m. The description reads the
  // `alertmanager` label from the resulting series.
  local cfg = $._config,
  alert: 'PrometheusErrorSendingAlertsToSomeAlertmanagers',
  'for': '15m',
  labels: { severity: 'warning' },
  expr: |||
    (
    rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
    /
    rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
    )
    * 100
    > 1
  ||| % cfg,
  annotations: {
    summary: 'Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.',
    description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % cfg,
  },
},
|
|
{
  // Critical: the minimum error ratio across the `alertmanager` label
  // (min without(alertmanager)), as a percentage, is above 3 for 15m.
  local cfg = $._config,
  alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
  'for': '15m',
  labels: { severity: 'critical' },
  expr: |||
    min without(alertmanager) (
    rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
    /
    rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
    )
    * 100
    > 3
  ||| % cfg,
  annotations: {
    summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
    description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % cfg,
  },
},
|
|
{
  // Warning: the highest number of discovered Alertmanagers in the trailing
  // 5m is below 1, sustained for 10m.
  local cfg = $._config,
  alert: 'PrometheusNotConnectedToAlertmanagers',
  'for': '10m',
  labels: { severity: 'warning' },
  expr: |||
    # Without max_over_time, failed scrapes could create false negatives, see
    # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
    max_over_time(prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s}[5m]) < 1
  ||| % cfg,
  annotations: {
    summary: 'Prometheus is not connected to any Alertmanagers.',
    description: 'Prometheus %(prometheusName)s is not connected to any Alertmanagers.' % cfg,
  },
},
|
|
{
  // Warning: the TSDB reload failure counter increased within a 3h window,
  // and that condition has held for 4h.
  local cfg = $._config,
  alert: 'PrometheusTSDBReloadsFailing',
  'for': '4h',
  labels: { severity: 'warning' },
  expr: |||
    increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[3h]) > 0
  ||| % cfg,
  annotations: {
    summary: 'Prometheus has issues reloading blocks from disk.',
    description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} reload failures over the last 3h.' % cfg,
  },
},
|
|
{
  // Warning: the failed-compaction counter increased within a 3h window,
  // and that condition has held for 4h.
  local cfg = $._config,
  alert: 'PrometheusTSDBCompactionsFailing',
  'for': '4h',
  labels: { severity: 'warning' },
  expr: |||
    increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[3h]) > 0
  ||| % cfg,
  annotations: {
    summary: 'Prometheus has issues compacting blocks.',
    description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} compaction failures over the last 3h.' % cfg,
  },
},
|
|
{
  // Warning: the 5m rate of samples appended to the TSDB head is zero or
  // less, sustained for 10m.
  local cfg = $._config,
  alert: 'PrometheusNotIngestingSamples',
  'for': '10m',
  labels: { severity: 'warning' },
  expr: |||
    rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
  ||| % cfg,
  annotations: {
    summary: 'Prometheus is not ingesting samples.',
    description: 'Prometheus %(prometheusName)s is not ingesting samples.' % cfg,
  },
},
|
|
{
  // Warning: the 5m rate of the duplicate-timestamp scrape sample counter is
  // above zero, sustained for 10m.
  local cfg = $._config,
  alert: 'PrometheusDuplicateTimestamps',
  'for': '10m',
  labels: { severity: 'warning' },
  expr: |||
    rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
  ||| % cfg,
  annotations: {
    summary: 'Prometheus is dropping samples with duplicate timestamps.',
    description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with different values but duplicated timestamp.' % cfg,
  },
},
|
|
{
  // Warning: the 5m rate of the out-of-order scrape sample counter is above
  // zero, sustained for 10m.
  local cfg = $._config,
  alert: 'PrometheusOutOfOrderTimestamps',
  'for': '10m',
  labels: { severity: 'warning' },
  expr: |||
    rate(prometheus_target_scrapes_sample_out_of_order_total{%(prometheusSelector)s}[5m]) > 0
  ||| % cfg,
  annotations: {
    summary: 'Prometheus drops samples with out-of-order timestamps.',
    description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with timestamps arriving out of order.' % cfg,
  },
},
|
|
{
  // Critical: failed samples as a percentage of failed + succeeded samples
  // (5m rates) is above 1 for 15m. The description reads the `queue` label
  // from the resulting series.
  local cfg = $._config,
  alert: 'PrometheusRemoteStorageFailures',
  'for': '15m',
  labels: { severity: 'critical' },
  expr: |||
    (
    rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m])
    /
    (
    rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m])
    +
    rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[5m])
    )
    )
    * 100
    > 1
  ||| % cfg,
  annotations: {
    summary: 'Prometheus fails to send samples to remote storage.',
    description: 'Prometheus %(prometheusName)s failed to send {{ printf "%%.1f" $value }}%% of the samples to queue {{$labels.queue}}.' % cfg,
  },
},
|
|
{
  // Critical: the highest ingested timestamp minus the highest timestamp
  // sent by a remote-write queue (both taken as 5m maxima, matched on
  // job and instance) exceeds 120 seconds for 15m.
  local cfg = $._config,
  alert: 'PrometheusRemoteWriteBehind',
  'for': '15m',
  labels: { severity: 'critical' },
  expr: |||
    # Without max_over_time, failed scrapes could create false negatives, see
    # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
    (
    max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s}[5m])
    - on(job, instance) group_right
    max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s}[5m])
    )
    > 120
  ||| % cfg,
  annotations: {
    summary: 'Prometheus remote write is behind.',
    description: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind for queue {{$labels.queue}}.' % cfg,
  },
},
|
|
{
  // Critical: the rule evaluation failure counter increased within the
  // trailing 5m, and that condition has held for 15m.
  local cfg = $._config,
  alert: 'PrometheusRuleFailures',
  'for': '15m',
  labels: { severity: 'critical' },
  expr: |||
    increase(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[5m]) > 0
  ||| % cfg,
  annotations: {
    summary: 'Prometheus is failing rule evaluations.',
    description: 'Prometheus %(prometheusName)s has failed to evaluate {{ printf "%%.0f" $value }} rules in the last 5m.' % cfg,
  },
},
|
|
{
  // Warning: the missed rule-group iteration counter increased within the
  // trailing 5m, and that condition has held for 15m.
  local cfg = $._config,
  alert: 'PrometheusMissingRuleEvaluations',
  'for': '15m',
  labels: { severity: 'warning' },
  expr: |||
    increase(prometheus_rule_group_iterations_missed_total{%(prometheusSelector)s}[5m]) > 0
  ||| % cfg,
  annotations: {
    summary: 'Prometheus is missing rule evaluations due to slow rule group evaluation.',
    description: 'Prometheus %(prometheusName)s has missed {{ printf "%%.0f" $value }} rule group evaluations in the last 5m.' % cfg,
  },
},
|
|
],
|
|
},
|
|
],
|
|
},
|
|
}
|