mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-12 14:27:27 -08:00
prometheus-mixin: add HA-group aware alerts
There is certainly a potential to add more of these. This is mostly meant to introduce the concept and cover a few critical parts. Signed-off-by: beorn7 <beorn@grafana.com>
This commit is contained in:
parent
cda52234eb
commit
371ca9ff46
|
@ -60,26 +60,6 @@
|
||||||
description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % $._config,
|
description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % $._config,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
|
||||||
alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
|
|
||||||
expr: |||
|
|
||||||
min without(alertmanager) (
|
|
||||||
rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
|
|
||||||
/
|
|
||||||
rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
|
|
||||||
)
|
|
||||||
* 100
|
|
||||||
> 3
|
|
||||||
||| % $._config,
|
|
||||||
'for': '15m',
|
|
||||||
labels: {
|
|
||||||
severity: 'critical',
|
|
||||||
},
|
|
||||||
annotations: {
|
|
||||||
summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
|
|
||||||
description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
alert: 'PrometheusNotConnectedToAlertmanagers',
|
alert: 'PrometheusNotConnectedToAlertmanagers',
|
||||||
expr: |||
|
expr: |||
|
||||||
|
@ -281,6 +261,123 @@
|
||||||
description: 'Prometheus %(prometheusName)s has dropped {{ printf "%%.0f" $value }} targets because the number of targets exceeded the configured target_limit.' % $._config,
|
description: 'Prometheus %(prometheusName)s has dropped {{ printf "%%.0f" $value }} targets because the number of targets exceeded the configured target_limit.' % $._config,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
] + if $._config.prometheusHAGroupLabels == '' then self.rulesWithoutHA else self.rulesWithHA,
|
||||||
|
rulesWithoutHA:: [
|
||||||
|
{
|
||||||
|
alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
|
||||||
|
expr: |||
|
||||||
|
min without (alertmanager) (
|
||||||
|
rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
|
||||||
|
/
|
||||||
|
rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
|
||||||
|
)
|
||||||
|
* 100
|
||||||
|
> 3
|
||||||
|
||| % $._config,
|
||||||
|
'for': '15m',
|
||||||
|
labels: {
|
||||||
|
severity: 'critical',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
|
||||||
|
description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
rulesWithHA:: [
|
||||||
|
{
|
||||||
|
alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
|
||||||
|
expr: |||
|
||||||
|
min by (%(prometheusHAGroupLabels)s) (
|
||||||
|
rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
|
||||||
|
/
|
||||||
|
rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
|
||||||
|
)
|
||||||
|
* 100
|
||||||
|
> 3
|
||||||
|
||| % $._config,
|
||||||
|
'for': '15m',
|
||||||
|
labels: {
|
||||||
|
severity: 'critical',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'Each Prometheus server in an HA group encounters more than 3% errors sending alerts to any Alertmanager.',
|
||||||
|
description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from any Prometheus server in HA group %(prometheusHAGroupName)s to any Alertmanager.' % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'PrometheusHAGroupNotIngestingSamples',
|
||||||
|
expr: |||
|
||||||
|
max by (%(prometheusHAGroupLabels)s) (
|
||||||
|
rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m])
|
||||||
|
and
|
||||||
|
(
|
||||||
|
sum without(scrape_job) (prometheus_target_metadata_cache_entries{%(prometheusSelector)s}) > 0
|
||||||
|
or
|
||||||
|
sum without(rule_group) (prometheus_rule_group_rules{%(prometheusSelector)s}) > 0
|
||||||
|
)
|
||||||
|
)
|
||||||
|
<= 0
|
||||||
|
||| % $._config,
|
||||||
|
'for': '10m',
|
||||||
|
labels: {
|
||||||
|
severity: 'critical',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'A whole Prometheus HA group is not ingesting samples.',
|
||||||
|
description: 'None of the Prometheus instances in HA group %(prometheusHAGroupName)s is ingesting any samples.' % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// Both the following critical alerts, PrometheusHAGroupDown and
|
||||||
|
// PrometheusHAGroupCrashlooping, fire if more than half of an HA group is
|
||||||
|
// unhealthy. It is implied that a generic warning alert is in place
|
||||||
|
// for individual instances being down or crashlooping.
|
||||||
|
{
|
||||||
|
alert: 'PrometheusHAGroupDown',
|
||||||
|
expr: |||
|
||||||
|
(
|
||||||
|
count by (%(prometheusHAGroupLabels)s) (
|
||||||
|
avg_over_time(up{%(prometheusSelector)s}[5m]) < 0.5
|
||||||
|
)
|
||||||
|
/
|
||||||
|
count by (%(prometheusHAGroupLabels)s) (
|
||||||
|
up{%(prometheusSelector)s}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
> 0.5
|
||||||
|
||| % $._config,
|
||||||
|
'for': '5m',
|
||||||
|
labels: {
|
||||||
|
severity: 'critical',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'More than half of the Prometheus instances within the same HA group are down.',
|
||||||
|
description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have been up for less than half of the last 5m.' % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'PrometheusHAGroupCrashlooping',
|
||||||
|
expr: |||
|
||||||
|
(
|
||||||
|
count by (%(prometheusHAGroupLabels)s) (
|
||||||
|
changes(process_start_time_seconds{%(prometheusSelector)s}[30m]) > 4
|
||||||
|
)
|
||||||
|
/
|
||||||
|
count by (%(prometheusHAGroupLabels)s) (
|
||||||
|
up{%(prometheusSelector)s}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
> 0.5
|
||||||
|
||| % $._config,
|
||||||
|
'for': '5m',
|
||||||
|
labels: {
|
||||||
|
severity: 'critical',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'More than half of the Prometheus instances within the same HA group are crashlooping.',
|
||||||
|
description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have restarted at least 5 times in the last 30m.' % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
|
@ -5,6 +5,16 @@
|
||||||
// servers.
|
// servers.
|
||||||
prometheusSelector: 'job="prometheus"',
|
prometheusSelector: 'job="prometheus"',
|
||||||
|
|
||||||
|
// prometheusHAGroupLabels is a string with comma-separated labels
|
||||||
|
// that are common labels of instances belonging to the same
|
||||||
|
// high-availability group of Prometheus servers, i.e. identically
|
||||||
|
// configured Prometheus servers. Include not only enough labels
|
||||||
|
// to identify the members of the HA group, but also all common
|
||||||
|
// labels you want to keep for resulting HA-group-level alerts.
|
||||||
|
//
|
||||||
|
// If this is set to an empty string, no HA-related alerts are applied.
|
||||||
|
prometheusHAGroupLabels: '',
|
||||||
|
|
||||||
// prometheusName is inserted into annotations to name the Prometheus
|
// prometheusName is inserted into annotations to name the Prometheus
|
||||||
// instance affected by the alert.
|
// instance affected by the alert.
|
||||||
prometheusName: '{{$labels.instance}}',
|
prometheusName: '{{$labels.instance}}',
|
||||||
|
@ -12,5 +22,10 @@
|
||||||
// Operator, you can make use of the configured target labels for
|
// Operator, you can make use of the configured target labels for
|
||||||
// nicer naming:
|
// nicer naming:
|
||||||
// prometheusName: '{{$labels.namespace}}/{{$labels.pod}}'
|
// prometheusName: '{{$labels.namespace}}/{{$labels.pod}}'
|
||||||
|
|
||||||
|
// prometheusHAGroupName is inserted into annotations to name an
|
||||||
|
// HA group. All labels used here must also be present in
|
||||||
|
// prometheusHAGroupLabels above.
|
||||||
|
prometheusHAGroupName: '{{$labels.job}}',
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue