mirror of
https://github.com/prometheus/prometheus.git
synced 2025-01-26 05:01:23 -08:00
prometheus-mixin: add HA-group aware alerts
There is certainly a potential to add more of these. This is mostly meant to introduce the concept and cover a few critical parts. Signed-off-by: beorn7 <beorn@grafana.com>
This commit is contained in:
parent
cda52234eb
commit
371ca9ff46
|
@ -60,26 +60,6 @@
|
|||
description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % $._config,
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
|
||||
expr: |||
|
||||
min without(alertmanager) (
|
||||
rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
|
||||
)
|
||||
* 100
|
||||
> 3
|
||||
||| % $._config,
|
||||
'for': '15m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
|
||||
description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config,
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'PrometheusNotConnectedToAlertmanagers',
|
||||
expr: |||
|
||||
|
@ -281,6 +261,123 @@
|
|||
description: 'Prometheus %(prometheusName)s has dropped {{ printf "%%.0f" $value }} targets because the number of targets exceeded the configured target_limit.' % $._config,
|
||||
},
|
||||
},
|
||||
] + if $._config.prometheusHAGroupLabels == '' then self.rulesWithoutHA else self.rulesWithHA,
|
||||
rulesWithoutHA:: [
|
||||
{
|
||||
alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
|
||||
expr: |||
|
||||
min without (alertmanager) (
|
||||
rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
|
||||
)
|
||||
* 100
|
||||
> 3
|
||||
||| % $._config,
|
||||
'for': '15m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
|
||||
description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config,
|
||||
},
|
||||
},
|
||||
],
|
||||
rulesWithHA:: [
|
||||
{
|
||||
alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
|
||||
expr: |||
|
||||
min by (%(prometheusHAGroupLabels)s) (
|
||||
rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
|
||||
/
|
||||
rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
|
||||
)
|
||||
* 100
|
||||
> 3
|
||||
||| % $._config,
|
||||
'for': '15m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Each Prometheus server in an HA group encounters more than 3% errors sending alerts to any Alertmanager.',
|
||||
description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from any Prometheus server in HA group %(prometheusHAGroupName)s to any Alertmanager.' % $._config,
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'PrometheusHAGroupNotIngestingSamples',
|
||||
expr: |||
|
||||
max by (%(prometheusHAGroupLabels)s) (
|
||||
rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m])
|
||||
and
|
||||
(
|
||||
sum without(scrape_job) (prometheus_target_metadata_cache_entries{%(prometheusSelector)s}) > 0
|
||||
or
|
||||
sum without(rule_group) (prometheus_rule_group_rules{%(prometheusSelector)s}) > 0
|
||||
)
|
||||
)
|
||||
<= 0
|
||||
||| % $._config,
|
||||
'for': '10m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'A whole Prometheus HA group is not ingesting samples.',
|
||||
description: 'None of the Prometheus instances in HA group %(prometheusHAGroupName)s is ingesting any samples.' % $._config,
|
||||
},
|
||||
},
|
||||
// Both the following critical alerts, PrometheusHAGroupDown and
|
||||
// PrometheusHAGroupCrashlooping, fire if more than half of an HA group is
|
||||
// unhealthy. It is implied that a generic warning alert is in place
|
||||
// for individual instances being down or crashlooping.
|
||||
{
|
||||
alert: 'PrometheusHAGroupDown',
|
||||
expr: |||
|
||||
(
|
||||
count by (%(prometheusHAGroupLabels)s) (
|
||||
avg_over_time(up{%(prometheusSelector)s}[5m]) < 0.5
|
||||
)
|
||||
/
|
||||
count by (%(prometheusHAGroupLabels)s) (
|
||||
up{%(prometheusSelector)s}
|
||||
)
|
||||
)
|
||||
> 0.5
|
||||
||| % $._config,
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'More than half of the Prometheus instances within the same HA group are down.',
|
||||
description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have been up for less than half of the last 5m.' % $._config,
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'PrometheusHAGroupCrashlooping',
|
||||
expr: |||
|
||||
(
|
||||
count by (%(prometheusHAGroupLabels)s) (
|
||||
changes(process_start_time_seconds{%(prometheusSelector)s}[30m]) > 4
|
||||
)
|
||||
/
|
||||
count by (%(prometheusHAGroupLabels)s) (
|
||||
up{%(prometheusSelector)s}
|
||||
)
|
||||
)
|
||||
> 0.5
|
||||
||| % $._config,
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'More than half of the Prometheus instances within the same HA group are crashlooping.',
|
||||
description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have restarted at least 5 times in the last 30m.' % $._config,
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
|
|
|
@ -5,6 +5,16 @@
|
|||
// servers.
|
||||
prometheusSelector: 'job="prometheus"',
|
||||
|
||||
// prometheusHAGroupLabels is a string with comma-separated labels
|
||||
// that are common labels of instances belonging to the same
|
||||
// high-availability group of Prometheus servers, i.e. identically
|
||||
// configured Prometheus servers. Include not only enough labels
|
||||
// to identify the members of the HA group, but also all common
|
||||
// labels you want to keep for resulting HA-group-level alerts.
|
||||
//
|
||||
// If this is set to an empty string, no HA-related alerts are applied.
|
||||
prometheusHAGroupLabels: '',
|
||||
|
||||
// prometheusName is inserted into annotations to name the Prometheus
|
||||
// instance affected by the alert.
|
||||
prometheusName: '{{$labels.instance}}',
|
||||
|
@ -12,5 +22,10 @@
|
|||
// Operator, you can make use of the configured target labels for
|
||||
// nicer naming:
|
||||
// prometheusName: '{{$labels.namespace}}/{{$labels.pod}}'
|
||||
|
||||
// prometheusHAGroupName is inserted into annotations to name an
|
||||
// HA group. All labels used here must also be present in
|
||||
// prometheusHAGroupLabels above.
|
||||
prometheusHAGroupName: '{{$labels.job}}',
|
||||
},
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue