mirror of
https://github.com/prometheus/node_exporter.git
synced 2024-12-28 15:09:45 -08:00
Merge pull request #2644 from v-zhuravlev/mixin_alerts
Mixin: Add and update alerts
This commit is contained in:
commit
ed57c15e2c
|
@ -21,7 +21,7 @@
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Filesystem is predicted to run out of space within the next 24 hours.',
|
summary: 'Filesystem is predicted to run out of space within the next 24 hours.',
|
||||||
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.',
|
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -41,7 +41,7 @@
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Filesystem is predicted to run out of space within the next 4 hours.',
|
summary: 'Filesystem is predicted to run out of space within the next 4 hours.',
|
||||||
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.',
|
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -59,7 +59,7 @@
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Filesystem has less than %(fsSpaceAvailableWarningThreshold)d%% space left.' % $._config,
|
summary: 'Filesystem has less than %(fsSpaceAvailableWarningThreshold)d%% space left.' % $._config,
|
||||||
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
|
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -77,7 +77,7 @@
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Filesystem has less than %(fsSpaceAvailableCriticalThreshold)d%% space left.' % $._config,
|
summary: 'Filesystem has less than %(fsSpaceAvailableCriticalThreshold)d%% space left.' % $._config,
|
||||||
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
|
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -97,7 +97,7 @@
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.',
|
summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.',
|
||||||
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.',
|
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -117,7 +117,7 @@
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.',
|
summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.',
|
||||||
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.',
|
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -135,7 +135,7 @@
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Filesystem has less than 5% inodes left.',
|
summary: 'Filesystem has less than 5% inodes left.',
|
||||||
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
|
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -153,13 +153,13 @@
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Filesystem has less than 3% inodes left.',
|
summary: 'Filesystem has less than 3% inodes left.',
|
||||||
description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
|
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
alert: 'NodeNetworkReceiveErrs',
|
alert: 'NodeNetworkReceiveErrs',
|
||||||
expr: |||
|
expr: |||
|
||||||
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
|
rate(node_network_receive_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_receive_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
'for': '1h',
|
'for': '1h',
|
||||||
labels: {
|
labels: {
|
||||||
|
@ -173,7 +173,7 @@
|
||||||
{
|
{
|
||||||
alert: 'NodeNetworkTransmitErrs',
|
alert: 'NodeNetworkTransmitErrs',
|
||||||
expr: |||
|
expr: |||
|
||||||
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
|
rate(node_network_transmit_errs_total{%(nodeExporterSelector)s}[2m]) / rate(node_network_transmit_packets_total{%(nodeExporterSelector)s}[2m]) > 0.01
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
'for': '1h',
|
'for': '1h',
|
||||||
labels: {
|
labels: {
|
||||||
|
@ -187,7 +187,7 @@
|
||||||
{
|
{
|
||||||
alert: 'NodeHighNumberConntrackEntriesUsed',
|
alert: 'NodeHighNumberConntrackEntriesUsed',
|
||||||
expr: |||
|
expr: |||
|
||||||
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
|
(node_nf_conntrack_entries{%(nodeExporterSelector)s} / node_nf_conntrack_entries_limit) > 0.75
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Number of conntrack entries is getting close to the limit.',
|
summary: 'Number of conntrack entries is getting close to the limit.',
|
||||||
|
@ -204,7 +204,7 @@
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Node Exporter text file collector failed to scrape.',
|
summary: 'Node Exporter text file collector failed to scrape.',
|
||||||
description: 'Node Exporter text file collector failed to scrape.',
|
description: 'Node Exporter text file collector on {{ $labels.instance }} failed to scrape.',
|
||||||
},
|
},
|
||||||
labels: {
|
labels: {
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
|
@ -231,7 +231,7 @@
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Clock skew detected.',
|
summary: 'Clock skew detected.',
|
||||||
description: 'Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.',
|
description: 'Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -247,7 +247,7 @@
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Clock not synchronising.',
|
summary: 'Clock not synchronising.',
|
||||||
description: 'Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.',
|
description: 'Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -260,8 +260,8 @@
|
||||||
severity: 'critical',
|
severity: 'critical',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'RAID Array is degraded',
|
summary: 'RAID Array is degraded.',
|
||||||
description: "RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.",
|
description: "RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -273,8 +273,8 @@
|
||||||
severity: 'warning',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Failed device in RAID array',
|
summary: 'Failed device in RAID array.',
|
||||||
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.",
|
description: "At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -309,6 +309,104 @@
|
||||||
description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.',
|
description: 'File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.',
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
alert: 'NodeCPUHighUsage',
|
||||||
|
expr: |||
|
||||||
|
sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d
|
||||||
|
||| % $._config,
|
||||||
|
'for': '15m',
|
||||||
|
labels: {
|
||||||
|
severity: 'info',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'High CPU usage.',
|
||||||
|
description: |||
|
||||||
|
CPU usage at {{ $labels.instance }} has been above %(cpuHighUsageThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%.
|
||||||
|
||| % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'NodeSystemSaturation',
|
||||||
|
expr: |||
|
||||||
|
node_load1{%(nodeExporterSelector)s}
|
||||||
|
/ count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > %(systemSaturationPerCoreThreshold)d
|
||||||
|
||| % $._config,
|
||||||
|
'for': '15m',
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'System saturated, load per core is very high.',
|
||||||
|
description: |||
|
||||||
|
System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}.
|
||||||
|
This might indicate resource saturation on this instance and can cause it to become unresponsive.
|
||||||
|
||| % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'NodeMemoryMajorPagesFaults',
|
||||||
|
expr: |||
|
||||||
|
rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > %(memoryMajorPagesFaultsThreshold)d
|
||||||
|
||| % $._config,
|
||||||
|
'for': '15m',
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'Memory major page faults are occurring at very high rate.',
|
||||||
|
description: |||
|
||||||
|
Memory major page faults are occurring at a very high rate at {{ $labels.instance }}: the rate has been above %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes and is currently at {{ printf "%%.2f" $value }}.
|
||||||
|
Please check that there is enough memory available at this instance.
|
||||||
|
||| % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'NodeMemoryHighUtilization',
|
||||||
|
expr: |||
|
||||||
|
100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)d
|
||||||
|
||| % $._config,
|
||||||
|
'for': '15m',
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'Host is running out of memory.',
|
||||||
|
description: |||
|
||||||
|
Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%.
|
||||||
|
||| % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'NodeDiskIOSaturation',
|
||||||
|
expr: |||
|
||||||
|
rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[5m]) > %(diskIOSaturationThreshold)d
|
||||||
|
||| % $._config,
|
||||||
|
'for': '30m',
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'Disk IO queue is high.',
|
||||||
|
description: |||
|
||||||
|
Disk IO queue (aqu-sz) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 30 minutes, is currently at {{ printf "%%.2f" $value }}.
|
||||||
|
This symptom might indicate disk saturation.
|
||||||
|
||| % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'NodeSystemdServiceFailed',
|
||||||
|
expr: |||
|
||||||
|
node_systemd_unit_state{%(nodeExporterSelector)s, state="failed"} == 1
|
||||||
|
||| % $._config,
|
||||||
|
'for': '5m',
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'Systemd service has entered failed state.',
|
||||||
|
description: 'Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}.',
|
||||||
|
},
|
||||||
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
|
@ -43,6 +43,13 @@
|
||||||
// just a warning for K8s nodes.
|
// just a warning for K8s nodes.
|
||||||
nodeCriticalSeverity: 'critical',
|
nodeCriticalSeverity: 'critical',
|
||||||
|
|
||||||
|
// CPU utilization (%) on which to trigger the
|
||||||
|
// 'NodeCPUHighUsage' alert.
|
||||||
|
cpuHighUsageThreshold: 90,
|
||||||
|
// Load average 1m (per core) on which to trigger the
|
||||||
|
// 'NodeSystemSaturation' alert.
|
||||||
|
systemSaturationPerCoreThreshold: 2,
|
||||||
|
|
||||||
// Available disk space (%) thresholds on which to trigger the
|
// Available disk space (%) thresholds on which to trigger the
|
||||||
// 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
|
// 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
|
||||||
// usage grows in a way that it is predicted to run out in 4h or 1d
|
// usage grows in a way that it is predicted to run out in 4h or 1d
|
||||||
|
@ -60,6 +67,18 @@
|
||||||
fsSpaceAvailableWarningThreshold: 5,
|
fsSpaceAvailableWarningThreshold: 5,
|
||||||
fsSpaceAvailableCriticalThreshold: 3,
|
fsSpaceAvailableCriticalThreshold: 3,
|
||||||
|
|
||||||
|
// Memory utilization (%) level on which to trigger the
|
||||||
|
// 'NodeMemoryHighUtilization' alert.
|
||||||
|
memoryHighUtilizationThreshold: 90,
|
||||||
|
|
||||||
|
// Threshold for the rate of memory major page faults to trigger
|
||||||
|
// 'NodeMemoryMajorPagesFaults' alert.
|
||||||
|
memoryMajorPagesFaultsThreshold: 500,
|
||||||
|
|
||||||
|
// Disk IO queue level above which to trigger
|
||||||
|
// 'NodeDiskIOSaturation' alert.
|
||||||
|
diskIOSaturationThreshold: 10,
|
||||||
|
|
||||||
rateInterval: '5m',
|
rateInterval: '5m',
|
||||||
// Opt-in for multi-cluster support.
|
// Opt-in for multi-cluster support.
|
||||||
showMultiCluster: false,
|
showMultiCluster: false,
|
||||||
|
|
Loading…
Reference in a new issue