mirror of
https://github.com/prometheus/node_exporter.git
synced 2025-03-05 21:00:12 -08:00
Address review comments, batch 2
Signed-off-by: beorn7 <beorn@grafana.com>
This commit is contained in:
parent
3ab1f41d12
commit
a92d1d7889
|
@@ -43,7 +43,7 @@
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
alert: 'NodeFilesystemOutOfSpace',
|
alert: 'NodeFilesystemAlmostOutOfSpace',
|
||||||
expr: |||
|
expr: |||
|
||||||
(
|
(
|
||||||
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
|
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
|
||||||
|
@@ -60,7 +60,7 @@
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
alert: 'NodeFilesystemOutOfSpace',
|
alert: 'NodeFilesystemAlmostOutOfSpace',
|
||||||
expr: |||
|
expr: |||
|
||||||
(
|
(
|
||||||
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
|
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
|
||||||
|
@@ -115,7 +115,7 @@
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
alert: 'NodeFilesystemOutOfFiles',
|
alert: 'NodeFilesystemAlmostOutOfFiles',
|
||||||
expr: |||
|
expr: |||
|
||||||
(
|
(
|
||||||
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
|
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
|
||||||
|
@@ -132,7 +132,7 @@
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
alert: 'NodeFilesystemOutOfSpace',
|
alert: 'NodeFilesystemAlmostOutOfFiles',
|
||||||
expr: |||
|
expr: |||
|
||||||
(
|
(
|
||||||
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
|
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
|
||||||
|
@@ -155,7 +155,7 @@
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
'for': '1h',
|
'for': '1h',
|
||||||
labels: {
|
labels: {
|
||||||
severity: 'critical',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).',
|
message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).',
|
||||||
|
@@ -168,7 +168,7 @@
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
'for': '1h',
|
'for': '1h',
|
||||||
labels: {
|
labels: {
|
||||||
severity: 'critical',
|
severity: 'warning',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).',
|
message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).',
|
||||||
|
|
|
@@ -3,10 +3,11 @@
|
||||||
// Selectors are inserted between {} in Prometheus queries.
|
// Selectors are inserted between {} in Prometheus queries.
|
||||||
|
|
||||||
// Select the metrics coming from the node exporter.
|
// Select the metrics coming from the node exporter.
|
||||||
nodeExporterSelector: 'job="node-exporter"',
|
nodeExporterSelector: 'job="node"',
|
||||||
|
|
||||||
// Select the fstype for filesystem-related queries.
|
// Select the fstype for filesystem-related queries.
|
||||||
fsSelector: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"',
|
// TODO: What is a good default selector here?
|
||||||
|
fsSelector: 'fstype=~"ext.|xfs|jfs|btrfs|vfat|ntfs"',
|
||||||
|
|
||||||
// Select the device for disk-related queries.
|
// Select the device for disk-related queries.
|
||||||
diskDeviceSelector: 'device=~"(sd|xvd).+"',
|
diskDeviceSelector: 'device=~"(sd|xvd).+"',
|
||||||
|
|
|
@@ -20,8 +20,9 @@ local gauge = promgrafonnet.gauge;
|
||||||
min=0,
|
min=0,
|
||||||
)
|
)
|
||||||
.addTarget(prometheus.target(
|
.addTarget(prometheus.target(
|
||||||
|
// TODO: Consider using `${__interval}` as range and a 1m min step.
|
||||||
|||
|
|||
|
||||||
1 - avg by (cpu) (irate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
|
1 - avg by (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
legendFormat='{{cpu}}',
|
legendFormat='{{cpu}}',
|
||||||
intervalFactor=10,
|
intervalFactor=10,
|
||||||
|
@@ -81,9 +82,10 @@ local gauge = promgrafonnet.gauge;
|
||||||
datasource='$datasource',
|
datasource='$datasource',
|
||||||
span=9,
|
span=9,
|
||||||
)
|
)
|
||||||
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read'))
|
// TODO: Consider using `${__interval}` as range and a 1m min step.
|
||||||
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written'))
|
.addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} read'))
|
||||||
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) +
|
.addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} written'))
|
||||||
|
.addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} io time')) +
|
||||||
{
|
{
|
||||||
seriesOverrides: [
|
seriesOverrides: [
|
||||||
{
|
{
|
||||||
|
@@ -122,7 +124,8 @@ local gauge = promgrafonnet.gauge;
|
||||||
span=6,
|
span=6,
|
||||||
format='bytes',
|
format='bytes',
|
||||||
)
|
)
|
||||||
.addTarget(prometheus.target('irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
|
// TODO: Consider using `${__interval}` as range and a 1m min step.
|
||||||
|
.addTarget(prometheus.target('rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
|
||||||
|
|
||||||
local networkTransmitted =
|
local networkTransmitted =
|
||||||
graphPanel.new(
|
graphPanel.new(
|
||||||
|
@@ -131,7 +134,8 @@ local gauge = promgrafonnet.gauge;
|
||||||
span=6,
|
span=6,
|
||||||
format='bytes',
|
format='bytes',
|
||||||
)
|
)
|
||||||
.addTarget(prometheus.target('irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
|
// TODO: Consider using `${__interval}` as range and a 1m min step.
|
||||||
|
.addTarget(prometheus.target('rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
|
||||||
|
|
||||||
dashboard.new('Nodes', time_from='now-1h')
|
dashboard.new('Nodes', time_from='now-1h')
|
||||||
.addTemplate(
|
.addTemplate(
|
||||||
|
|
|
@@ -12,7 +12,7 @@ local g = import 'grafana-builder/grafana.libsonnet';
|
||||||
g.panel('CPU Utilisation') +
|
g.panel('CPU Utilisation') +
|
||||||
g.queryPanel(|||
|
g.queryPanel(|||
|
||||||
(
|
(
|
||||||
instance:node_cpu_utilisation:avg1m
|
instance:node_cpu_utilisation:avg_rate1m
|
||||||
*
|
*
|
||||||
instance:node_num_cpu:sum
|
instance:node_num_cpu:sum
|
||||||
/ ignoring (instance) group_left
|
/ ignoring (instance) group_left
|
||||||
|
@@ -60,9 +60,9 @@ local g = import 'grafana-builder/grafana.libsonnet';
|
||||||
// 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
|
// 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
|
||||||
g.queryPanel(|||
|
g.queryPanel(|||
|
||||||
(
|
(
|
||||||
instance:node_disk_utilisation:sum_irate
|
instance:node_disk_io_time:sum_rate1m
|
||||||
/ ignoring (instance) group_left
|
/ ignoring (instance) group_left
|
||||||
count without (instance) (instance:node_disk_utilisation:sum_irate)
|
count without (instance) (instance:node_disk_io_time:sum_rate1m)
|
||||||
)
|
)
|
||||||
|||, '{{instance}}', legendLink) +
|
|||, '{{instance}}', legendLink) +
|
||||||
g.stack +
|
g.stack +
|
||||||
|
@@ -72,9 +72,9 @@ local g = import 'grafana-builder/grafana.libsonnet';
|
||||||
g.panel('Disk IO Saturation') +
|
g.panel('Disk IO Saturation') +
|
||||||
g.queryPanel(|||
|
g.queryPanel(|||
|
||||||
(
|
(
|
||||||
instance:node_disk_saturation:sum_irate
|
instance:node_disk_io_time_weighted:sum_rate1m
|
||||||
/ ignoring (instance) group_left
|
/ ignoring (instance) group_left
|
||||||
count without (instance) (instance:node_disk_saturation:sum_irate)
|
count without (instance) (instance:node_disk_io_time_weighted:sum_rate1m)
|
||||||
)
|
)
|
||||||
|||, '{{instance}}', legendLink) +
|
|||, '{{instance}}', legendLink) +
|
||||||
g.stack +
|
g.stack +
|
||||||
|
@@ -127,7 +127,7 @@ local g = import 'grafana-builder/grafana.libsonnet';
|
||||||
g.row('CPU')
|
g.row('CPU')
|
||||||
.addPanel(
|
.addPanel(
|
||||||
g.panel('CPU Utilisation') +
|
g.panel('CPU Utilisation') +
|
||||||
g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') +
|
g.queryPanel('instance:node_cpu_utilisation:avg_rate1m{instance="$instance"}', 'Utilisation') +
|
||||||
{ yaxes: g.yaxes('percentunit') },
|
{ yaxes: g.yaxes('percentunit') },
|
||||||
)
|
)
|
||||||
.addPanel(
|
.addPanel(
|
||||||
|
@@ -145,7 +145,7 @@ local g = import 'grafana-builder/grafana.libsonnet';
|
||||||
)
|
)
|
||||||
.addPanel(
|
.addPanel(
|
||||||
g.panel('Memory Saturation (pages swapped per second)') +
|
g.panel('Memory Saturation (pages swapped per second)') +
|
||||||
g.queryPanel('instance:node_memory_swap_io_pages:sum_rate{instance="$instance"}', 'Swap IO') +
|
g.queryPanel('instance:node_memory_swap_io_pages:rate1m{instance="$instance"}', 'Swap IO') +
|
||||||
{ yaxes: g.yaxes('short') },
|
{ yaxes: g.yaxes('short') },
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
@@ -153,26 +153,32 @@ local g = import 'grafana-builder/grafana.libsonnet';
|
||||||
g.row('Disk')
|
g.row('Disk')
|
||||||
.addPanel(
|
.addPanel(
|
||||||
g.panel('Disk IO Utilisation') +
|
g.panel('Disk IO Utilisation') +
|
||||||
g.queryPanel('instance:node_disk_utilisation:sum_irate{instance="$instance"}', 'Utilisation') +
|
g.queryPanel('instance:node_disk_io_time:sum_rate1m{instance="$instance"}', 'Utilisation') +
|
||||||
{ yaxes: g.yaxes('percentunit') },
|
{ yaxes: g.yaxes('percentunit') },
|
||||||
)
|
)
|
||||||
.addPanel(
|
.addPanel(
|
||||||
g.panel('Disk IO Saturation') +
|
g.panel('Disk IO Saturation') +
|
||||||
g.queryPanel('instance:node_disk_saturation:sum_irate{instance="$instance"}', 'Saturation') +
|
g.queryPanel('instance:node_disk_io_time_weighted:sum_rate1m{instance="$instance"}', 'Saturation') +
|
||||||
{ yaxes: g.yaxes('percentunit') },
|
{ yaxes: g.yaxes('percentunit') },
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
.addRow(
|
.addRow(
|
||||||
g.row('Net')
|
g.row('Net')
|
||||||
.addPanel(
|
.addPanel(
|
||||||
g.panel('Net Utilisation (Transmitted)') +
|
g.panel('Net Utilisation (Bytes Receive/Transmit)') +
|
||||||
g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') +
|
g.queryPanel(
|
||||||
|
['node_network_receive_bytes_total{instance="$instance"}', '-node_network_transmit_bytes_total{instance="$instance"}'],
|
||||||
|
['Receive', 'Transmit'],
|
||||||
|
) +
|
||||||
{ yaxes: g.yaxes('Bps') },
|
{ yaxes: g.yaxes('Bps') },
|
||||||
)
|
)
|
||||||
.addPanel(
|
.addPanel(
|
||||||
g.panel('Net Saturation (Dropped)') +
|
g.panel('Net Saturation (Drops Receive/Transmit)') +
|
||||||
g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') +
|
g.queryPanel(
|
||||||
{ yaxes: g.yaxes('Bps') },
|
['node_network_receive_drop_total{instance="$instance"}', '-node_network_transmit_drop_total{instance="$instance"}'],
|
||||||
|
['Receive drops', 'Transmit drops'],
|
||||||
|
) +
|
||||||
|
{ yaxes: g.yaxes('rps') },
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
.addRow(
|
.addRow(
|
||||||
|
|
|
@@ -17,7 +17,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
// CPU utilisation is % CPU is not idle.
|
// CPU utilisation is % CPU is not idle.
|
||||||
record: 'instance:node_cpu_utilisation:avg1m',
|
record: 'instance:node_cpu_utilisation:avg_rate1m',
|
||||||
expr: |||
|
expr: |||
|
||||||
1 - avg without (cpu, mode) (
|
1 - avg without (cpu, mode) (
|
||||||
rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m])
|
rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m])
|
||||||
|
@@ -48,7 +48,7 @@
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
record: 'instance:node_memory_swap_io_pages:sum_rate',
|
record: 'instance:node_memory_swap_io_pages:rate1m',
|
||||||
expr: |||
|
expr: |||
|
||||||
(
|
(
|
||||||
rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
|
rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
|
||||||
|
@@ -58,42 +58,54 @@
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
// Disk utilisation (ms spent, 1 second irate())
|
// Disk utilisation (seconds spent, 1 second rate)
|
||||||
record: 'instance:node_disk_utilisation:sum_irate',
|
record: 'instance:node_disk_io_time:sum_rate1m',
|
||||||
expr: |||
|
expr: |||
|
||||||
sum without (device) (
|
sum without (device) (
|
||||||
irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])
|
rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])
|
||||||
)
|
)
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
// Disk saturation (ms spent, by rate() it's bound by 1 second)
|
// Disk saturation (weighted seconds spent, 1 second rate)
|
||||||
record: 'instance:node_disk_saturation:sum_irate',
|
record: 'instance:node_disk_io_time_weighted:sum_rate1m',
|
||||||
expr: |||
|
expr: |||
|
||||||
sum without (device) (
|
sum without (device) (
|
||||||
irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])
|
rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])
|
||||||
)
|
)
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
// TODO: For the following two rules, consider configurable filtering to exclude more network
|
// TODO: For the following rules, consider configurable filtering to exclude more network
|
||||||
// device names than just "lo".
|
// device names than just "lo".
|
||||||
{
|
{
|
||||||
record: 'instance:node_net_utilisation:sum_irate',
|
record: 'instance:node_network_receive_bytes:sum_rate1m',
|
||||||
expr: |||
|
expr: |||
|
||||||
sum without (device) (
|
sum without (device) (
|
||||||
irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||||||
+
|
|
||||||
irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
|
||||||
)
|
)
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
record: 'instance:node_net_saturation:sum_irate',
|
record: 'instance:node_network_transmit_bytes:sum_rate1m',
|
||||||
expr: |||
|
expr: |||
|
||||||
sum without (device) (
|
sum without (device) (
|
||||||
irate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||||||
+
|
)
|
||||||
irate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||| % $._config,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
record: 'instance:node_network_receive_drop:sum_rate1m',
|
||||||
|
expr: |||
|
||||||
|
sum without (device) (
|
||||||
|
rate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||||||
|
)
|
||||||
|
||| % $._config,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
record: 'instance:node_network_transmit_drop:sum_rate1m',
|
||||||
|
expr: |||
|
||||||
|
sum without (device) (
|
||||||
|
rate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||||||
)
|
)
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
|
|
Loading…
Reference in a new issue