node_exporter/docs/node-mixin/dashboards/use.libsonnet
Benoît Knecht 5a7b85876d docs/node-mixin: Improve memory pressure rule
The `instance:node_memory_swap_io_pages:rate1m` rule was intended to
measure the amount of memory pressure a system is under, but its name is
a bit misleading (it specifically refers to swap), and the rate of
`node_vmstat_pgmajfault` is a better metric for memory pressure
(see #1524).

This commit renames `instance:node_memory_swap_io_pages:rate1m` to
`instance:node_vmstat_pgmajfault:rate1m`, and defines it as
`rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[1m])`. The
dashboards are updated accordingly.

Signed-off-by: Benoît Knecht <benoit.knecht@fsfe.org>
2019-10-28 15:12:42 +01:00

296 lines
11 KiB
Plaintext

local g = import 'grafana-builder/grafana.libsonnet';
{
grafanaDashboards+:: {
'node-cluster-rsrc-use.json':
local legendLink = '%s/dashboard/file/node-rsrc-use.json' % $._config.grafana_prefix;
g.dashboard('USE Method / Cluster')
.addRow(
g.row('CPU')
.addPanel(
g.panel('CPU Utilisation') +
g.queryPanel(|||
(
instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s}
*
instance:node_num_cpu:sum{%(nodeExporterSelector)s}
/ ignoring (instance) group_left
sum without (instance) (instance:node_num_cpu:sum{%(nodeExporterSelector)s})
)
||| % $._config, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
.addPanel(
// TODO: Is this a useful panel? At least there should be some explanation how load
// average relates to the "CPU saturation" in the title.
g.panel('CPU Saturation (load1 per CPU)') +
g.queryPanel(|||
(
instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
/ ignoring (instance) group_left
count without (instance) (instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})
)
||| % $._config, '{{instance}}', legendLink) +
g.stack +
// TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
)
.addRow(
g.row('Memory')
.addPanel(
g.panel('Memory Utilisation') +
g.queryPanel(|||
(
instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}
/ ignoring (instance) group_left
count without (instance) (instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s})
)
||| % $._config, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
.addPanel(
g.panel('Memory Saturation (Major Page Faults)') +
g.queryPanel('instance:node_vmstat_pgmajfault:rate1m{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes('rps') },
)
)
.addRow(
g.row('Network')
.addPanel(
g.panel('Net Utilisation (Bytes Receive/Transmit)') +
g.queryPanel(
[
'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
'instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
],
['{{instance}} Receive', '{{instance}} Transmit'],
legendLink,
) +
g.stack +
{
yaxes: g.yaxes({ format: 'Bps', min: null }),
seriesOverrides: [
{
alias: '/ Receive/',
stack: 'A',
},
{
alias: '/ Transmit/',
stack: 'B',
transform: 'negative-Y',
},
],
},
)
.addPanel(
g.panel('Net Saturation (Drops Receive/Transmit)') +
g.queryPanel(
[
'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
'instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
],
['{{instance}} Receive', '{{instance}} Transmit'],
legendLink,
) +
g.stack +
{
yaxes: g.yaxes({ format: 'rps', min: null }),
seriesOverrides: [
{
alias: '/ Receive/',
stack: 'A',
},
{
alias: '/ Transmit/',
stack: 'B',
transform: 'negative-Y',
},
],
},
)
)
.addRow(
g.row('Disk IO')
.addPanel(
g.panel('Disk IO Utilisation') +
// Full utilisation would be all disks on each node spending an average of
// 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
// TODO: Does the partition by device make sense? Using the most utilized device per
// instance might make more sense.
g.queryPanel(|||
(
instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}
/ ignoring (instance, device) group_left
count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s})
)
||| % $._config, '{{instance}} {{device}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
.addPanel(
g.panel('Disk IO Saturation') +
g.queryPanel(|||
(
instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}
/ ignoring (instance, device) group_left
count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s})
)
||| % $._config, '{{instance}} {{device}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
)
.addRow(
g.row('Disk Space')
.addPanel(
g.panel('Disk Space Utilisation') +
g.queryPanel(|||
(
sum without (device) (
max without (fstype, mountpoint) (
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
)
)
/ ignoring (instance) group_left
sum without (instance, device) (
max without (fstype, mountpoint) (
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
)
)
)
||| % $._config, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
),
),
'node-rsrc-use.json':
g.dashboard('USE Method / Node')
.addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance')
.addRow(
g.row('CPU')
.addPanel(
g.panel('CPU Utilisation') +
g.queryPanel('instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') +
{
yaxes: g.yaxes('percentunit'),
legend+: { show: false },
},
)
.addPanel(
// TODO: Is this a useful panel? At least there should be some explanation how load
// average relates to the "CPU saturation" in the title.
g.panel('CPU Saturation (Load1 per CPU)') +
g.queryPanel('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') +
{
yaxes: g.yaxes('percentunit'),
legend+: { show: false },
},
)
)
.addRow(
g.row('Memory')
.addPanel(
g.panel('Memory Utilisation') +
g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Memory') +
{ yaxes: g.yaxes('percentunit') },
)
.addPanel(
g.panel('Memory Saturation (Major Page Faults)') +
g.queryPanel('instance:node_vmstat_pgmajfault:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Major page faults') +
{
yaxes: g.yaxes('short'),
legend+: { show: false },
},
)
)
.addRow(
g.row('Net')
.addPanel(
g.panel('Net Utilisation (Bytes Receive/Transmit)') +
g.queryPanel(
[
'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
'instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
],
['Receive', 'Transmit'],
) +
{
yaxes: g.yaxes({ format: 'Bps', min: null }),
seriesOverrides: [
{
alias: '/Receive/',
stack: 'A',
},
{
alias: '/Transmit/',
stack: 'B',
transform: 'negative-Y',
},
],
},
)
.addPanel(
g.panel('Net Saturation (Drops Receive/Transmit)') +
g.queryPanel(
[
'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
'instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
],
['Receive drops', 'Transmit drops'],
) +
{
yaxes: g.yaxes({ format: 'rps', min: null }),
seriesOverrides: [
{
alias: '/Receive/',
stack: 'A',
},
{
alias: '/Transmit/',
stack: 'B',
transform: 'negative-Y',
},
],
},
)
)
.addRow(
g.row('Disk IO')
.addPanel(
g.panel('Disk IO Utilisation') +
g.queryPanel('instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, '{{device}}') +
{ yaxes: g.yaxes('percentunit') },
)
.addPanel(
g.panel('Disk IO Saturation') +
g.queryPanel('instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, '{{device}}') +
{ yaxes: g.yaxes('percentunit') },
)
)
.addRow(
g.row('Disk Space')
.addPanel(
g.panel('Disk Space Utilisation') +
g.queryPanel(|||
1 -
(
max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"})
/
max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"})
)
||| % $._config, '{{device}}') +
{
yaxes: g.yaxes('percentunit'),
legend+: { show: false },
},
),
),
},
}