Address first batch of old review comments

Signed-off-by: beorn7 <beorn@grafana.com>
This commit is contained in:
beorn7 2019-07-12 22:58:43 +02:00
parent b3b47f2d07
commit 2180c2f3bf
3 changed files with 105 additions and 65 deletions

View file

@ -21,22 +21,23 @@ local gauge = promgrafonnet.gauge;
) )
.addTarget(prometheus.target( .addTarget(prometheus.target(
||| |||
1 - avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) 1 - avg by (cpu) (irate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
||| % $._config, ||| % $._config,
legendFormat='{{cpu}}', legendFormat='{{cpu}}',
intervalFactor=10, intervalFactor=10,
)); ));
// TODO: Is this panel useful?
local systemLoad = local systemLoad =
graphPanel.new( graphPanel.new(
'System load', 'Load Average',
datasource='$datasource', datasource='$datasource',
span=6, span=6,
format='percentunit', format='short',
) )
.addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 1m')) .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average'))
.addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 5m')) .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average'))
.addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 15m')); .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='15m load average'));
local memoryGraph = local memoryGraph =
graphPanel.new( graphPanel.new(
@ -48,27 +49,27 @@ local gauge = promgrafonnet.gauge;
.addTarget(prometheus.target( .addTarget(prometheus.target(
||| |||
( (
node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
- -
node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}
- -
node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}
- -
node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}
) )
||| % $._config, legendFormat='memory used' ||| % $._config, legendFormat='memory used'
)) ))
.addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
.addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
.addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
local memoryGauge = gauge.new( local memoryGauge = gauge.new(
'Memory Usage', 'Memory Usage',
||| |||
( (
node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}
/ /
node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
) )
* 100 * 100
||| % $._config, ||| % $._config,
@ -80,9 +81,9 @@ local gauge = promgrafonnet.gauge;
datasource='$datasource', datasource='$datasource',
span=9, span=9,
) )
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_read_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read')) .addTarget(prometheus.target('sum by (instance) (irate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read'))
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_written_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written')) .addTarget(prometheus.target('sum by (instance) (irate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written'))
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) + .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) +
{ {
seriesOverrides: [ seriesOverrides: [
{ {
@ -96,18 +97,19 @@ local gauge = promgrafonnet.gauge;
], ],
yaxes: [ yaxes: [
self.yaxe(format='bytes'), self.yaxe(format='bytes'),
self.yaxe(format='ms'), self.yaxe(format='s'),
], ],
}; };
// TODO: Should this be partitioned by mountpoint?
local diskSpaceUsage = gauge.new( local diskSpaceUsage = gauge.new(
'Disk Space Usage', 'Disk Space Usage',
||| |||
100 - 100 -
( (
sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} sum(node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"}
/ /
sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} sum(node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"}
* 100 * 100
) )
||| % $._config, ||| % $._config,
@ -120,7 +122,7 @@ local gauge = promgrafonnet.gauge;
span=6, span=6,
format='bytes', format='bytes',
) )
.addTarget(prometheus.target('irate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); .addTarget(prometheus.target('irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
local networkTransmitted = local networkTransmitted =
graphPanel.new( graphPanel.new(
@ -129,7 +131,7 @@ local gauge = promgrafonnet.gauge;
span=6, span=6,
format='bytes', format='bytes',
) )
.addTarget(prometheus.target('irate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); .addTarget(prometheus.target('irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
dashboard.new('Nodes', time_from='now-1h') dashboard.new('Nodes', time_from='now-1h')
.addTemplate( .addTemplate(
@ -152,7 +154,7 @@ local gauge = promgrafonnet.gauge;
template.new( template.new(
'instance', 'instance',
'$datasource', '$datasource',
'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config, 'label_values(node_boot_time_seconds{%(nodeExporterSelector)s}, instance)' % $._config,
refresh='time', refresh='time',
) )
) )

View file

@ -10,16 +10,30 @@ local g = import 'grafana-builder/grafana.libsonnet';
g.row('CPU') g.row('CPU')
.addPanel( .addPanel(
g.panel('CPU Utilisation') + g.panel('CPU Utilisation') +
g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) + g.queryPanel(|||
(
instance:node_cpu_utilisation:avg1m
*
instance:node_num_cpu:sum
/ ignoring (instance) group_left
sum without (instance) (instance:node_num_cpu:sum)
)
|||, '{{instance}}', legendLink) +
g.stack + g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
) )
.addPanel( .addPanel(
g.panel('CPU Saturation (Load1)') + // TODO: Is this a useful panel?
g.panel('CPU Saturation (load1 per CPU)') +
g.queryPanel(||| g.queryPanel(|||
instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s})) (
||| % $._config, '{{instance}}', legendLink) + instance:node_load1_per_cpu:ratio
/ ignoring (instance) group_left
count without (instance) (instance:node_load1_per_cpu:ratio)
)
|||, '{{instance}}', legendLink) +
g.stack + g.stack +
// TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
) )
) )
@ -43,16 +57,26 @@ local g = import 'grafana-builder/grafana.libsonnet';
.addPanel( .addPanel(
g.panel('Disk IO Utilisation') + g.panel('Disk IO Utilisation') +
// Full utilisation would be all disks on each node spending an average of // Full utilisation would be all disks on each node spending an average of
// 1 sec per second doing I/O, normalize by node count for stacked charts // 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
g.queryPanel('instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))' % $._config, '{{instance}}', legendLink) + g.queryPanel(|||
(
instance:node_disk_utilisation:sum_irate
/ ignoring (instance) group_left
count without (instance) (instance:node_disk_utilisation:sum_irate)
)
|||, '{{instance}}', legendLink) +
g.stack + g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
) )
.addPanel( .addPanel(
g.panel('Disk IO Saturation') + g.panel('Disk IO Saturation') +
g.queryPanel(||| g.queryPanel(|||
instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s})) (
||| % $._config, '{{instance}}', legendLink) + instance:node_disk_saturation:sum_irate
/ ignoring (instance) group_left
count without (instance) (instance:node_disk_saturation:sum_irate)
)
|||, '{{instance}}', legendLink) +
g.stack + g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
) )
@ -76,7 +100,21 @@ local g = import 'grafana-builder/grafana.libsonnet';
g.row('Storage') g.row('Storage')
.addPanel( .addPanel(
g.panel('Disk Capacity') + g.panel('Disk Capacity') +
g.queryPanel('sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_free_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) + g.queryPanel(|||
(
sum without (device) (
max without (fstype, mountpoint) (
node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_avail_bytes{fstype=~"ext[24]"}
)
)
/ ignoring (instance) group_left
sum without (instance, device) (
max without (fstype, mountpoint) (
node_filesystem_size_bytes{fstype=~"ext[24]"}
)
)
)
|||, '{{instance}}', legendLink) +
g.stack + g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
), ),
@ -106,9 +144,9 @@ local g = import 'grafana-builder/grafana.libsonnet';
{ yaxes: g.yaxes('percentunit') }, { yaxes: g.yaxes('percentunit') },
) )
.addPanel( .addPanel(
g.panel('Memory Saturation (Swap I/O)') + g.panel('Memory Saturation (pages swapped per second)') +
g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') + g.queryPanel('instance:node_memory_swap_io_pages:sum_rate{instance="$instance"}', 'Swap IO') +
{ yaxes: g.yaxes('Bps') }, { yaxes: g.yaxes('short') },
) )
) )
.addRow( .addRow(
@ -141,7 +179,14 @@ local g = import 'grafana-builder/grafana.libsonnet';
g.row('Disk') g.row('Disk')
.addPanel( .addPanel(
g.panel('Disk Utilisation') + g.panel('Disk Utilisation') +
g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free_bytes{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size_bytes{fstype=~"ext[24]"}))', 'Disk') + g.queryPanel(|||
1 -
(
sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{fstype=~"ext[24]"}))
/
sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{fstype=~"ext[24]"}))
)
|||, 'Disk') +
{ yaxes: g.yaxes('percentunit') }, { yaxes: g.yaxes('percentunit') },
), ),
), ),

View file

@ -8,8 +8,8 @@
// This rule gives the number of CPUs per node. // This rule gives the number of CPUs per node.
record: 'instance:node_num_cpu:sum', record: 'instance:node_num_cpu:sum',
expr: ||| expr: |||
count by (instance) ( count without (cpu) (
sum by (instance, cpu) ( sum without (mode) (
node_cpu_seconds_total{%(nodeExporterSelector)s} node_cpu_seconds_total{%(nodeExporterSelector)s}
) )
) )
@ -19,29 +19,20 @@
// CPU utilisation is % CPU is not idle. // CPU utilisation is % CPU is not idle.
record: 'instance:node_cpu_utilisation:avg1m', record: 'instance:node_cpu_utilisation:avg1m',
expr: ||| expr: |||
1 - avg by (instance) ( 1 - avg without (cpu, mode) (
rate(node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"}[1m]) rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m])
) )
||| % $._config, ||| % $._config,
}, },
{ {
// CPU saturation is 1min avg run queue length / number of CPUs. // This is CPU saturation: 1min avg run queue length / number of CPUs.
// Can go over 100%. >100% is bad. // Can go over 1. >1 is bad.
record: 'instance:node_cpu_saturation_load1:', record: 'instance:node_load1_per_cpu:ratio',
expr: ||| expr: |||
( (
sum by (instance) (node_load1{%(nodeExporterSelector)s}) node_load1{%(nodeExporterSelector)s}
/ /
instance:node_num_cpu:sum instance:node_num_cpu:sum{%(nodeExporterSelector)s}
)
||| % $._config,
},
{
// Total memory per node
record: 'instance:node_memory_bytes_total:sum',
expr: |||
sum by (instance) (
node_memory_MemTotal_bytes{%(nodeExporterSelector)s}
) )
||| % $._config, ||| % $._config,
}, },
@ -57,9 +48,9 @@
||| % $._config, ||| % $._config,
}, },
{ {
record: 'instance:node_memory_swap_io_bytes:sum_rate', record: 'instance:node_memory_swap_io_pages:sum_rate',
expr: ||| expr: |||
1e3 * sum by (instance) ( (
rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
+ +
rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m]) rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m])
@ -70,7 +61,7 @@
// Disk utilisation (seconds spent per second, via irate()) // Disk utilisation (seconds spent per second, via irate())
record: 'instance:node_disk_utilisation:sum_irate', record: 'instance:node_disk_utilisation:sum_irate',
expr: ||| expr: |||
sum by (instance) ( sum without (device) (
irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m])
) )
||| % $._config, ||| % $._config,
@ -79,28 +70,30 @@
// Disk saturation (weighted seconds spent per second, via irate()) // Disk saturation (weighted seconds spent per second, via irate())
record: 'instance:node_disk_saturation:sum_irate', record: 'instance:node_disk_saturation:sum_irate',
expr: ||| expr: |||
sum by (instance) ( sum without (device) (
irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m])
) )
||| % $._config, ||| % $._config,
}, },
// TODO: For the following two rules, consider configurable filtering to exclude more network
// device names than just "lo".
{ {
record: 'instance:node_net_utilisation:sum_irate', record: 'instance:node_net_utilisation:sum_irate',
expr: ||| expr: |||
sum by (instance) ( sum without (device) (
irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
+ +
irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
) )
||| % $._config, ||| % $._config,
}, },
{ {
record: 'instance:node_net_saturation:sum_irate', record: 'instance:node_net_saturation:sum_irate',
expr: ||| expr: |||
sum by (instance) ( sum without (device) (
irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) irate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
+ +
irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) irate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
) )
||| % $._config, ||| % $._config,
}, },