2018-05-08 03:10:29 -07:00
|
|
|
local g = import 'grafana-builder/grafana.libsonnet';
|
|
|
|
|
|
|
|
{
|
|
|
|
grafanaDashboards+:: {
|
|
|
|
'node-cluster-rsrc-use.json':
|
|
|
|
// URL handed to g.queryPanel as its third argument by every cluster panel.
// It has to name the per-node dashboard registered in this file, whose key is
// 'node-rsrc-use.json'; the previous 'k8s-node-rsrc-use.json' target is not
// defined here.
// NOTE(review): assumes Grafana serves file dashboards under
// <grafana_prefix>/dashboard/file/<key> - confirm against the deployment.
local legendLink = '%s/dashboard/file/node-rsrc-use.json' % $._config.grafana_prefix;
|
|
|
|
|
|
|
|
g.dashboard('USE Method / Cluster')
.addRow(
  // CPU row of the cluster dashboard: one stacked series per instance, each
  // divided by a cluster-wide aggregate so the stack is a share of the whole.

  // Utilisation of each instance multiplied by its CPU count, divided by the
  // CPU count summed over all instances (going by the recording-rule names).
  local cpuUtilisationQuery = |||
    (
      instance:node_cpu_utilisation:avg_rate1m
    *
      instance:node_num_cpu:sum
    / ignoring (instance) group_left
      sum without (instance) (instance:node_num_cpu:sum)
    )
  |||;

  // Per-CPU load1 of each instance divided by the number of reporting
  // instances.
  local cpuSaturationQuery = |||
    (
      instance:node_load1_per_cpu:ratio
    / ignoring (instance) group_left
      count without (instance) (instance:node_load1_per_cpu:ratio)
    )
  |||;

  local utilisationPanel =
    g.panel('CPU Utilisation') +
    g.queryPanel(cpuUtilisationQuery, '{{instance}}', legendLink) +
    g.stack +
    { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) };

  // TODO: Is this a useful panel?
  local saturationPanel =
    g.panel('CPU Saturation (load1 per CPU)') +
    g.queryPanel(cpuSaturationQuery, '{{instance}}', legendLink) +
    g.stack +
    // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
    { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) };

  g.row('CPU')
  .addPanel(utilisationPanel)
  .addPanel(saturationPanel)
)
|
|
|
|
.addRow(
  // Memory row of the cluster dashboard. Both panels plot one stacked series
  // per instance and pass legendLink to g.queryPanel.
  local utilisationPanel =
    g.panel('Memory Utilisation') +
    g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) +
    g.stack +
    // Ratio axis, capped at 1.
    { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) };

  local saturationPanel =
    g.panel('Memory Saturation (Swap I/O)') +
    g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) +
    g.stack +
    // Bytes-per-second axis.
    { yaxes: g.yaxes('Bps') };

  g.row('Memory')
  .addPanel(utilisationPanel)
  .addPanel(saturationPanel)
)
|
|
|
|
.addRow(
  // Disk row of the cluster dashboard. Both panels share the same 0..1 ratio
  // axis and the same normalisation scheme.
  local ratioAxes = { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) };

  // Full utilisation would be all disks on each node spending an average of
  // 1 second per second doing I/O; dividing by the number of series
  // (metric cardinality) normalises the values for the stacked chart.
  local ioTimeQuery = |||
    (
      instance:node_disk_io_time:sum_rate1m
    / ignoring (instance) group_left
      count without (instance) (instance:node_disk_io_time:sum_rate1m)
    )
  |||;

  // Same normalisation applied to the weighted I/O time rule.
  local weightedIoTimeQuery = |||
    (
      instance:node_disk_io_time_weighted:sum_rate1m
    / ignoring (instance) group_left
      count without (instance) (instance:node_disk_io_time_weighted:sum_rate1m)
    )
  |||;

  g.row('Disk')
  .addPanel(
    g.panel('Disk IO Utilisation') +
    g.queryPanel(ioTimeQuery, '{{instance}}', legendLink) +
    g.stack +
    ratioAxes,
  )
  .addPanel(
    g.panel('Disk IO Saturation') +
    g.queryPanel(weightedIoTimeQuery, '{{instance}}', legendLink) +
    g.stack +
    ratioAxes,
  )
)
|
|
|
|
.addRow(
  // Network row of the cluster dashboard: one stacked series per instance,
  // with legendLink passed to g.queryPanel.
  g.row('Network')
  .addPanel(
    g.panel('Net Utilisation (Transmitted)') +
    g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) +
    g.stack +
    // Bytes-per-second axis.
    { yaxes: g.yaxes('Bps') },
  )
  .addPanel(
    g.panel('Net Saturation (Dropped)') +
    g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) +
    g.stack +
    // Drops are events rather than bytes, so use the same per-second event
    // unit as the drop panel of the per-node dashboard.
    // NOTE(review): assumes the recording rule is built from the
    // *_drop_total counters, as the panel title suggests - confirm.
    { yaxes: g.yaxes('rps') },
  )
)
|
|
|
|
.addRow(
  // Storage row of the cluster dashboard.
  //
  // Numerator: per instance, bytes in use (size minus available); max
  // collapses the fstype and mountpoint labels before the per-device values
  // are summed.
  // Denominator: total filesystem size across all instances, built the same
  // way. The stacked series therefore add up to the used fraction of the
  // whole fleet's capacity.
  // The selectors are filled in from $._config.
  local usedCapacityQuery = |||
    (
      sum without (device) (
        max without (fstype, mountpoint) (
          node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
        )
      )
    / ignoring (instance) group_left
      sum without (instance, device) (
        max without (fstype, mountpoint) (
          node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
        )
      )
    )
  ||| % $._config;

  local capacityPanel =
    g.panel('Disk Capacity') +
    g.queryPanel(usedCapacityQuery, '{{instance}}', legendLink) +
    g.stack +
    { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) };

  g.row('Storage')
  .addPanel(capacityPanel),
),
|
|
|
|
|
2018-05-11 06:40:20 -07:00
|
|
|
'node-rsrc-use.json':
|
|
|
|
// Per-node dashboard. The 'instance' template is built from the `up` series
// matching the configured node-exporter selector; every panel query then
// filters on instance="$instance".
g.dashboard('USE Method / Node')
.addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance')
.addRow(
  g.row('CPU')
  .addPanel(
    g.panel('CPU Utilisation') +
    g.queryPanel('instance:node_cpu_utilisation:avg_rate1m{instance="$instance"}', 'Utilisation') +
    { yaxes: g.yaxes('percentunit') },
  )
  .addPanel(
    g.panel('CPU Saturation (Load1)') +
    // Use the same load1-per-CPU recording rule as the cluster dashboard; the
    // former name 'instance:node_cpu_saturation_load1:' (with a trailing
    // colon) is referenced nowhere else in this file.
    g.queryPanel('instance:node_load1_per_cpu:ratio{instance="$instance"}', 'Saturation') +
    { yaxes: g.yaxes('percentunit') },
  )
)
|
|
|
|
.addRow(
  // Memory row of the per-node dashboard; both queries are restricted to the
  // instance chosen through the $instance template variable.
  local utilisationPanel =
    g.panel('Memory Utilisation') +
    g.queryPanel('instance:node_memory_utilisation:ratio{instance="$instance"}', 'Memory') +
    { yaxes: g.yaxes('percentunit') };

  local saturationPanel =
    g.panel('Memory Saturation (pages swapped per second)') +
    g.queryPanel('instance:node_memory_swap_io_pages:rate1m{instance="$instance"}', 'Swap IO') +
    // Plain numeric axis: the value is a page rate, not a ratio.
    { yaxes: g.yaxes('short') };

  g.row('Memory')
  .addPanel(utilisationPanel)
  .addPanel(saturationPanel)
)
|
|
|
|
.addRow(
  // Disk I/O row of the per-node dashboard; both queries are restricted to
  // the instance chosen through the $instance template variable.
  local percentAxes = { yaxes: g.yaxes('percentunit') };

  local utilisationPanel =
    g.panel('Disk IO Utilisation') +
    g.queryPanel('instance:node_disk_io_time:sum_rate1m{instance="$instance"}', 'Utilisation') +
    percentAxes;

  local saturationPanel =
    g.panel('Disk IO Saturation') +
    g.queryPanel('instance:node_disk_io_time_weighted:sum_rate1m{instance="$instance"}', 'Saturation') +
    percentAxes;

  g.row('Disk')
  .addPanel(utilisationPanel)
  .addPanel(saturationPanel)
)
|
|
|
|
.addRow(
  // Network row of the per-node dashboard. Receive is plotted above zero and
  // transmit is negated so it is drawn below zero.
  //
  // The node_network_*_total series are cumulative counters, so each one is
  // wrapped in rate() to give the per-second values that the 'Bps' and 'rps'
  // axes expect. The 1m window matches the *rate1m recording rules used by the
  // other panels.
  g.row('Net')
  .addPanel(
    g.panel('Net Utilisation (Bytes Receive/Transmit)') +
    g.queryPanel(
      [
        'rate(node_network_receive_bytes_total{instance="$instance"}[1m])',
        '-rate(node_network_transmit_bytes_total{instance="$instance"}[1m])',
      ],
      ['Receive', 'Transmit'],
    ) +
    { yaxes: g.yaxes('Bps') },
  )
  .addPanel(
    g.panel('Net Saturation (Drops Receive/Transmit)') +
    g.queryPanel(
      [
        'rate(node_network_receive_drop_total{instance="$instance"}[1m])',
        '-rate(node_network_transmit_drop_total{instance="$instance"}[1m])',
      ],
      ['Receive drops', 'Transmit drops'],
    ) +
    { yaxes: g.yaxes('rps') },
  )
)
|
|
|
|
.addRow(
  // Filesystem capacity row of the per-node dashboard.
  //
  // Used fraction = 1 - available / size. max collapses the mountpoint and
  // fstype labels before the values are summed.
  // Both selectors filter on instance="$instance", like every other panel on
  // this dashboard; without that filter the sums run over every instance
  // matched by the node-exporter selector and the panel shows a fleet-wide
  // figure.
  g.row('Disk')
  .addPanel(
    g.panel('Disk Utilisation') +
    g.queryPanel(|||
      1 -
      (
        sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"}))
      /
        sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"}))
      )
    ||| % $._config, 'Disk') +
    { yaxes: g.yaxes('percentunit') },
  ),
),
|
|
|
|
},
|
|
|
|
}
|