Refactor USE method mixin dashboards with grafonnet-lib, add multi-cluster support.

Aiming for cleaner code and following the standards used in newer mixins.

Signed-off-by: ArthurSens <arthursens2005@gmail.com>
ArthurSens 2021-04-02 00:34:23 +00:00 committed by Johannes 'fish' Ziemke
parent 129b5f5b5f
commit 3731f93fd7
3 changed files with 475 additions and 272 deletions


@@ -52,8 +52,12 @@
 fsSpaceAvailableCriticalThreshold: 5,
 fsSpaceAvailableWarningThreshold: 3,
-grafana_prefix: '',
 rateInterval: '5m',
+// Opt-in for multi-cluster support.
+showMultiCluster: false,
+clusterLabel: 'cluster',
+dashboardNamePrefix: 'Node Exporter / ',
+dashboardTags: ['node-exporter-mixin'],
 },
}
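These options are opt-in: the multi-cluster dashboard is only rendered when a downstream configuration sets showMultiCluster to true. A minimal sketch of such an override, assuming the usual mixin entry point (the file name mixin.libsonnet is an assumption, not part of this commit):

// Hypothetical downstream override that enables multi-cluster support.
// 'mixin.libsonnet' is an assumed entry point that mixes config.libsonnet
// into the top-level object, exposing these fields under _config.
(import 'mixin.libsonnet') + {
  _config+:: {
    showMultiCluster: true,       // opt in to the extra dashboard
    clusterLabel: 'cluster',      // label that identifies a cluster in your metrics
    dashboardNamePrefix: 'Node Exporter / ',
    dashboardTags: ['node-exporter-mixin'],
  },
}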


@@ -200,7 +200,14 @@ local gauge = promgrafonnet.gauge;
 legendFormat='{{device}}',
 ));
-dashboard.new('Nodes', time_from='now-1h')
+dashboard.new(
+'%sNodes' % $._config.dashboardNamePrefix,
+time_from='now-1h',
+tags=($._config.dashboardTags),
+timezone='utc',
+refresh='30s',
+graphTooltip='shared_crosshair'
+)
 .addTemplate(
 {
 current: {
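For reference, with the defaults introduced in config.libsonnet the templated title and tags above evaluate as follows; this throwaway snippet is illustrative only and not part of the commit:

// Evaluates to { title: 'Node Exporter / Nodes', tags: ['node-exporter-mixin'] }
local config = {
  dashboardNamePrefix: 'Node Exporter / ',
  dashboardTags: ['node-exporter-mixin'],
};
{
  title: '%sNodes' % config.dashboardNamePrefix,
  tags: config.dashboardTags,
}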


@@ -1,275 +1,467 @@
This hunk rewrites the file in full: the previous grafana-builder based dashboards (imported as 'g') are removed and replaced by the grafonnet-lib implementation that follows.
local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local prometheus = grafana.prometheus;
local template = grafana.template;
local graphPanel = grafana.graphPanel;
local c = import '../config.libsonnet';
local datasourceTemplate = {
current: {
text: 'Prometheus',
value: 'Prometheus',
},
hide: 0,
label: null,
name: 'datasource',
options: [],
query: 'prometheus',
refresh: 1,
regex: '',
type: 'datasource',
};
local clusterTemplate =
template.new(
name='cluster',
datasource='$datasource',
query='label_values(node_time_seconds, %s)' % c._config.clusterLabel,
current='',
hide=if c._config.showMultiCluster then '' else '2',
refresh=2,
includeAll=false,
sort=1
);
local CPUUtilisation =
graphPanel.new(
'CPU Utilisation',
datasource='$datasource',
span=6,
format='percentunit',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
local CPUSaturation =
// TODO: Is this a useful panel? At least there should be some explanation how load
// average relates to the "CPU saturation" in the title.
graphPanel.new(
'CPU Saturation (Load1 per CPU)',
datasource='$datasource',
span=6,
format='percentunit',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
local memoryUtilisation =
graphPanel.new(
'Memory Utilisation',
datasource='$datasource',
span=6,
format='percentunit',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
local memorySaturation =
graphPanel.new(
'Memory Saturation (Major Page Faults)',
datasource='$datasource',
span=6,
format='rds',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
local networkUtilisation =
graphPanel.new(
'Network Utilisation (Bytes Receive/Transmit)',
datasource='$datasource',
span=6,
format='Bps',
stack=true,
fill=10,
legend_show=false,
)
.addSeriesOverride({ alias: '/Receive/', stack: 'A' })
.addSeriesOverride({ alias: '/Transmit/', stack: 'B', transform: 'negative-Y' })
{ tooltip+: { sort: 2 } };
local networkSaturation =
graphPanel.new(
'Network Saturation (Drops Receive/Transmit)',
datasource='$datasource',
span=6,
format='Bps',
stack=true,
fill=10,
legend_show=false,
)
.addSeriesOverride({ alias: '/ Receive/', stack: 'A' })
.addSeriesOverride({ alias: '/ Transmit/', stack: 'B', transform: 'negative-Y' })
{ tooltip+: { sort: 2 } };
local diskIOUtilisation =
graphPanel.new(
'Disk IO Utilisation',
datasource='$datasource',
span=6,
format='percentunit',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
local diskIOSaturation =
graphPanel.new(
'Disk IO Saturation',
datasource='$datasource',
span=6,
format='percentunit',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
local diskSpaceUtilisation =
graphPanel.new(
'Disk Space Utilisation',
datasource='$datasource',
span=12,
format='percentunit',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
{
grafanaDashboards+:: {
'node-rsrc-use.json':
dashboard.new(
'%sUSE Method / Node' % $._config.dashboardNamePrefix,
time_from='now-1h',
tags=($._config.dashboardTags),
timezone='utc',
refresh='30s',
graphTooltip='shared_crosshair'
)
.addTemplate(datasourceTemplate)
.addTemplate(clusterTemplate)
.addTemplate(
template.new(
'instance',
'$datasource',
'label_values(node_exporter_build_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}, instance)' % $._config,
refresh='time',
sort=1
)
)
.addRow(
row.new('CPU')
.addPanel(CPUUtilisation.addTarget(prometheus.target('instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Utilisation')))
.addPanel(CPUSaturation.addTarget(prometheus.target('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Saturation')))
)
.addRow(
row.new('Memory')
.addPanel(memoryUtilisation.addTarget(prometheus.target('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Utilisation')))
.addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Major page Faults')))
)
.addRow(
row.new('Network')
.addPanel(
networkUtilisation
.addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Receive'))
.addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Transmit'))
)
.addPanel(
networkSaturation
.addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Receive'))
.addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Transmit'))
)
)
.addRow(
row.new('Disk IO')
.addPanel(diskIOUtilisation.addTarget(prometheus.target('instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{device}}')))
.addPanel(diskIOSaturation.addTarget(prometheus.target('instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{device}}')))
)
.addRow(
row.new('Disk Space')
.addPanel(
diskSpaceUtilisation.addTarget(prometheus.target(
|||
sort_desc(1 -
(
max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"})
/
max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"})
) != 0
)
||| % $._config, legendFormat='{{device}}'
))
)
),
'node-cluster-rsrc-use.json':
dashboard.new(
'%sUSE Method / Cluster' % $._config.dashboardNamePrefix,
time_from='now-1h',
tags=($._config.dashboardTags),
timezone='utc',
refresh='30s',
graphTooltip='shared_crosshair'
)
.addTemplate(datasourceTemplate)
.addTemplate(clusterTemplate)
.addRow(
row.new('CPU')
.addPanel(
CPUUtilisation
.addTarget(prometheus.target(
|||
((
instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
*
instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
) != 0 )
/ scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
||| % $._config, legendFormat='{{ instance }}'
))
)
.addPanel(
CPUSaturation
.addTarget(prometheus.target(
|||
(
instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
/ scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
) != 0
||| % $._config, legendFormat='{{instance}}'
))
)
)
.addRow(
row.new('Memory')
.addPanel(
memoryUtilisation
.addTarget(prometheus.target(
|||
(
instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
/ scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
) != 0
||| % $._config, legendFormat='{{instance}}',
))
)
.addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{instance}}')))
)
.addRow(
row.new('Network')
.addPanel(
networkUtilisation
.addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive'))
.addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit'))
)
.addPanel(
networkSaturation
.addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive'))
.addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit'))
)
)
.addRow(
row.new('Disk IO')
.addPanel(
diskIOUtilisation
.addTarget(prometheus.target(
|||
(
instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
/ scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
) != 0
||| % $._config, legendFormat='{{instance}} {{device}}'
))
)
.addPanel(
diskIOSaturation
.addTarget(prometheus.target(
|||
(
instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
) != 0
||| % $._config, legendFormat='{{instance}} {{device}}'
))
)
)
.addRow(
row.new('Disk Space')
.addPanel(
diskSpaceUtilisation
.addTarget(prometheus.target(
|||
sum without (device) (
max without (fstype, mountpoint) ((
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(clusterLabel)s="$cluster"}
-
node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(clusterLabel)s="$cluster"}
) != 0)
)
/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(clusterLabel)s="$cluster"})))
||| % $._config, legendFormat='{{instance}}'
))
)
),
} +
if $._config.showMultiCluster then {
'node-multicluster-rsrc-use.json':
dashboard.new(
'%sUSE Method / Multi-cluster' % $._config.dashboardNamePrefix,
time_from='now-1h',
tags=($._config.dashboardTags),
timezone='utc',
refresh='30s',
graphTooltip='shared_crosshair'
)
.addTemplate(datasourceTemplate)
.addRow(
row.new('CPU')
.addPanel(
CPUUtilisation
.addTarget(prometheus.target(
|||
sum(
((
instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s}
*
instance:node_num_cpu:sum{%(nodeExporterSelector)s}
) != 0)
/ scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s}))
) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config
))
)
.addPanel(
CPUSaturation
.addTarget(prometheus.target(
|||
sum((
instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
/ scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}))
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config
))
)
)
.addRow(
row.new('Memory')
.addPanel(
memoryUtilisation
.addTarget(prometheus.target(
|||
sum((
instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}
/ scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}))
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config
))
)
.addPanel(
memorySaturation
.addTarget(prometheus.target(
|||
sum((
instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config
))
)
)
.addRow(
row.new('Network')
.addPanel(
networkUtilisation
.addTarget(prometheus.target(
|||
sum((
instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}} Receive' % $._config
))
.addTarget(prometheus.target(
|||
sum((
instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}} Transmit' % $._config
))
)
.addPanel(
networkSaturation
.addTarget(prometheus.target(
|||
sum((
instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}} Receive' % $._config
))
.addTarget(prometheus.target(
|||
sum((
instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}} Transmit' % $._config
))
)
)
.addRow(
row.new('Disk IO')
.addPanel(
diskIOUtilisation
.addTarget(prometheus.target(
|||
sum((
instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}
/ scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}))
) != 0) by (%(clusterLabel)s, device)
||| % $._config, legendFormat='{{%(clusterLabel)s}} {{device}}' % $._config
))
)
.addPanel(
diskIOSaturation
.addTarget(prometheus.target(
|||
sum((
instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}
/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}))
) != 0) by (%(clusterLabel)s, device)
||| % $._config, legendFormat='{{%(clusterLabel)s}} {{device}}' % $._config
))
)
)
.addRow(
row.new('Disk Space')
.addPanel(
diskSpaceUtilisation
.addTarget(prometheus.target(
|||
sum (
sum without (device) (
max without (fstype, mountpoint, instance, pod) ((
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
) != 0)
)
/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s})))
) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config
))
)
),
} else {},
}
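The trailing conditional means the multi-cluster dashboard is only emitted when showMultiCluster is set. A sketch of rendering all dashboards to individual JSON files; the entry point mixin.libsonnet and the output layout are assumptions, only grafanaDashboards and showMultiCluster come from this commit:

// render.jsonnet (hypothetical), used as:
//   jsonnet -J vendor -m dashboards_out render.jsonnet
// With showMultiCluster at its default (false) only node-rsrc-use.json and
// node-cluster-rsrc-use.json are written; setting it to true also produces
// node-multicluster-rsrc-use.json.
local mixin = (import 'mixin.libsonnet') + {
  _config+:: { showMultiCluster: true },
};
{
  [name]: mixin.grafanaDashboards[name]
  for name in std.objectFields(mixin.grafanaDashboards)
}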