mirror of
https://github.com/prometheus/node_exporter.git
synced 2024-12-28 06:59:44 -08:00
Beginnings of a node-exporter monitoring mixin.
Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>
This commit is contained in:
parent
17fee8081f
commit
bafe1707f1
165
node-mixin/alerts/alerts.libsonnet
Normal file
165
node-mixin/alerts/alerts.libsonnet
Normal file
|
@ -0,0 +1,165 @@
|
|||
{
|
||||
prometheusAlerts+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'node',
|
||||
rules: [
|
||||
// Warning: a linear extrapolation of the last 6h of node_filesystem_avail
// drops below zero within 24h, less than 40% of the filesystem is available
// and the filesystem is not flagged read-only. Must hold for 1h.
{
  alert: 'NodeFilesystemSpaceFillingUp',
  annotations: {
    message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 24 hours.',
  },
  labels: { severity: 'warning' },
  'for': '1h',
  // Selectors come from $._config (nodeExporterSelector, fsSelectors).
  expr: std.format(|||
    predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0
    AND
    node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4
    AND
    node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
  |||, $._config),
},
|
||||
// Critical: same shape as the 24h warning, but the extrapolation horizon is
// 4h and the available/size ratio must already be below 0.2.
{
  alert: 'NodeFilesystemSpaceFillingUp',
  annotations: {
    message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 4 hours.',
  },
  labels: { severity: 'critical' },
  'for': '1h',
  expr: std.format(|||
    predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0
    AND
    node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2
    AND
    node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
  |||, $._config),
},
|
||||
// Warning: less than 5% of the filesystem is available on a filesystem that
// is not flagged read-only, sustained for 1h. $value is the percentage.
{
  alert: 'NodeFilesystemOutOfSpace',
  annotations: {
    message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.',
  },
  labels: { severity: 'warning' },
  'for': '1h',
  expr: std.format(|||
    node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5
    AND
    node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
  |||, $._config),
},
|
||||
// Critical: less than 3% of the filesystem is available on a filesystem that
// is not flagged read-only, sustained for 1h. $value is the percentage.
{
  alert: 'NodeFilesystemOutOfSpace',
  annotations: {
    message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.',
  },
  labels: { severity: 'critical' },
  'for': '1h',
  expr: std.format(|||
    node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3
    AND
    node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
  |||, $._config),
},
|
||||
// Warning: a linear extrapolation of the last 6h of node_filesystem_files_free
// drops below zero within 24h, the free/total files ratio is below 0.4 and
// the filesystem is not flagged read-only. Must hold for 1h.
{
  alert: 'NodeFilesystemFilesFillingUp',
  annotations: {
    message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 24 hours.',
  },
  labels: { severity: 'warning' },
  'for': '1h',
  expr: std.format(|||
    predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0
    AND
    node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4
    AND
    node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
  |||, $._config),
},
|
||||
// Critical counterpart of the 24h NodeFilesystemFilesFillingUp warning: the
// extrapolation of node_filesystem_files_free over the last 6h reaches zero
// within 4h, the free/total files ratio is already below 0.2 and the
// filesystem is not flagged read-only. Must hold for 1h.
{
  alert: 'NodeFilesystemFilesFillingUp',
  expr: |||
    predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0
    AND
    node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2
    AND
    node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
  ||| % $._config,
  'for': '1h',
  labels: {
    // Was 'warning', identical to the 24h rule; the tighter 4h/20% variant is
    // the escalation, matching the space-based FillingUp pair.
    severity: 'critical',
  },
  annotations: {
    message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 4 hours.',
  },
},
|
||||
// Warning: fewer than 5% of the filesystem's files (inodes) are free on a
// filesystem that is not flagged read-only, sustained for 1h.
{
  alert: 'NodeFilesystemOutOfFiles',
  annotations: {
    message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.',
  },
  labels: { severity: 'warning' },
  'for': '1h',
  expr: std.format(|||
    node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5
    AND
    node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
  |||, $._config),
},
|
||||
// Critical: fewer than 3% of the filesystem's files (inodes) are free on a
// filesystem that is not flagged read-only, sustained for 1h.
//
// This rule evaluates node_filesystem_files_free / node_filesystem_files, so
// it is the critical tier of NodeFilesystemOutOfFiles. It was previously
// named NodeFilesystemOutOfSpace and reported "space", which made it
// indistinguishable from the byte-based alert.
{
  alert: 'NodeFilesystemOutOfFiles',
  expr: |||
    node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3
    AND
    node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0
  ||| % $._config,
  'for': '1h',
  labels: {
    severity: 'critical',
  },
  annotations: {
    message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.',
  },
},
|
||||
// Critical: more than 10 receive errors were counted on an interface over a
// 2m window, sustained for 1h.
{
  alert: 'NodeNetworkReceiveErrs',
  // The query is formatted with $._config, so it now actually uses the
  // nodeExporterSelector like every other rule; previously the template had
  // no placeholder and matched the metric from any job.
  expr: |||
    increase(node_network_receive_errs{%(nodeExporterSelector)s}[2m]) > 10
  ||| % $._config,
  'for': '1h',
  labels: {
    severity: 'critical',
  },
  annotations: {
    message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).',
  },
},
|
||||
// Critical: more than 10 transmit errors were counted on an interface over a
// 2m window, sustained for 1h.
{
  alert: 'NodeNetworkTransmitErrs',
  // The query is formatted with $._config, so it now actually uses the
  // nodeExporterSelector like every other rule; previously the template had
  // no placeholder and matched the metric from any job.
  expr: |||
    increase(node_network_transmit_errs{%(nodeExporterSelector)s}[2m]) > 10
  ||| % $._config,
  'for': '1h',
  labels: {
    severity: 'critical',
  },
  annotations: {
    message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).',
  },
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
11
node-mixin/config.libsonnet
Normal file
11
node-mixin/config.libsonnet
Normal file
|
@ -0,0 +1,11 @@
|
|||
// Overridable settings shared by the mixin's alerts, rules and dashboards.
// The values are spliced into queries through `%(name)s` placeholders.
{
  _config+:: {
    // Prefix prepended to the dashboard links built in the USE dashboards.
    grafana_prefix: '',

    // Label matchers limiting the filesystem alerts to the filesystems of
    // interest; factored out because they repeat, and handy to customize.
    fsSelectors: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"',

    // Label matcher identifying node-exporter targets. Like fsSelectors it
    // is inserted between the {} of a Prometheus selector.
    nodeExporterSelector: 'job="node-exporter"',
  },
}
|
2
node-mixin/dashboards/dashboards.libsonnet
Normal file
2
node-mixin/dashboards/dashboards.libsonnet
Normal file
|
@ -0,0 +1,2 @@
|
|||
(import 'node.libsonnet') +
|
||||
(import 'use.libsonnet')
|
176
node-mixin/dashboards/node.libsonnet
Normal file
176
node-mixin/dashboards/node.libsonnet
Normal file
|
@ -0,0 +1,176 @@
|
|||
local grafana = import 'grafonnet/grafana.libsonnet';
|
||||
local dashboard = grafana.dashboard;
|
||||
local row = grafana.row;
|
||||
local prometheus = grafana.prometheus;
|
||||
local template = grafana.template;
|
||||
local graphPanel = grafana.graphPanel;
|
||||
local promgrafonnet = import '../lib/promgrafonnet/promgrafonnet.libsonnet';
|
||||
local gauge = promgrafonnet.gauge;
|
||||
|
||||
{
|
||||
grafanaDashboards+:: {
|
||||
'nodes.json':
|
||||
// Per-core CPU usage for the selected instance, as a 0-100 percentage.
// The query subtracts the idle percentage from 100, so the panel shows usage;
// it was previously titled 'Idle CPU', which is the opposite of what it plots.
// The local keeps its name because the dashboard assembly refers to it.
local idleCPU =
  graphPanel.new(
    'CPU Usage',
    datasource='$datasource',
    span=6,
    format='percent',
    max=100,
    min=0,
  )
  .addTarget(prometheus.target(
    |||
      100 - (avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[5m])) * 100)
    ||| % $._config,
    legendFormat='{{cpu}}',
    intervalFactor=10,
  ));
|
||||
|
||||
// Load averages (1m, 5m, 15m) of the selected instance, each multiplied by
// 100 and drawn on a 'percent' axis.
local systemLoad =
  local cfg = $._config;
  local load1 = 'node_load1{%(nodeExporterSelector)s, instance="$instance"} * 100' % cfg;
  local load5 = 'node_load5{%(nodeExporterSelector)s, instance="$instance"} * 100' % cfg;
  local load15 = 'node_load15{%(nodeExporterSelector)s, instance="$instance"} * 100' % cfg;

  graphPanel.new('System load', datasource='$datasource', span=6, format='percent')
  .addTarget(prometheus.target(load1, legendFormat='load 1m'))
  .addTarget(prometheus.target(load5, legendFormat='load 5m'))
  .addTarget(prometheus.target(load15, legendFormat='load 15m'));
|
||||
|
||||
// Memory breakdown of the selected instance in bytes: used (total minus
// free, buffers and cached), buffers, cached and free.
local memoryGraph =
  local cfg = $._config;
  local usedQuery = |||
    node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
    - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}
    - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}
    - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}
  ||| % cfg;
  local buffersQuery = 'node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % cfg;
  local cachedQuery = 'node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % cfg;
  local freeQuery = 'node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % cfg;

  graphPanel.new('Memory Usage', datasource='$datasource', span=9, format='bytes')
  .addTarget(prometheus.target(usedQuery, legendFormat='memory used'))
  .addTarget(prometheus.target(buffersQuery, legendFormat='memory buffers'))
  .addTarget(prometheus.target(cachedQuery, legendFormat='memory cached'))
  .addTarget(prometheus.target(freeQuery, legendFormat='memory free'));
|
||||
|
||||
// Gauge of used memory (total minus free, buffers and cached) as a
// percentage of total memory for the selected instance; lower is better.
local memoryGauge =
  local usedPercentQuery = |||
    (
    node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
    - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}
    - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}
    - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}
    ) * 100
    /
    node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
  ||| % $._config;

  gauge.new('Memory Usage', usedPercentQuery).withLowerBeingBetter();
|
||||
|
||||
// Disk throughput and I/O time of the selected instance. Read/written byte
// rates use the first (bytes) y-axis; the 'io time' series is moved to the
// second (ms) y-axis through a series override.
local diskIO =
  local cfg = $._config;
  local readQuery = 'sum by (instance) (rate(node_disk_bytes_read{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % cfg;
  local writtenQuery = 'sum by (instance) (rate(node_disk_bytes_written{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % cfg;
  local ioTimeQuery = 'sum by (instance) (rate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % cfg;

  graphPanel.new('Disk I/O', datasource='$datasource', span=9)
  .addTarget(prometheus.target(readQuery, legendFormat='read'))
  .addTarget(prometheus.target(writtenQuery, legendFormat='written'))
  .addTarget(prometheus.target(ioTimeQuery, legendFormat='io time')) +
  {
    // `self` is the merged panel, so yaxe() resolves on the graph panel.
    yaxes: [
      self.yaxe(format='bytes'),
      self.yaxe(format='ms'),
    ],
    seriesOverrides: [
      { alias: 'read', yaxis: 1 },
      { alias: 'io time', yaxis: 2 },
    ],
  };
|
||||
|
||||
// Gauge of used filesystem space (size minus free, summed over every device
// except "rootfs") as a percentage of total size for the selected instance;
// lower is better.
local diskSpaceUsage =
  local usedPercentQuery = |||
    (
    sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
    - sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
    ) * 100
    /
    sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
  ||| % $._config;

  gauge.new('Disk Space Usage', usedPercentQuery).withLowerBeingBetter();
|
||||
|
||||
// Per-device receive rate (5m window) for the selected instance; devices
// matching "lo" are excluded.
local networkReceived =
  local rxQuery = 'rate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config;

  graphPanel.new('Network Received', datasource='$datasource', span=6, format='bytes')
  .addTarget(prometheus.target(rxQuery, legendFormat='{{device}}'));
|
||||
|
||||
// Per-device transmit rate (5m window) for the selected instance; devices
// matching "lo" are excluded.
local networkTransmitted =
  local txQuery = 'rate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config;

  graphPanel.new('Network Transmitted', datasource='$datasource', span=6, format='bytes')
  .addTarget(prometheus.target(txQuery, legendFormat='{{device}}'));
|
||||
|
||||
// Assemble the 'Nodes' dashboard: a datasource picker, an instance picker
// and four rows of two panels each (CPU, memory, disk, network).

// Template variable of type 'datasource' that backs `$datasource`.
local datasourceVar = {
  name: 'datasource',
  type: 'datasource',
  query: 'prometheus',
  current: {
    text: 'Prometheus',
    value: 'Prometheus',
  },
  hide: 0,
  label: null,
  options: [],
  refresh: 1,
  regex: '',
};

// `$instance`: instance label values taken from node_boot_time.
local instanceVar = template.new(
  'instance',
  '$datasource',
  'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config,
  refresh='time',
);

local cpuRow = row.new().addPanel(idleCPU).addPanel(systemLoad);
local memoryRow = row.new().addPanel(memoryGraph).addPanel(memoryGauge);
local diskRow = row.new().addPanel(diskIO).addPanel(diskSpaceUsage);
local networkRow = row.new().addPanel(networkReceived).addPanel(networkTransmitted);

dashboard.new('Nodes', time_from='now-1h')
.addTemplate(datasourceVar)
.addTemplate(instanceVar)
.addRow(cpuRow)
.addRow(memoryRow)
.addRow(diskRow)
.addRow(networkRow),
|
||||
},
|
||||
}
|
151
node-mixin/dashboards/use.libsonnet
Normal file
151
node-mixin/dashboards/use.libsonnet
Normal file
|
@ -0,0 +1,151 @@
|
|||
local g = import 'grafana-builder/grafana.libsonnet';
|
||||
|
||||
{
|
||||
grafanaDashboards+:: {
|
||||
// Cluster-level USE (utilisation / saturation) dashboard. Every panel is a
// stacked chart with one series per instance, and each legend entry links
// to the per-node USE dashboard.
'node-cluster-rsrc-use.json':
  local legendLink = '%s/dashboard/file/k8s-node-rsrc-use.json' % $._config.grafana_prefix;

  // Axis presets shared by the panels below.
  local ratioAxes = { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) };
  local rateAxes = { yaxes: g.yaxes('Bps') };

  // One stacked per-instance panel: title + query + stacking + axes.
  local stackedPanel(title, query, axes) =
    g.panel(title) +
    g.queryPanel(query, '{{instance}}', legendLink) +
    g.stack +
    axes;

  g.dashboard('USE Method / Cluster')
  .addRow(
    g.row('CPU')
    .addPanel(stackedPanel(
      'CPU Utilisation',
      'instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))',
      ratioAxes,
    ))
    .addPanel(stackedPanel(
      'CPU Saturation (Load1)',
      |||
        instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s}))
      ||| % $._config,
      ratioAxes,
    ))
  )
  .addRow(
    g.row('Memory')
    .addPanel(stackedPanel(
      'Memory Utilisation',
      'instance:node_memory_utilisation:ratio',
      ratioAxes,
    ))
    .addPanel(stackedPanel(
      'Memory Saturation (Swap I/O)',
      'instance:node_memory_swap_io_bytes:sum_rate',
      rateAxes,
    ))
  )
  .addRow(
    g.row('Disk')
    .addPanel(stackedPanel(
      'Disk IO Utilisation',
      // Full utilisation would be all disks on each node spending an average
      // of 1 sec per second doing I/O; dividing by the node count normalises
      // the series for the stacked chart.
      |||
        instance:node_disk_utilisation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s}))
      ||| % $._config,
      ratioAxes,
    ))
    .addPanel(stackedPanel(
      'Disk IO Saturation',
      |||
        instance:node_disk_saturation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s}))
      ||| % $._config,
      ratioAxes,
    ))
  )
  .addRow(
    g.row('Network')
    .addPanel(stackedPanel(
      'Net Utilisation (Transmitted)',
      'instance:node_net_utilisation:sum_irate',
      rateAxes,
    ))
    .addPanel(stackedPanel(
      'Net Saturation (Dropped)',
      'instance:node_net_saturation:sum_irate',
      rateAxes,
    ))
  )
  .addRow(
    g.row('Storage')
    .addPanel(stackedPanel(
      'Disk Capacity',
      'sum(max(node_filesystem_size{fstype=~"ext[24]"} - node_filesystem_free{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size{fstype=~"ext[24]"}) by (device,instance,namespace)))',
      ratioAxes,
    )),
  ),
|
||||
|
||||
// Per-node USE (utilisation / saturation) dashboard. The `instance` template
// variable is populated from `up{<nodeExporterSelector>}` and every panel is
// restricted to instance="$instance".
'k8s-node-rsrc-use.json':
  g.dashboard('K8s / USE Method / Node')
  .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance')
  .addRow(
    g.row('CPU')
    .addPanel(
      g.panel('CPU Utilisation') +
      g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') +
      { yaxes: g.yaxes('percentunit') },
    )
    .addPanel(
      g.panel('CPU Saturation (Load1)') +
      g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') +
      { yaxes: g.yaxes('percentunit') },
    )
  )
  .addRow(
    g.row('Memory')
    .addPanel(
      g.panel('Memory Utilisation') +
      g.queryPanel('instance:node_memory_utilisation:{instance="$instance"}', 'Memory') +
      { yaxes: g.yaxes('percentunit') },
    )
    .addPanel(
      g.panel('Memory Saturation (Swap I/O)') +
      g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') +
      { yaxes: g.yaxes('Bps') },
    )
  )
  .addRow(
    g.row('Disk')
    .addPanel(
      g.panel('Disk IO Utilisation') +
      g.queryPanel('instance:node_disk_utilisation:avg_irate{instance="$instance"}', 'Utilisation') +
      { yaxes: g.yaxes('percentunit') },
    )
    .addPanel(
      g.panel('Disk IO Saturation') +
      g.queryPanel('instance:node_disk_saturation:avg_irate{instance="$instance"}', 'Saturation') +
      { yaxes: g.yaxes('percentunit') },
    )
  )
  .addRow(
    g.row('Net')
    .addPanel(
      g.panel('Net Utilisation (Transmitted)') +
      g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') +
      { yaxes: g.yaxes('Bps') },
    )
    .addPanel(
      g.panel('Net Saturation (Dropped)') +
      g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') +
      { yaxes: g.yaxes('Bps') },
    )
  )
  .addRow(
    g.row('Disk')
    .addPanel(
      g.panel('Disk Utilisation') +
      // Both selectors are now limited to the selected instance; without
      // that matcher the panel aggregated the filesystems of every node,
      // unlike all other panels on this per-node dashboard.
      g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free{fstype=~"ext[24]", instance="$instance"})) / sum(max by (device, node) (node_filesystem_size{fstype=~"ext[24]", instance="$instance"}))', 'Disk') +
      { yaxes: g.yaxes('percentunit') },
    ),
  ),
|
||||
},
|
||||
}
|
24
node-mixin/jsonnetfile.json
Normal file
24
node-mixin/jsonnetfile.json
Normal file
|
@ -0,0 +1,24 @@
|
|||
{
|
||||
"dependencies": [
|
||||
{
|
||||
"name": "grafonnet",
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/grafonnet-lib",
|
||||
"subdir": "grafonnet"
|
||||
}
|
||||
},
|
||||
"version": "master"
|
||||
},
|
||||
{
|
||||
"name": "grafana-builder",
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/kausalco/public",
|
||||
"subdir": "grafana-builder"
|
||||
}
|
||||
},
|
||||
"version": "master"
|
||||
}
|
||||
]
|
||||
}
|
60
node-mixin/lib/promgrafonnet/gauge.libsonnet
Normal file
60
node-mixin/lib/promgrafonnet/gauge.libsonnet
Normal file
|
@ -0,0 +1,60 @@
|
|||
local grafana = import 'grafonnet/grafana.libsonnet';
|
||||
local singlestat = grafana.singlestat;
|
||||
local prometheus = grafana.prometheus;
|
||||
|
||||
{
|
||||
// Build a 0-100 percent gauge (a singlestat panel with the gauge enabled)
// that displays the current value of `query`.
//
//   title       panel title
//   query       Prometheus expression expected to yield a percentage
//   datasource  Grafana datasource for the panel. Defaults to the
//               `$datasource` template variable that the dashboards using
//               this helper define; the panel used to be pinned to a
//               datasource literally named 'prometheus' and so ignored the
//               dashboard's datasource picker. Pass a name explicitly for
//               dashboards that have no such variable.
//
// The returned panel also carries the builder methods withTextNullValue,
// withSpanSize and withLowerBeingBetter, each returning a modified copy.
new(title, query, datasource='$datasource')::
  singlestat.new(
    title,
    datasource=datasource,
    span=3,
    format='percent',
    valueName='current',
    // Default colour order is red, orange, green with thresholds at 50 and
    // 80 (higher is better); see withLowerBeingBetter for the inverse.
    colors=[
      'rgba(245, 54, 54, 0.9)',
      'rgba(237, 129, 40, 0.89)',
      'rgba(50, 172, 45, 0.97)',
    ],
    thresholds='50, 80',
    // Show 'N/A' when the query yields a null value.
    valueMaps=[
      {
        op: '=',
        text: 'N/A',
        value: 'null',
      },
    ],
  )
  .addTarget(
    prometheus.target(
      query
    )
  ) + {
    gauge: {
      maxValue: 100,
      minValue: 0,
      show: true,
      thresholdLabels: false,
      thresholdMarkers: true,
    },
    // Replace the text displayed for a null value.
    withTextNullValue(text):: self {
      valueMaps: [
        {
          op: '=',
          text: text,
          value: 'null',
        },
      ],
    },
    // Override the panel's span (3 by default).
    withSpanSize(size):: self {
      span: size,
    },
    // Invert the colour order (green, orange, red) and move the thresholds
    // to 80 and 90, for metrics where a low value is the healthy state.
    withLowerBeingBetter():: self {
      colors: [
        'rgba(50, 172, 45, 0.97)',
        'rgba(237, 129, 40, 0.89)',
        'rgba(245, 54, 54, 0.9)',
      ],
      thresholds: '80, 90',
    },
  },
|
||||
}
|
48
node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet
Normal file
48
node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet
Normal file
|
@ -0,0 +1,48 @@
|
|||
local grafana = import 'grafonnet/grafana.libsonnet';
|
||||
local singlestat = grafana.singlestat;
|
||||
local prometheus = grafana.prometheus;
|
||||
|
||||
{
|
||||
// Build a singlestat panel that displays the current value of `query` as a
// plain number, showing '0' when the query yields a null value.
//
//   title       panel title
//   query       Prometheus expression to display
//   datasource  Grafana datasource for the panel. Defaults to the
//               `$datasource` template variable instead of a datasource
//               literally named 'prometheus', so the panel follows the
//               dashboard's datasource picker. Pass a name explicitly for
//               dashboards that have no such variable.
//
// The returned panel also carries the builder methods withTextNullValue,
// withSpanSize, withPostfix and withSparkline, each returning a modified copy.
new(title, query, datasource='$datasource')::
  singlestat.new(
    title,
    datasource=datasource,
    span=3,
    valueName='current',
    valueMaps=[
      {
        op: '=',
        text: '0',
        value: 'null',
      },
    ],
  )
  .addTarget(
    prometheus.target(
      query
    )
  ) + {
    // Replace the text displayed for a null value.
    withTextNullValue(text):: self {
      valueMaps: [
        {
          op: '=',
          text: text,
          value: 'null',
        },
      ],
    },
    // Override the panel's span (3 by default).
    withSpanSize(size):: self {
      span: size,
    },
    // Set the panel's `postfix` field.
    withPostfix(postfix):: self {
      postfix: postfix,
    },
    // Enable the sparkline with fixed blue line and fill colours.
    withSparkline():: self {
      sparkline: {
        show: true,
        lineColor: 'rgb(31, 120, 193)',
        fillColor: 'rgba(31, 118, 189, 0.18)',
      },
    },
  },
|
||||
}
|
5
node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet
Normal file
5
node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet
Normal file
|
@ -0,0 +1,5 @@
|
|||
// Entry point of the promgrafonnet helpers: exposes each panel builder as a
// hidden (`::`) field so importers can write e.g. `promgrafonnet.gauge`.
{
  numbersinglestat:: import 'numbersinglestat.libsonnet',
  gauge:: import 'gauge.libsonnet',
  // NOTE(review): percentlinegraph.libsonnet is not among the files visible
  // alongside this one — confirm it exists, otherwise accessing this field
  // will fail.
  percentlinegraph:: import 'percentlinegraph.libsonnet',
}
|
4
node-mixin/mixin.libsonnet
Normal file
4
node-mixin/mixin.libsonnet
Normal file
|
@ -0,0 +1,4 @@
|
|||
(import 'config.libsonnet') +
|
||||
(import 'alerts/alerts.libsonnet') +
|
||||
(import 'dashboards/dashboards.libsonnet') +
|
||||
(import 'rules/rules.libsonnet')
|
121
node-mixin/rules/rules.libsonnet
Normal file
121
node-mixin/rules/rules.libsonnet
Normal file
|
@ -0,0 +1,121 @@
|
|||
{
|
||||
prometheusRules+:: {
|
||||
groups+: [
|
||||
{
|
||||
name: 'node.rules',
|
||||
rules: [
|
||||
// Number of CPUs per instance: collapse node_cpu to one series per
// (instance, cpu) pair, then count those series per instance.
{
  record: 'instance:node_num_cpu:sum',
  expr: std.format(|||
    count by (instance) (
    sum by (instance, cpu) (
    node_cpu{%(nodeExporterSelector)s}
    )
    )
  |||, $._config),
},
|
||||
// CPU utilisation per instance: one minus the average 1m rate of the
// mode="idle" node_cpu series, i.e. the share of time CPUs were not idle.
{
  record: 'instance:node_cpu_utilisation:avg1m',
  expr: std.format(|||
    1 - avg by (instance) (
    rate(node_cpu{%(nodeExporterSelector)s,mode="idle"}[1m])
    )
  |||, $._config),
},
|
||||
// CPU saturation per instance: the 1-minute load average divided by the
// instance's CPU count (instance:node_num_cpu:sum). The result can exceed 1;
// values above 1 (100%) are bad.
{
  record: 'instance:node_cpu_saturation_load1:',
  expr: std.format(|||
    sum by (instance) (
    node_load1{%(nodeExporterSelector)s}
    )
    /
    instance:node_num_cpu:sum
  |||, $._config),
},
|
||||
// Available memory per instance, computed as MemFree + Cached + Buffers.
{
  record: 'instance:node_memory_bytes_available:sum',
  expr: std.format(|||
    sum by (instance) (
    (node_memory_MemFree{%(nodeExporterSelector)s} + node_memory_Cached{%(nodeExporterSelector)s} + node_memory_Buffers{%(nodeExporterSelector)s})
    )
  |||, $._config),
},
|
||||
// Total memory per instance, taken from node_memory_MemTotal.
{
  record: 'instance:node_memory_bytes_total:sum',
  expr: std.format(|||
    sum by (instance) (
    node_memory_MemTotal{%(nodeExporterSelector)s}
    )
  |||, $._config),
},
|
||||
{
  // Memory in use per instance (total minus available) divided by the total
  // memory summed over ALL instances, i.e. each instance's share of the
  // combined memory. This is not a per-node ratio; the per-node ratio is
  // computed from the same two series as 1 - available / total.
  record: 'instance:node_memory_utilisation:ratio',
  expr: |||
    (instance:node_memory_bytes_total:sum - instance:node_memory_bytes_available:sum)
    /
    scalar(sum(instance:node_memory_bytes_total:sum))
  |||,
},
|
||||
// Memory utilisation of each instance relative to its own total memory:
// one minus available / total. The query has no placeholders, so formatting
// it with $._config leaves it unchanged.
{
  record: 'instance:node_memory_utilisation:',
  expr: std.format(|||
    1 - (instance:node_memory_bytes_available:sum / instance:node_memory_bytes_total:sum)
  |||, $._config),
},
|
||||
// Per-instance sum of the 1m rates of node_vmstat_pgpgin and
// node_vmstat_pgpgout, scaled by 1e3.
// NOTE(review): the record name says "swap io bytes"; confirm that the
// pgpgin/pgpgout counters and the 1e3 factor give the intended unit.
{
  record: 'instance:node_memory_swap_io_bytes:sum_rate',
  expr: std.format(|||
    1e3 * sum by (instance) (
    (rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
    + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m]))
    )
  |||, $._config),
},
|
||||
// Disk utilisation per instance: the instantaneous rate of
// node_disk_io_time_ms divided by 1e3 (ms to seconds), averaged over the
// devices whose name matches (sd|xvd).+ .
{
  record: 'instance:node_disk_utilisation:avg_irate',
  expr: std.format(|||
    avg by (instance) (
    irate(node_disk_io_time_ms{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3
    )
  |||, $._config),
},
|
||||
// Disk saturation per instance: the instantaneous rate of
// node_disk_io_time_weighted divided by 1e3, averaged over the devices whose
// name matches (sd|xvd).+ .
{
  record: 'instance:node_disk_saturation:avg_irate',
  expr: std.format(|||
    avg by (instance) (
    irate(node_disk_io_time_weighted{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3
    )
  |||, $._config),
},
|
||||
// Network utilisation per instance: instantaneous receive plus transmit
// byte rate. Only the device named "eth0" is considered.
{
  record: 'instance:node_net_utilisation:sum_irate',
  expr: std.format(|||
    sum by (instance) (
    (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]) +
    irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]))
    )
  |||, $._config),
},
|
||||
// Network saturation per instance: instantaneous rate of dropped packets,
// receive plus transmit. Only the device named "eth0" is considered.
{
  record: 'instance:node_net_saturation:sum_irate',
  expr: std.format(|||
    sum by (instance) (
    (irate(node_network_receive_drop{%(nodeExporterSelector)s,device="eth0"}[1m]) +
    irate(node_network_transmit_drop{%(nodeExporterSelector)s,device="eth0"}[1m]))
    )
  |||, $._config),
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
}
|
Loading…
Reference in a new issue