diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet new file mode 100644 index 00000000..198e22fd --- /dev/null +++ b/node-mixin/alerts/alerts.libsonnet @@ -0,0 +1,165 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'node', + rules: [ + { + alert: 'NodeFilesystemSpaceFillingUp', + expr: ||| + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + AND + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 24 hours.', + }, + }, + { + alert: 'NodeFilesystemSpaceFillingUp', + expr: ||| + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + AND + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 4 hours.', + }, + }, + { + alert: 'NodeFilesystemOutOfSpace', + expr: ||| + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + }, + }, + { + alert: 'NodeFilesystemOutOfSpace', + expr: ||| + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + }, + }, + { + alert: 'NodeFilesystemFilesFillingUp', + expr: ||| + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + AND + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 24 hours.', + }, + }, + { + alert: 'NodeFilesystemFilesFillingUp', + expr: ||| + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + AND + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: 
{ + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 4 hours.', + }, + }, + { + alert: 'NodeFilesystemOutOfFiles', + expr: ||| + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.', + }, + }, + { + alert: 'NodeFilesystemOutOfFiles', + expr: ||| + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.', + }, + }, + { + alert: 'NodeNetworkReceiveErrs', + expr: ||| + increase(node_network_receive_errs{%(nodeExporterSelector)s}[2m]) > 10 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).', + }, + }, + { + alert: 'NodeNetworkTransmitErrs', + expr: ||| + increase(node_network_transmit_errs{%(nodeExporterSelector)s}[2m]) > 10 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).', + }, + }, + ], + }, + ], + }, +} diff --git a/node-mixin/config.libsonnet b/node-mixin/config.libsonnet new file mode 100644 index 00000000..6c5d6f74 --- /dev/null +++ b/node-mixin/config.libsonnet @@ -0,0 +1,11 @@ +{ + _config+:: { + // Selectors are inserted between {} in Prometheus queries. + nodeExporterSelector: 'job="node-exporter"', + + // Mainly extracted because they are repetitive, but also useful to customize.
+ fsSelectors: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"', + + grafana_prefix: '', + }, +} diff --git a/node-mixin/dashboards/dashboards.libsonnet b/node-mixin/dashboards/dashboards.libsonnet new file mode 100644 index 00000000..e6adbd4f --- /dev/null +++ b/node-mixin/dashboards/dashboards.libsonnet @@ -0,0 +1,2 @@ +(import 'node.libsonnet') + +(import 'use.libsonnet') diff --git a/node-mixin/dashboards/node.libsonnet b/node-mixin/dashboards/node.libsonnet new file mode 100644 index 00000000..471c5b37 --- /dev/null +++ b/node-mixin/dashboards/node.libsonnet @@ -0,0 +1,176 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; +local promgrafonnet = import '../lib/promgrafonnet/promgrafonnet.libsonnet'; +local gauge = promgrafonnet.gauge; + +{ + grafanaDashboards+:: { + 'nodes.json': + local idleCPU = + graphPanel.new( + 'Idle CPU', + datasource='$datasource', + span=6, + format='percent', + max=100, + min=0, + ) + .addTarget(prometheus.target( + ||| + 100 - (avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[5m])) * 100) + ||| % $._config, + legendFormat='{{cpu}}', + intervalFactor=10, + )); + + local systemLoad = + graphPanel.new( + 'System load', + datasource='$datasource', + span=6, + format='percent', + ) + .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 1m')) + .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 5m')) + .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 15m')); + + local memoryGraph = + graphPanel.new( + 'Memory Usage', + datasource='$datasource', + span=9, + format='bytes', + ) + .addTarget(prometheus.target( + ||| + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} + ||| % $._config, legendFormat='memory used' + )) + .addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) + .addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) + .addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); + + local memoryGauge = gauge.new( + 'Memory Usage', + ||| + ( + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} + ) * 100 + / + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + ||| % $._config, + ).withLowerBeingBetter(); + + local diskIO = + graphPanel.new( + 'Disk I/O', + datasource='$datasource', + span=9, + ) + .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_read{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='read')) + 
.addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_written{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='written')) + .addTarget(prometheus.target('sum by (instance) (rate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='io time')) + + { + seriesOverrides: [ + { + alias: 'read', + yaxis: 1, + }, + { + alias: 'io time', + yaxis: 2, + }, + ], + yaxes: [ + self.yaxe(format='bytes'), + self.yaxe(format='ms'), + ], + }; + + local diskSpaceUsage = gauge.new( + 'Disk Space Usage', + ||| + ( + sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + - sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + ) * 100 + / + sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + ||| % $._config, + ).withLowerBeingBetter(); + + local networkReceived = + graphPanel.new( + 'Network Received', + datasource='$datasource', + span=6, + format='bytes', + ) + .addTarget(prometheus.target('rate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}')); + + local networkTransmitted = + graphPanel.new( + 'Network Transmitted', + datasource='$datasource', + span=6, + format='bytes', + ) + .addTarget(prometheus.target('rate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}')); + + dashboard.new('Nodes', time_from='now-1h') + .addTemplate( + { + current: { + text: 'Prometheus', + value: 'Prometheus', + }, + hide: 0, + label: null, + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', + }, + ) + .addTemplate( + template.new( + 'instance', + '$datasource', + 'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config, + refresh='time', + ) + ) + .addRow( + row.new() + .addPanel(idleCPU) + .addPanel(systemLoad) + ) + .addRow( + row.new() + .addPanel(memoryGraph) + .addPanel(memoryGauge) + ) + .addRow( + row.new() + .addPanel(diskIO) + .addPanel(diskSpaceUsage) + ) + .addRow( + row.new() + .addPanel(networkReceived) + .addPanel(networkTransmitted) + ), + }, +} diff --git a/node-mixin/dashboards/use.libsonnet b/node-mixin/dashboards/use.libsonnet new file mode 100644 index 00000000..526002f6 --- /dev/null +++ b/node-mixin/dashboards/use.libsonnet @@ -0,0 +1,151 @@ +local g = import 'grafana-builder/grafana.libsonnet'; + +{ + grafanaDashboards+:: { + 'node-cluster-rsrc-use.json': + local legendLink = '%s/dashboard/file/k8s-node-rsrc-use.json' % $._config.grafana_prefix; + + g.dashboard('USE Method / Cluster') + .addRow( + g.row('CPU') + .addPanel( + g.panel('CPU Utilisation') + + g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('CPU Saturation (Load1)') + + g.queryPanel(||| + instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s})) + ||| % $._config, '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + ) + .addRow( + g.row('Memory') + .addPanel( + g.panel('Memory Utilisation') + + g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 
'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('Memory Saturation (Swap I/O)') + + g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Disk') + .addPanel( + g.panel('Disk IO Utilisation') + + // Full utilisation would be all disks on each node spending an average of + // 1 sec per second doing I/O, normalize by node count for stacked charts + g.queryPanel(||| + instance:node_disk_utilisation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s})) + ||| % $._config, '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('Disk IO Saturation') + + g.queryPanel(||| + instance:node_disk_saturation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s})) + ||| % $._config, '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Net Utilisation (Transmitted)') + + g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes('Bps') }, + ) + .addPanel( + g.panel('Net Saturation (Dropped)') + + g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Storage') + .addPanel( + g.panel('Disk Capacity') + + g.queryPanel('sum(max(node_filesystem_size{fstype=~"ext[24]"} - node_filesystem_free{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ), + ), + + 'k8s-node-rsrc-use.json': + g.dashboard('K8s / USE Method / Node') + .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance') + .addRow( + g.row('CPU') + .addPanel( + g.panel('CPU Utilisation') + + g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('CPU Saturation (Load1)') + + g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') + + { yaxes: g.yaxes('percentunit') }, + ) + ) + .addRow( + g.row('Memory') + .addPanel( + g.panel('Memory Utilisation') + + g.queryPanel('instance:node_memory_utilisation:{instance="$instance"}', 'Memory') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('Memory Saturation (Swap I/O)') + + g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Disk') + .addPanel( + g.panel('Disk IO Utilisation') + + g.queryPanel('instance:node_disk_utilisation:avg_irate{instance="$instance"}', 'Utilisation') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('Disk IO Saturation') + + g.queryPanel('instance:node_disk_saturation:avg_irate{instance="$instance"}', 'Saturation') + + { yaxes: g.yaxes('percentunit') }, + ) + ) + .addRow( + g.row('Net') + .addPanel( + g.panel('Net Utilisation (Transmitted)') + + g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + + { yaxes: g.yaxes('Bps') }, + ) + .addPanel( + g.panel('Net Saturation (Dropped)') + + g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') + + { yaxes: g.yaxes('Bps') }, + ) + ) + 
.addRow( + g.row('Disk') + .addPanel( + g.panel('Disk Utilisation') + + g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size{fstype=~"ext[24]"}))', 'Disk') + + { yaxes: g.yaxes('percentunit') }, + ), + ), + }, +} diff --git a/node-mixin/jsonnetfile.json b/node-mixin/jsonnetfile.json new file mode 100644 index 00000000..45326aad --- /dev/null +++ b/node-mixin/jsonnetfile.json @@ -0,0 +1,24 @@ +{ + "dependencies": [ + { + "name": "grafonnet", + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib", + "subdir": "grafonnet" + } + }, + "version": "master" + }, + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/kausalco/public", + "subdir": "grafana-builder" + } + }, + "version": "master" + } + ] +} diff --git a/node-mixin/lib/promgrafonnet/gauge.libsonnet b/node-mixin/lib/promgrafonnet/gauge.libsonnet new file mode 100644 index 00000000..ea6c1ab6 --- /dev/null +++ b/node-mixin/lib/promgrafonnet/gauge.libsonnet @@ -0,0 +1,60 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local singlestat = grafana.singlestat; +local prometheus = grafana.prometheus; + +{ + new(title, query):: + singlestat.new( + title, + datasource='prometheus', + span=3, + format='percent', + valueName='current', + colors=[ + 'rgba(245, 54, 54, 0.9)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(50, 172, 45, 0.97)', + ], + thresholds='50, 80', + valueMaps=[ + { + op: '=', + text: 'N/A', + value: 'null', + }, + ], + ) + .addTarget( + prometheus.target( + query + ) + ) + { + gauge: { + maxValue: 100, + minValue: 0, + show: true, + thresholdLabels: false, + thresholdMarkers: true, + }, + withTextNullValue(text):: self { + valueMaps: [ + { + op: '=', + text: text, + value: 'null', + }, + ], + }, + withSpanSize(size):: self { + span: size, + }, + withLowerBeingBetter():: self { + colors: [ + 'rgba(50, 172, 45, 0.97)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(245, 54, 54, 0.9)', + ], + thresholds: '80, 90', + }, + }, +} diff --git a/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet b/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet new file mode 100644 index 00000000..bc1d6f6f --- /dev/null +++ b/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet @@ -0,0 +1,48 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local singlestat = grafana.singlestat; +local prometheus = grafana.prometheus; + +{ + new(title, query):: + singlestat.new( + title, + datasource='prometheus', + span=3, + valueName='current', + valueMaps=[ + { + op: '=', + text: '0', + value: 'null', + }, + ], + ) + .addTarget( + prometheus.target( + query + ) + ) + { + withTextNullValue(text):: self { + valueMaps: [ + { + op: '=', + text: text, + value: 'null', + }, + ], + }, + withSpanSize(size):: self { + span: size, + }, + withPostfix(postfix):: self { + postfix: postfix, + }, + withSparkline():: self { + sparkline: { + show: true, + lineColor: 'rgb(31, 120, 193)', + fillColor: 'rgba(31, 118, 189, 0.18)', + }, + }, + }, +} diff --git a/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet b/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet new file mode 100644 index 00000000..013ff42b --- /dev/null +++ b/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet @@ -0,0 +1,5 @@ +{ + numbersinglestat:: import 'numbersinglestat.libsonnet', + gauge:: import 'gauge.libsonnet', + percentlinegraph:: import 'percentlinegraph.libsonnet', +} diff --git a/node-mixin/mixin.libsonnet b/node-mixin/mixin.libsonnet new 
file mode 100644 index 00000000..b9831f93 --- /dev/null +++ b/node-mixin/mixin.libsonnet @@ -0,0 +1,4 @@ +(import 'config.libsonnet') + +(import 'alerts/alerts.libsonnet') + +(import 'dashboards/dashboards.libsonnet') + +(import 'rules/rules.libsonnet') diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet new file mode 100644 index 00000000..ad1cc09b --- /dev/null +++ b/node-mixin/rules/rules.libsonnet @@ -0,0 +1,121 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'node.rules', + rules: [ + { + // This rule gives the number of CPUs per node. + record: 'instance:node_num_cpu:sum', + expr: ||| + count by (instance) ( + sum by (instance, cpu) ( + node_cpu{%(nodeExporterSelector)s} + ) + ) + ||| % $._config, + }, + { + // CPU utilisation is % CPU is not idle. + record: 'instance:node_cpu_utilisation:avg1m', + expr: ||| + 1 - avg by (instance) ( + rate(node_cpu{%(nodeExporterSelector)s,mode="idle"}[1m]) + ) + ||| % $._config, + }, + { + // CPU saturation is 1min avg run queue length / number of CPUs. + // Can go over 100%. >100% is bad. + record: 'instance:node_cpu_saturation_load1:', + expr: ||| + sum by (instance) ( + node_load1{%(nodeExporterSelector)s} + ) + / + instance:node_num_cpu:sum + ||| % $._config, + }, + { + // Available memory per node + record: 'instance:node_memory_bytes_available:sum', + expr: ||| + sum by (instance) ( + (node_memory_MemFree{%(nodeExporterSelector)s} + node_memory_Cached{%(nodeExporterSelector)s} + node_memory_Buffers{%(nodeExporterSelector)s}) + ) + ||| % $._config, + }, + { + // Total memory per node + record: 'instance:node_memory_bytes_total:sum', + expr: ||| + sum by (instance) ( + node_memory_MemTotal{%(nodeExporterSelector)s} + ) + ||| % $._config, + }, + { + // Memory utilisation per node, normalized by per-node memory + record: 'instance:node_memory_utilisation:ratio', + expr: ||| + (instance:node_memory_bytes_total:sum - instance:node_memory_bytes_available:sum) + / + scalar(sum(instance:node_memory_bytes_total:sum)) + |||, + }, + { + record: 'instance:node_memory_utilisation:', + expr: ||| + 1 - (instance:node_memory_bytes_available:sum / instance:node_memory_bytes_total:sum) + ||| % $._config, + }, + { + record: 'instance:node_memory_swap_io_bytes:sum_rate', + expr: ||| + 1e3 * sum by (instance) ( + (rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) + + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m])) + ) + ||| % $._config, + }, + { + // Disk utilisation (ms spent, by rate() it's bound by 1 second) + record: 'instance:node_disk_utilisation:avg_irate', + expr: ||| + avg by (instance) ( + irate(node_disk_io_time_ms{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 + ) + ||| % $._config, + }, + { + // Disk saturation (ms spent, by rate() it's bound by 1 second) + record: 'instance:node_disk_saturation:avg_irate', + expr: ||| + avg by (instance) ( + irate(node_disk_io_time_weighted{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 + ) + ||| % $._config, + }, + { + record: 'instance:node_net_utilisation:sum_irate', + expr: ||| + sum by (instance) ( + (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]) + + irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device="eth0"}[1m])) + ) + ||| % $._config, + }, + { + record: 'instance:node_net_saturation:sum_irate', + expr: ||| + sum by (instance) ( + (irate(node_network_receive_drop{%(nodeExporterSelector)s,device="eth0"}[1m]) + + irate(node_network_transmit_drop{%(nodeExporterSelector)s,device="eth0"}[1m])) + ) + 
||| % $._config, + }, + ], + }, + ], + }, +}
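A minimal consumption sketch for the mixin added above, assuming the dependencies listed in jsonnetfile.json have been vendored with jsonnet-bundler (jb install) and the file is evaluated from the node-mixin directory with jsonnet -J vendor -m out example.jsonnet. The file name example.jsonnet, the output file names, and the job="node" override are hypothetical, not part of the change itself.

// example.jsonnet -- hypothetical consumer of mixin.libsonnet.
// The hidden _config object can be extended to retarget every query.
local mixin = (import 'mixin.libsonnet') + {
  _config+:: {
    nodeExporterSelector: 'job="node"',  // assumption: the node_exporter scrape job label in your setup
  },
};

{
  // Alerting and recording rule groups, manifested as JSON files by `jsonnet -m`.
  'node-alerts.json': mixin.prometheusAlerts,
  'node-rules.json': mixin.prometheusRules,
}
// Grafana dashboards, keyed by file name ('nodes.json', 'node-cluster-rsrc-use.json', ...).
+ mixin.grafanaDashboards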