From bafe1707f13f9da58c7a88b42f15ab596f649ba9 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 8 May 2018 12:10:29 +0200 Subject: [PATCH 01/13] Beginnings of a node-exporter monitoring mixin. Signed-off-by: Tom Wilkie --- node-mixin/alerts/alerts.libsonnet | 165 ++++++++++++++++ node-mixin/config.libsonnet | 11 ++ node-mixin/dashboards/dashboards.libsonnet | 2 + node-mixin/dashboards/node.libsonnet | 176 ++++++++++++++++++ node-mixin/dashboards/use.libsonnet | 151 +++++++++++++++ node-mixin/jsonnetfile.json | 24 +++ node-mixin/lib/promgrafonnet/gauge.libsonnet | 60 ++++++ .../promgrafonnet/numbersinglestat.libsonnet | 48 +++++ .../lib/promgrafonnet/promgrafonnet.libsonnet | 5 + node-mixin/mixin.libsonnet | 4 + node-mixin/rules/rules.libsonnet | 121 ++++++++++++ 11 files changed, 767 insertions(+) create mode 100644 node-mixin/alerts/alerts.libsonnet create mode 100644 node-mixin/config.libsonnet create mode 100644 node-mixin/dashboards/dashboards.libsonnet create mode 100644 node-mixin/dashboards/node.libsonnet create mode 100644 node-mixin/dashboards/use.libsonnet create mode 100644 node-mixin/jsonnetfile.json create mode 100644 node-mixin/lib/promgrafonnet/gauge.libsonnet create mode 100644 node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet create mode 100644 node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet create mode 100644 node-mixin/mixin.libsonnet create mode 100644 node-mixin/rules/rules.libsonnet diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet new file mode 100644 index 00000000..198e22fd --- /dev/null +++ b/node-mixin/alerts/alerts.libsonnet @@ -0,0 +1,165 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'node', + rules: [ + { + alert: 'NodeFilesystemSpaceFillingUp', + expr: ||| + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + AND + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 24 hours.', + }, + }, + { + alert: 'NodeFilesystemSpaceFillingUp', + expr: ||| + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + AND + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of space within the next 4 hours.', + }, + }, + { + alert: 'NodeFilesystemOutOfSpace', + expr: ||| + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + }, + }, + { + alert: 'NodeFilesystemOutOfSpace', + expr: ||| + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + }, + }, + { + alert: 'NodeFilesystemFilesFillingUp', + expr: ||| + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + AND + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 24 hours.', + }, + }, + { + alert: 'NodeFilesystemFilesFillingUp', + expr: ||| + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + AND + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 4 hours.', + }, + }, + { + alert: 'NodeFilesystemOutOfFiles', + expr: ||| + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available inodes left.', + }, + }, + { + alert: 'NodeFilesystemOutOfSpace', + expr: ||| + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + AND + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ $value }}% available space left.', + }, + }, + { + alert: 'NodeNetworkReceiveErrs', + expr: ||| + increase(node_network_receive_errs[2m]) > 10 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while receiving packets ({{ $value }} errors in two minutes).', + }, + }, + { + alert: 'NodeNetworkTransmitErrs', + expr: ||| + increase(node_network_transmit_errs[2m]) > 10 + ||| % $._config, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).', + }, + }, + ], + }, + ], + }, +} diff --git a/node-mixin/config.libsonnet b/node-mixin/config.libsonnet new file mode 100644 index 00000000..6c5d6f74 --- /dev/null +++ b/node-mixin/config.libsonnet @@ -0,0 +1,11 @@ +{ + _config+:: { + // Selectors are inserted between {} in Prometheus queries. + nodeExporterSelector: 'job="node-exporter"', + + // Mainly extracted because they are repetitive, but also useful to customize. + fsSelectors: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"', + + grafana_prefix: '', + }, +} diff --git a/node-mixin/dashboards/dashboards.libsonnet b/node-mixin/dashboards/dashboards.libsonnet new file mode 100644 index 00000000..e6adbd4f --- /dev/null +++ b/node-mixin/dashboards/dashboards.libsonnet @@ -0,0 +1,2 @@ +(import 'node.libsonnet') + +(import 'use.libsonnet') diff --git a/node-mixin/dashboards/node.libsonnet b/node-mixin/dashboards/node.libsonnet new file mode 100644 index 00000000..471c5b37 --- /dev/null +++ b/node-mixin/dashboards/node.libsonnet @@ -0,0 +1,176 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; +local promgrafonnet = import '../lib/promgrafonnet/promgrafonnet.libsonnet'; +local gauge = promgrafonnet.gauge; + +{ + grafanaDashboards+:: { + 'nodes.json': + local idleCPU = + graphPanel.new( + 'Idle CPU', + datasource='$datasource', + span=6, + format='percent', + max=100, + min=0, + ) + .addTarget(prometheus.target( + ||| + 100 - (avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[5m])) * 100) + ||| % $._config, + legendFormat='{{cpu}}', + intervalFactor=10, + )); + + local systemLoad = + graphPanel.new( + 'System load', + datasource='$datasource', + span=6, + format='percent', + ) + .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 1m')) + .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 5m')) + .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 15m')); + + local memoryGraph = + graphPanel.new( + 'Memory Usage', + datasource='$datasource', + span=9, + format='bytes', + ) + .addTarget(prometheus.target( + ||| + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} + ||| % $._config, legendFormat='memory used' + )) + .addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers')) + .addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached')) + .addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free')); + + local memoryGauge = gauge.new( + 'Memory Usage', + ||| + ( + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} + ) * 100 + / + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + ||| % $._config, + ).withLowerBeingBetter(); + + local diskIO = + graphPanel.new( + 'Disk I/O', + datasource='$datasource', + span=9, + ) + .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_read{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='read')) + .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_written{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='written')) + .addTarget(prometheus.target('sum by (instance) (rate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='io time')) + + { + seriesOverrides: [ + { + alias: 'read', + yaxis: 1, + }, + { + alias: 'io time', + yaxis: 2, + }, + ], + yaxes: [ + self.yaxe(format='bytes'), + self.yaxe(format='ms'), + ], + }; + + local diskSpaceUsage = gauge.new( + 'Disk Space Usage', + ||| + ( + sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + - sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + ) * 100 + / + sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + ||| % $._config, + ).withLowerBeingBetter(); + + local networkReceived = + graphPanel.new( + 'Network Received', + datasource='$datasource', + span=6, + format='bytes', + ) + .addTarget(prometheus.target('rate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}')); + + local networkTransmitted = + graphPanel.new( + 'Network Transmitted', + datasource='$datasource', + span=6, + format='bytes', + ) + .addTarget(prometheus.target('rate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}')); + + dashboard.new('Nodes', time_from='now-1h') + .addTemplate( + { + current: { + text: 'Prometheus', + value: 'Prometheus', + }, + hide: 0, + label: null, + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', + }, + ) + .addTemplate( + template.new( + 'instance', + '$datasource', + 'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config, + refresh='time', + ) + ) + .addRow( + row.new() + .addPanel(idleCPU) + .addPanel(systemLoad) + ) + .addRow( + row.new() + .addPanel(memoryGraph) + .addPanel(memoryGauge) + ) + .addRow( + row.new() + .addPanel(diskIO) + .addPanel(diskSpaceUsage) + ) + .addRow( + row.new() + .addPanel(networkReceived) + .addPanel(networkTransmitted) + ), + }, +} diff --git a/node-mixin/dashboards/use.libsonnet b/node-mixin/dashboards/use.libsonnet new file mode 100644 index 00000000..526002f6 --- /dev/null +++ b/node-mixin/dashboards/use.libsonnet @@ -0,0 +1,151 @@ +local g = import 'grafana-builder/grafana.libsonnet'; + +{ + grafanaDashboards+:: { + 'node-cluster-rsrc-use.json': + local legendLink = '%s/dashboard/file/k8s-node-rsrc-use.json' % $._config.grafana_prefix; + + g.dashboard('USE Method / Cluster') + .addRow( + g.row('CPU') + .addPanel( + g.panel('CPU Utilisation') + + g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('CPU Saturation (Load1)') + + g.queryPanel(||| + instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s})) + ||| % $._config, '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + ) + .addRow( + g.row('Memory') + .addPanel( + g.panel('Memory Utilisation') + + g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('Memory Saturation (Swap I/O)') + + g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Disk') + .addPanel( + g.panel('Disk IO Utilisation') + + // Full utilisation would be all disks on each node spending an average of + // 1 sec per second doing I/O, normalize by node count for stacked charts + g.queryPanel(||| + instance:node_disk_utilisation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s})) + ||| % $._config, '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + g.panel('Disk IO Saturation') + + g.queryPanel(||| + instance:node_disk_saturation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s})) + ||| % $._config, '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ) + ) + .addRow( + g.row('Network') + .addPanel( + g.panel('Net Utilisation (Transmitted)') + + g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes('Bps') }, + ) + .addPanel( + g.panel('Net Saturation (Dropped)') + + g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Storage') + .addPanel( + g.panel('Disk Capacity') + + g.queryPanel('sum(max(node_filesystem_size{fstype=~"ext[24]"} - node_filesystem_free{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) + + g.stack + + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, + ), + ), + + 'k8s-node-rsrc-use.json': + g.dashboard('K8s / USE Method / Node') + .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance') + .addRow( + g.row('CPU') + .addPanel( + g.panel('CPU Utilisation') + + g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('CPU Saturation (Load1)') + + g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') + + { yaxes: g.yaxes('percentunit') }, + ) + ) + .addRow( + g.row('Memory') + .addPanel( + g.panel('Memory Utilisation') + + g.queryPanel('instance:node_memory_utilisation:{instance="$instance"}', 'Memory') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('Memory Saturation (Swap I/O)') + + g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Disk') + .addPanel( + g.panel('Disk IO Utilisation') + + g.queryPanel('instance:node_disk_utilisation:avg_irate{instance="$instance"}', 'Utilisation') + + { yaxes: g.yaxes('percentunit') }, + ) + .addPanel( + g.panel('Disk IO Saturation') + + g.queryPanel('instance:node_disk_saturation:avg_irate{instance="$instance"}', 'Saturation') + + { yaxes: g.yaxes('percentunit') }, + ) + ) + .addRow( + g.row('Net') + .addPanel( + g.panel('Net Utilisation (Transmitted)') + + g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + + { yaxes: g.yaxes('Bps') }, + ) + .addPanel( + g.panel('Net Saturation (Dropped)') + + g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') + + { yaxes: g.yaxes('Bps') }, + ) + ) + .addRow( + g.row('Disk') + .addPanel( + g.panel('Disk Utilisation') + + g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size{fstype=~"ext[24]"}))', 'Disk') + + { yaxes: g.yaxes('percentunit') }, + ), + ), + }, +} diff --git a/node-mixin/jsonnetfile.json b/node-mixin/jsonnetfile.json new file mode 100644 index 00000000..45326aad --- /dev/null +++ b/node-mixin/jsonnetfile.json @@ -0,0 +1,24 @@ +{ + "dependencies": [ + { + "name": "grafonnet", + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib", + "subdir": "grafonnet" + } + }, + "version": "master" + }, + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/kausalco/public", + "subdir": "grafana-builder" + } + }, + "version": "master" + } + ] +} diff --git a/node-mixin/lib/promgrafonnet/gauge.libsonnet b/node-mixin/lib/promgrafonnet/gauge.libsonnet new file mode 100644 index 00000000..ea6c1ab6 --- /dev/null +++ b/node-mixin/lib/promgrafonnet/gauge.libsonnet @@ -0,0 +1,60 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local singlestat = grafana.singlestat; +local prometheus = grafana.prometheus; + +{ + new(title, query):: + singlestat.new( + title, + datasource='prometheus', + span=3, + format='percent', + valueName='current', + colors=[ + 'rgba(245, 54, 54, 0.9)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(50, 172, 45, 0.97)', + ], + thresholds='50, 80', + valueMaps=[ + { + op: '=', + text: 'N/A', + value: 'null', + }, + ], + ) + .addTarget( + prometheus.target( + query + ) + ) + { + gauge: { + maxValue: 100, + minValue: 0, + show: true, + thresholdLabels: false, + thresholdMarkers: true, + }, + withTextNullValue(text):: self { + valueMaps: [ + { + op: '=', + text: text, + value: 'null', + }, + ], + }, + withSpanSize(size):: self { + span: size, + }, + withLowerBeingBetter():: self { + colors: [ + 'rgba(50, 172, 45, 0.97)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(245, 54, 54, 0.9)', + ], + thresholds: '80, 90', + }, + }, +} diff --git a/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet b/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet new file mode 100644 index 00000000..bc1d6f6f --- /dev/null +++ b/node-mixin/lib/promgrafonnet/numbersinglestat.libsonnet @@ -0,0 +1,48 @@ +local grafana = import 'grafonnet/grafana.libsonnet'; +local singlestat = grafana.singlestat; +local prometheus = grafana.prometheus; + +{ + new(title, query):: + singlestat.new( + title, + datasource='prometheus', + span=3, + valueName='current', + valueMaps=[ + { + op: '=', + text: '0', + value: 'null', + }, + ], + ) + .addTarget( + prometheus.target( + query + ) + ) + { + withTextNullValue(text):: self { + valueMaps: [ + { + op: '=', + text: text, + value: 'null', + }, + ], + }, + withSpanSize(size):: self { + span: size, + }, + withPostfix(postfix):: self { + postfix: postfix, + }, + withSparkline():: self { + sparkline: { + show: true, + lineColor: 'rgb(31, 120, 193)', + fillColor: 'rgba(31, 118, 189, 0.18)', + }, + }, + }, +} diff --git a/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet b/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet new file mode 100644 index 00000000..013ff42b --- /dev/null +++ b/node-mixin/lib/promgrafonnet/promgrafonnet.libsonnet @@ -0,0 +1,5 @@ +{ + numbersinglestat:: import 'numbersinglestat.libsonnet', + gauge:: import 'gauge.libsonnet', + percentlinegraph:: import 'percentlinegraph.libsonnet', +} diff --git a/node-mixin/mixin.libsonnet b/node-mixin/mixin.libsonnet new file mode 100644 index 00000000..b9831f93 --- /dev/null +++ b/node-mixin/mixin.libsonnet @@ -0,0 +1,4 @@ +(import 'config.libsonnet') + +(import 'alerts/alerts.libsonnet') + +(import 'dashboards/dashboards.libsonnet') + +(import 'rules/rules.libsonnet') diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet new file mode 100644 index 00000000..ad1cc09b --- /dev/null +++ b/node-mixin/rules/rules.libsonnet @@ -0,0 +1,121 @@ +{ + prometheusRules+:: { + groups+: [ + { + name: 'node.rules', + rules: [ + { + // This rule gives the number of CPUs per node. + record: 'instance:node_num_cpu:sum', + expr: ||| + count by (instance) ( + sum by (instance, cpu) ( + node_cpu{%(nodeExporterSelector)s} + ) + ) + ||| % $._config, + }, + { + // CPU utilisation is % CPU is not idle. + record: 'instance:node_cpu_utilisation:avg1m', + expr: ||| + 1 - avg by (instance) ( + rate(node_cpu{%(nodeExporterSelector)s,mode="idle"}[1m]) + ) + ||| % $._config, + }, + { + // CPU saturation is 1min avg run queue length / number of CPUs. + // Can go over 100%. >100% is bad. + record: 'instance:node_cpu_saturation_load1:', + expr: ||| + sum by (instance) ( + node_load1{%(nodeExporterSelector)s} + ) + / + instance:node_num_cpu:sum + ||| % $._config, + }, + { + // Available memory per node + record: 'instance:node_memory_bytes_available:sum', + expr: ||| + sum by (instance) ( + (node_memory_MemFree{%(nodeExporterSelector)s} + node_memory_Cached{%(nodeExporterSelector)s} + node_memory_Buffers{%(nodeExporterSelector)s}) + ) + ||| % $._config, + }, + { + // Total memory per node + record: 'instance:node_memory_bytes_total:sum', + expr: ||| + sum by (instance) ( + node_memory_MemTotal{%(nodeExporterSelector)s} + ) + ||| % $._config, + }, + { + // Memory utilisation per node, normalized by per-node memory + record: 'instance:node_memory_utilisation:ratio', + expr: ||| + (instance:node_memory_bytes_total:sum - instance:node_memory_bytes_available:sum) + / + scalar(sum(instance:node_memory_bytes_total:sum)) + |||, + }, + { + record: 'instance:node_memory_utilisation:', + expr: ||| + 1 - (instance:node_memory_bytes_available:sum / instance:node_memory_bytes_total:sum) + ||| % $._config, + }, + { + record: 'instance:node_memory_swap_io_bytes:sum_rate', + expr: ||| + 1e3 * sum by (instance) ( + (rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) + + rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m])) + ) + ||| % $._config, + }, + { + // Disk utilisation (ms spent, by rate() it's bound by 1 second) + record: 'instance:node_disk_utilisation:avg_irate', + expr: ||| + avg by (instance) ( + irate(node_disk_io_time_ms{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 + ) + ||| % $._config, + }, + { + // Disk saturation (ms spent, by rate() it's bound by 1 second) + record: 'instance:node_disk_saturation:avg_irate', + expr: ||| + avg by (instance) ( + irate(node_disk_io_time_weighted{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 + ) + ||| % $._config, + }, + { + record: 'instance:node_net_utilisation:sum_irate', + expr: ||| + sum by (instance) ( + (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]) + + irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device="eth0"}[1m])) + ) + ||| % $._config, + }, + { + record: 'instance:node_net_saturation:sum_irate', + expr: ||| + sum by (instance) ( + (irate(node_network_receive_drop{%(nodeExporterSelector)s,device="eth0"}[1m]) + + irate(node_network_transmit_drop{%(nodeExporterSelector)s,device="eth0"}[1m])) + ) + ||| % $._config, + }, + ], + }, + ], + }, +} From 9303cf78ff1713ac7e114c2f0fc9da9b99577ffa Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Thu, 10 May 2018 10:35:35 +0200 Subject: [PATCH 02/13] Lower case binary operators and fix indentation. Signed-off-by: Tom Wilkie --- node-mixin/alerts/alerts.libsonnet | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet index 198e22fd..c66d76db 100644 --- a/node-mixin/alerts/alerts.libsonnet +++ b/node-mixin/alerts/alerts.libsonnet @@ -7,11 +7,11 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 - AND - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + and + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -24,11 +24,11 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 - AND - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + and + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -41,9 +41,9 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -56,9 +56,9 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -71,11 +71,11 @@ { alert: 'NodeFilesystemFilesFillingUp', expr: ||| - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 - AND - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + and + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -88,11 +88,11 @@ { alert: 'NodeFilesystemFilesFillingUp', expr: ||| - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 - AND - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + and + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -105,9 +105,9 @@ { alert: 'NodeFilesystemOutOfFiles', expr: ||| - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -120,9 +120,9 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 - AND - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + and + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { From 417316b0e498ac661f6502d3df0896f2137fa255 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Thu, 10 May 2018 11:05:59 +0200 Subject: [PATCH 03/13] Switch to irate[1m] for node dashboard. Signed-off-by: Tom Wilkie --- node-mixin/dashboards/node.libsonnet | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/node-mixin/dashboards/node.libsonnet b/node-mixin/dashboards/node.libsonnet index 471c5b37..943864f9 100644 --- a/node-mixin/dashboards/node.libsonnet +++ b/node-mixin/dashboards/node.libsonnet @@ -15,13 +15,13 @@ local gauge = promgrafonnet.gauge; 'Idle CPU', datasource='$datasource', span=6, - format='percent', + format='percentunit', max=100, min=0, ) .addTarget(prometheus.target( ||| - 100 - (avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[5m])) * 100) + 1 - avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) ||| % $._config, legendFormat='{{cpu}}', intervalFactor=10, @@ -32,11 +32,11 @@ local gauge = promgrafonnet.gauge; 'System load', datasource='$datasource', span=6, - format='percent', + format='percentunit', ) - .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 1m')) - .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 5m')) - .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"} * 100' % $._config, legendFormat='load 15m')); + .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 1m')) + .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 5m')) + .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 15m')); local memoryGraph = graphPanel.new( @@ -77,9 +77,9 @@ local gauge = promgrafonnet.gauge; datasource='$datasource', span=9, ) - .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_read{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='read')) - .addTarget(prometheus.target('sum by (instance) (rate(node_disk_bytes_written{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='written')) - .addTarget(prometheus.target('sum by (instance) (rate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[2m]))' % $._config, legendFormat='io time')) + + .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_read_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read')) + .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_written_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written')) + .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) + { seriesOverrides: [ { @@ -116,7 +116,7 @@ local gauge = promgrafonnet.gauge; span=6, format='bytes', ) - .addTarget(prometheus.target('rate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}')); + .addTarget(prometheus.target('irate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); local networkTransmitted = graphPanel.new( @@ -125,7 +125,7 @@ local gauge = promgrafonnet.gauge; span=6, format='bytes', ) - .addTarget(prometheus.target('rate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[5m])' % $._config, legendFormat='{{device}}')); + .addTarget(prometheus.target('irate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}')); dashboard.new('Nodes', time_from='now-1h') .addTemplate( From c34275d6e587fc7d3a76d208f1ffc058adc82098 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Thu, 10 May 2018 11:21:00 +0200 Subject: [PATCH 04/13] Switch gauges to percentunit. Signed-off-by: Tom Wilkie --- node-mixin/dashboards/node.libsonnet | 18 ++++++------------ node-mixin/lib/promgrafonnet/gauge.libsonnet | 2 +- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/node-mixin/dashboards/node.libsonnet b/node-mixin/dashboards/node.libsonnet index 943864f9..ba092cf7 100644 --- a/node-mixin/dashboards/node.libsonnet +++ b/node-mixin/dashboards/node.libsonnet @@ -60,14 +60,9 @@ local gauge = promgrafonnet.gauge; local memoryGauge = gauge.new( 'Memory Usage', ||| - ( + node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} + / node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"} - ) * 100 - / - node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} ||| % $._config, ).withLowerBeingBetter(); @@ -100,12 +95,11 @@ local gauge = promgrafonnet.gauge; local diskSpaceUsage = gauge.new( 'Disk Space Usage', ||| - ( - sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) - - sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) - ) * 100 + 1 - ( + sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} / - sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}) + sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"} + ) ||| % $._config, ).withLowerBeingBetter(); diff --git a/node-mixin/lib/promgrafonnet/gauge.libsonnet b/node-mixin/lib/promgrafonnet/gauge.libsonnet index ea6c1ab6..f69a5cdc 100644 --- a/node-mixin/lib/promgrafonnet/gauge.libsonnet +++ b/node-mixin/lib/promgrafonnet/gauge.libsonnet @@ -8,7 +8,7 @@ local prometheus = grafana.prometheus; title, datasource='prometheus', span=3, - format='percent', + format='percentunit', valueName='current', colors=[ 'rgba(245, 54, 54, 0.9)', From 642f67ffa1f3d2738ca89430d722e66a2398e673 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Thu, 10 May 2018 11:35:48 +0200 Subject: [PATCH 05/13] Fix up some of the USE metrics. Signed-off-by: Tom Wilkie --- node-mixin/dashboards/use.libsonnet | 10 +++--- node-mixin/rules/rules.libsonnet | 47 ++++++++++------------------- 2 files changed, 21 insertions(+), 36 deletions(-) diff --git a/node-mixin/dashboards/use.libsonnet b/node-mixin/dashboards/use.libsonnet index 526002f6..9231a746 100644 --- a/node-mixin/dashboards/use.libsonnet +++ b/node-mixin/dashboards/use.libsonnet @@ -45,7 +45,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; // Full utilisation would be all disks on each node spending an average of // 1 sec per second doing I/O, normalize by node count for stacked charts g.queryPanel(||| - instance:node_disk_utilisation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s})) + instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s})) ||| % $._config, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, @@ -53,7 +53,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addPanel( g.panel('Disk IO Saturation') + g.queryPanel(||| - instance:node_disk_saturation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s})) + instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s})) ||| % $._config, '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, @@ -104,7 +104,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Memory') .addPanel( g.panel('Memory Utilisation') + - g.queryPanel('instance:node_memory_utilisation:{instance="$instance"}', 'Memory') + + g.queryPanel('instance:node_memory_utilisation:ratio{instance="$instance"}', 'Memory') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( @@ -117,12 +117,12 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Disk') .addPanel( g.panel('Disk IO Utilisation') + - g.queryPanel('instance:node_disk_utilisation:avg_irate{instance="$instance"}', 'Utilisation') + + g.queryPanel('instance:node_disk_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( g.panel('Disk IO Saturation') + - g.queryPanel('instance:node_disk_saturation:avg_irate{instance="$instance"}', 'Saturation') + + g.queryPanel('instance:node_disk_saturation:sum_irate{instance="$instance"}', 'Saturation') + { yaxes: g.yaxes('percentunit') }, ) ) diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet index ad1cc09b..7c70540e 100644 --- a/node-mixin/rules/rules.libsonnet +++ b/node-mixin/rules/rules.libsonnet @@ -29,20 +29,9 @@ // Can go over 100%. >100% is bad. record: 'instance:node_cpu_saturation_load1:', expr: ||| - sum by (instance) ( - node_load1{%(nodeExporterSelector)s} - ) + sum by (instance) (node_load1{%(nodeExporterSelector)s}) / - instance:node_num_cpu:sum - ||| % $._config, - }, - { - // Available memory per node - record: 'instance:node_memory_bytes_available:sum', - expr: ||| - sum by (instance) ( - (node_memory_MemFree{%(nodeExporterSelector)s} + node_memory_Cached{%(nodeExporterSelector)s} + node_memory_Buffers{%(nodeExporterSelector)s}) - ) + instance:node_num_cpu:sum ||| % $._config, }, { @@ -58,17 +47,13 @@ // Memory utilisation per node, normalized by per-node memory record: 'instance:node_memory_utilisation:ratio', expr: ||| - (instance:node_memory_bytes_total:sum - instance:node_memory_bytes_available:sum) - / - scalar(sum(instance:node_memory_bytes_total:sum)) + 1 - ( + node_memory_MemAvailable{%(nodeExporterSelector)s} + / + node_memory_MemTotal{%(nodeExporterSelector)s} + ) |||, }, - { - record: 'instance:node_memory_utilisation:', - expr: ||| - 1 - (instance:node_memory_bytes_available:sum / instance:node_memory_bytes_total:sum) - ||| % $._config, - }, { record: 'instance:node_memory_swap_io_bytes:sum_rate', expr: ||| @@ -79,19 +64,19 @@ ||| % $._config, }, { - // Disk utilisation (ms spent, by rate() it's bound by 1 second) - record: 'instance:node_disk_utilisation:avg_irate', + // Disk utilisation (ms spent, 1 second irate()) + record: 'instance:node_disk_utilisation:sum_irate', expr: ||| - avg by (instance) ( + sum by (instance) ( irate(node_disk_io_time_ms{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 ) ||| % $._config, }, { // Disk saturation (ms spent, by rate() it's bound by 1 second) - record: 'instance:node_disk_saturation:avg_irate', + record: 'instance:node_disk_saturation:sum_irate', expr: ||| - avg by (instance) ( + sum by (instance) ( irate(node_disk_io_time_weighted{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 ) ||| % $._config, @@ -100,8 +85,8 @@ record: 'instance:node_net_utilisation:sum_irate', expr: ||| sum by (instance) ( - (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]) + - irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device="eth0"}[1m])) + (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + + irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) ) ||| % $._config, }, @@ -109,8 +94,8 @@ record: 'instance:node_net_saturation:sum_irate', expr: ||| sum by (instance) ( - (irate(node_network_receive_drop{%(nodeExporterSelector)s,device="eth0"}[1m]) + - irate(node_network_transmit_drop{%(nodeExporterSelector)s,device="eth0"}[1m])) + (irate(node_network_receive_drop{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + + irate(node_network_transmit_drop{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) ) ||| % $._config, }, From bd648827fe430b2c61c19cff792cdc1e5abbaba5 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Fri, 11 May 2018 14:40:20 +0100 Subject: [PATCH 06/13] Remove k8s from dashboard title, make gauges use datasource variable. Signed-off-by: Tom Wilkie --- node-mixin/dashboards/use.libsonnet | 4 ++-- node-mixin/lib/promgrafonnet/gauge.libsonnet | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/node-mixin/dashboards/use.libsonnet b/node-mixin/dashboards/use.libsonnet index 9231a746..eeb72093 100644 --- a/node-mixin/dashboards/use.libsonnet +++ b/node-mixin/dashboards/use.libsonnet @@ -84,8 +84,8 @@ local g = import 'grafana-builder/grafana.libsonnet'; ), ), - 'k8s-node-rsrc-use.json': - g.dashboard('K8s / USE Method / Node') + 'node-rsrc-use.json': + g.dashboard('USE Method / Node') .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance') .addRow( g.row('CPU') diff --git a/node-mixin/lib/promgrafonnet/gauge.libsonnet b/node-mixin/lib/promgrafonnet/gauge.libsonnet index f69a5cdc..43640b6d 100644 --- a/node-mixin/lib/promgrafonnet/gauge.libsonnet +++ b/node-mixin/lib/promgrafonnet/gauge.libsonnet @@ -6,7 +6,7 @@ local prometheus = grafana.prometheus; new(title, query):: singlestat.new( title, - datasource='prometheus', + datasource='$datasource', span=3, format='percentunit', valueName='current', From ff0a13d90056a88ef75cb135d32eeff45911ca7e Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 13 Jul 2018 15:01:01 +0200 Subject: [PATCH 07/13] Fix multiline strings Signed-off-by: Matthias Loibl --- node-mixin/alerts/alerts.libsonnet | 40 ++++++++++++++-------------- node-mixin/dashboards/node.libsonnet | 4 +-- node-mixin/rules/rules.libsonnet | 6 ++--- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet index c66d76db..17bbda8b 100644 --- a/node-mixin/alerts/alerts.libsonnet +++ b/node-mixin/alerts/alerts.libsonnet @@ -7,11 +7,11 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 and - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -24,11 +24,11 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 and - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -41,9 +41,9 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -56,9 +56,9 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -71,11 +71,11 @@ { alert: 'NodeFilesystemFilesFillingUp', expr: ||| - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 and - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -88,11 +88,11 @@ { alert: 'NodeFilesystemFilesFillingUp', expr: ||| - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 and - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -105,9 +105,9 @@ { alert: 'NodeFilesystemOutOfFiles', expr: ||| - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { @@ -120,9 +120,9 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 and - node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 + node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, 'for': '1h', labels: { diff --git a/node-mixin/dashboards/node.libsonnet b/node-mixin/dashboards/node.libsonnet index ba092cf7..4594e3ed 100644 --- a/node-mixin/dashboards/node.libsonnet +++ b/node-mixin/dashboards/node.libsonnet @@ -60,9 +60,9 @@ local gauge = promgrafonnet.gauge; local memoryGauge = gauge.new( 'Memory Usage', ||| - node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} + node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"} / - node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} + node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"} ||| % $._config, ).withLowerBeingBetter(); diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet index 7c70540e..72b18b9c 100644 --- a/node-mixin/rules/rules.libsonnet +++ b/node-mixin/rules/rules.libsonnet @@ -29,9 +29,9 @@ // Can go over 100%. >100% is bad. record: 'instance:node_cpu_saturation_load1:', expr: ||| - sum by (instance) (node_load1{%(nodeExporterSelector)s}) + sum by (instance) (node_load1{%(nodeExporterSelector)s}) / - instance:node_num_cpu:sum + instance:node_num_cpu:sum ||| % $._config, }, { @@ -52,7 +52,7 @@ / node_memory_MemTotal{%(nodeExporterSelector)s} ) - |||, + ||| % $._config, }, { record: 'instance:node_memory_swap_io_bytes:sum_rate', From 1482cc03095e1e85fc9c372edc3ca98949d7e5cf Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 6 Aug 2018 10:41:18 +0200 Subject: [PATCH 08/13] Rename group names to node-exporter to avoid naming collisions Signed-off-by: Matthias Loibl --- node-mixin/alerts/alerts.libsonnet | 2 +- node-mixin/rules/rules.libsonnet | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet index 17bbda8b..a0ca230f 100644 --- a/node-mixin/alerts/alerts.libsonnet +++ b/node-mixin/alerts/alerts.libsonnet @@ -2,7 +2,7 @@ prometheusAlerts+:: { groups+: [ { - name: 'node', + name: 'node-exporter', rules: [ { alert: 'NodeFilesystemSpaceFillingUp', diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet index 72b18b9c..a9517119 100644 --- a/node-mixin/rules/rules.libsonnet +++ b/node-mixin/rules/rules.libsonnet @@ -2,7 +2,7 @@ prometheusRules+:: { groups+: [ { - name: 'node.rules', + name: 'node-exporter', rules: [ { // This rule gives the number of CPUs per node. From 961aa6770196407ac8282bea0a569365341e5775 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 6 Aug 2018 10:46:28 +0200 Subject: [PATCH 09/13] Append .rules to node_exporter.rules group name Signed-off-by: Matthias Loibl --- node-mixin/rules/rules.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet index a9517119..c3f74ba7 100644 --- a/node-mixin/rules/rules.libsonnet +++ b/node-mixin/rules/rules.libsonnet @@ -2,7 +2,7 @@ prometheusRules+:: { groups+: [ { - name: 'node-exporter', + name: 'node-exporter.rules', rules: [ { // This rule gives the number of CPUs per node. From 619e23e5df8b4d9765c51740d893f0ac790aba2c Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 19 Nov 2018 16:00:48 +0100 Subject: [PATCH 10/13] node-mixin: Update rules to node_exporter v0.16 Signed-off-by: Matthias Loibl --- node-mixin/rules/rules.libsonnet | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet index c3f74ba7..f836d0d0 100644 --- a/node-mixin/rules/rules.libsonnet +++ b/node-mixin/rules/rules.libsonnet @@ -10,7 +10,7 @@ expr: ||| count by (instance) ( sum by (instance, cpu) ( - node_cpu{%(nodeExporterSelector)s} + node_cpu_seconds_total{%(nodeExporterSelector)s} ) ) ||| % $._config, @@ -20,7 +20,7 @@ record: 'instance:node_cpu_utilisation:avg1m', expr: ||| 1 - avg by (instance) ( - rate(node_cpu{%(nodeExporterSelector)s,mode="idle"}[1m]) + rate(node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"}[1m]) ) ||| % $._config, }, @@ -39,7 +39,7 @@ record: 'instance:node_memory_bytes_total:sum', expr: ||| sum by (instance) ( - node_memory_MemTotal{%(nodeExporterSelector)s} + node_memory_MemTotal_bytes{%(nodeExporterSelector)s} ) ||| % $._config, }, @@ -48,9 +48,9 @@ record: 'instance:node_memory_utilisation:ratio', expr: ||| 1 - ( - node_memory_MemAvailable{%(nodeExporterSelector)s} + node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / - node_memory_MemTotal{%(nodeExporterSelector)s} + node_memory_MemTotal_bytes{%(nodeExporterSelector)s} ) ||| % $._config, }, @@ -68,7 +68,7 @@ record: 'instance:node_disk_utilisation:sum_irate', expr: ||| sum by (instance) ( - irate(node_disk_io_time_ms{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 + irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) ) ||| % $._config, }, @@ -77,7 +77,7 @@ record: 'instance:node_disk_saturation:sum_irate', expr: ||| sum by (instance) ( - irate(node_disk_io_time_weighted{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3 + irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) ) ||| % $._config, }, @@ -85,8 +85,8 @@ record: 'instance:node_net_utilisation:sum_irate', expr: ||| sum by (instance) ( - (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + - irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) + (irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + + irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) ) ||| % $._config, }, @@ -94,8 +94,8 @@ record: 'instance:node_net_saturation:sum_irate', expr: ||| sum by (instance) ( - (irate(node_network_receive_drop{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + - irate(node_network_transmit_drop{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) + (irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) + + irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])) ) ||| % $._config, }, From 53e4093b64ec5348c99897bc2b26002f1d3332c7 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 19 Nov 2018 16:11:37 +0100 Subject: [PATCH 11/13] node-mixin: Update alerts to node_exporter v0.16 Signed-off-by: Matthias Loibl --- node-mixin/alerts/alerts.libsonnet | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/node-mixin/alerts/alerts.libsonnet b/node-mixin/alerts/alerts.libsonnet index a0ca230f..8ea70cc7 100644 --- a/node-mixin/alerts/alerts.libsonnet +++ b/node-mixin/alerts/alerts.libsonnet @@ -7,9 +7,9 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 24*60*60) < 0 and - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.4 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, @@ -24,9 +24,9 @@ { alert: 'NodeFilesystemSpaceFillingUp', expr: ||| - predict_linear(node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s}[6h], 4*60*60) < 0 and - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} < 0.2 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, @@ -41,7 +41,7 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 5 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, @@ -56,7 +56,7 @@ { alert: 'NodeFilesystemOutOfSpace', expr: ||| - node_filesystem_avail{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 + node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelectors)s} * 100 < 3 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelectors)s} == 0 ||| % $._config, @@ -96,7 +96,7 @@ ||| % $._config, 'for': '1h', labels: { - severity: 'warning', + severity: 'critical', }, annotations: { message: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted to run out of files within the next 4 hours.', @@ -135,7 +135,7 @@ { alert: 'NodeNetworkReceiveErrs', expr: ||| - increase(node_network_receive_errs[2m]) > 10 + increase(node_network_receive_errs_total[2m]) > 10 ||| % $._config, 'for': '1h', labels: { @@ -148,7 +148,7 @@ { alert: 'NodeNetworkTransmitErrs', expr: ||| - increase(node_network_transmit_errs[2m]) > 10 + increase(node_network_transmit_errs_total[2m]) > 10 ||| % $._config, 'for': '1h', labels: { From 61bc03adbed4737fa4c4b9a80d78f455f3998f74 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 19 Nov 2018 16:56:05 +0100 Subject: [PATCH 12/13] node-mixin: Ignore jsonnetfile.lock.json and vendor folder Signed-off-by: Matthias Loibl --- node-mixin/.gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 node-mixin/.gitignore diff --git a/node-mixin/.gitignore b/node-mixin/.gitignore new file mode 100644 index 00000000..65d141bd --- /dev/null +++ b/node-mixin/.gitignore @@ -0,0 +1,3 @@ +/jsonnetfile.lock.json +/vendor/ + From 0bcded8d2bc27a108c00aa06f812c3fbc7929faa Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Mon, 19 Nov 2018 17:40:30 +0100 Subject: [PATCH 13/13] node-mixin: Update dashboards to v0.16 Signed-off-by: Matthias Loibl --- node-mixin/dashboards/use.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node-mixin/dashboards/use.libsonnet b/node-mixin/dashboards/use.libsonnet index eeb72093..3e368c86 100644 --- a/node-mixin/dashboards/use.libsonnet +++ b/node-mixin/dashboards/use.libsonnet @@ -78,7 +78,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Storage') .addPanel( g.panel('Disk Capacity') + - g.queryPanel('sum(max(node_filesystem_size{fstype=~"ext[24]"} - node_filesystem_free{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) + + g.queryPanel('sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_free_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) + g.stack + { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, ), @@ -143,7 +143,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Disk') .addPanel( g.panel('Disk Utilisation') + - g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size{fstype=~"ext[24]"}))', 'Disk') + + g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free_bytes{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size_bytes{fstype=~"ext[24]"}))', 'Disk') + { yaxes: g.yaxes('percentunit') }, ), ),