From a92d1d7889ddcbaad50e821cb155795bf3e9758a Mon Sep 17 00:00:00 2001 From: beorn7 Date: Tue, 16 Jul 2019 21:18:17 +0200 Subject: [PATCH] Address review comments, batch 2 Signed-off-by: beorn7 --- docs/node-mixin/alerts/alerts.libsonnet | 12 +++--- docs/node-mixin/config.libsonnet | 5 ++- docs/node-mixin/dashboards/node.libsonnet | 16 +++++--- docs/node-mixin/dashboards/use.libsonnet | 34 ++++++++++------- docs/node-mixin/rules/rules.libsonnet | 46 ++++++++++++++--------- 5 files changed, 68 insertions(+), 45 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 013a9ee3..76bbb031 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -43,7 +43,7 @@ }, }, { - alert: 'NodeFilesystemOutOfSpace', + alert: 'NodeFilesystemAlmostOutOfSpace', expr: ||| ( node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5 @@ -60,7 +60,7 @@ }, }, { - alert: 'NodeFilesystemOutOfSpace', + alert: 'NodeFilesystemAlmostOutOfSpace', expr: ||| ( node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3 @@ -115,7 +115,7 @@ }, }, { - alert: 'NodeFilesystemOutOfFiles', + alert: 'NodeFilesystemAlmostOutOfFiles', expr: ||| ( node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5 @@ -132,7 +132,7 @@ }, }, { - alert: 'NodeFilesystemOutOfSpace', + alert: 'NodeFilesystemAlmostOutOfFiles', expr: ||| ( node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3 @@ -155,7 +155,7 @@ ||| % $._config, 'for': '1h', labels: { - severity: 'critical', + severity: 'warning', }, annotations: { message: '{{ $labels.instance }} interface {{ $labels.device }} 
shows errors while receiving packets ({{ $value }} errors in two minutes).', @@ -168,7 +168,7 @@ ||| % $._config, 'for': '1h', labels: { - severity: 'critical', + severity: 'warning', }, annotations: { message: '{{ $labels.instance }} interface {{ $labels.device }} shows errors while transmitting packets ({{ $value }} errors in two minutes).', diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index de84b9ee..701d9bea 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet @@ -3,10 +3,11 @@ // Selectors are inserted between {} in Prometheus queries. // Select the metrics coming from the node exporter. - nodeExporterSelector: 'job="node-exporter"', + nodeExporterSelector: 'job="node"', // Select the fstype for filesystem-related queries. - fsSelector: 'fstype=~"ext.|xfs",mountpoint!="/var/lib/docker/aufs"', + // TODO: What is a good default selector here? + fsSelector: 'fstype=~"ext.|xfs|jfs|btrfs|vfat|ntfs"', // Select the device for disk-related queries. diskDeviceSelector: 'device=~"(sd|xvd).+"', diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet index 040d60a3..915cbe48 100644 --- a/docs/node-mixin/dashboards/node.libsonnet +++ b/docs/node-mixin/dashboards/node.libsonnet @@ -20,8 +20,9 @@ local gauge = promgrafonnet.gauge; min=0, ) .addTarget(prometheus.target( + // TODO: Consider using `${__interval}` as range and a 1m min step. 
||| - 1 - avg by (cpu) (irate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) + 1 - avg by (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])) ||| % $._config, legendFormat='{{cpu}}', intervalFactor=10, @@ -81,9 +82,10 @@ local gauge = promgrafonnet.gauge; datasource='$datasource', span=9, ) - .addTarget(prometheus.target('sum by (instance) (irate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read')) - .addTarget(prometheus.target('sum by (instance) (irate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written')) - .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) + + // TODO: Consider using `${__interval}` as range and a 1m min step. + .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} read')) + .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} written')) + .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} io time')) + { seriesOverrides: [ { @@ -122,7 +124,8 @@ local gauge = promgrafonnet.gauge; span=6, format='bytes', ) - .addTarget(prometheus.target('irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); + // TODO: Consider using `${__interval}` as range and a 
1m min step. + .addTarget(prometheus.target('rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); local networkTransmitted = graphPanel.new( @@ -131,7 +134,8 @@ local gauge = promgrafonnet.gauge; span=6, format='bytes', ) - .addTarget(prometheus.target('irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); + // TODO: Consider using `${__interval}` as range and a 1m min step. + .addTarget(prometheus.target('rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}')); dashboard.new('Nodes', time_from='now-1h') .addTemplate( diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index 115e893c..533f392b 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -12,7 +12,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('CPU Utilisation') + g.queryPanel(||| ( - instance:node_cpu_utilisation:avg1m + instance:node_cpu_utilisation:avg_rate1m * instance:node_num_cpu:sum / ignoring (instance) group_left @@ -60,9 +60,9 @@ local g = import 'grafana-builder/grafana.libsonnet'; // 1 second per second doing I/O, normalize by metric cardinality for stacked charts. 
g.queryPanel(||| ( - instance:node_disk_utilisation:sum_irate + instance:node_disk_io_time:sum_rate1m / ignoring (instance) group_left - count without (instance) (instance:node_disk_utilisation:sum_irate) + count without (instance) (instance:node_disk_io_time:sum_rate1m) ) |||, '{{instance}}', legendLink) + g.stack + @@ -72,9 +72,9 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.panel('Disk IO Saturation') + g.queryPanel(||| ( - instance:node_disk_saturation:sum_irate + instance:node_disk_io_time_weighted:sum_rate1m / ignoring (instance) group_left - count without (instance) (instance:node_disk_saturation:sum_irate) + count without (instance) (instance:node_disk_io_time_weighted:sum_rate1m) ) |||, '{{instance}}', legendLink) + g.stack + @@ -127,7 +127,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('CPU') .addPanel( g.panel('CPU Utilisation') + - g.queryPanel('instance:node_cpu_utilisation:avg1m{instance="$instance"}', 'Utilisation') + + g.queryPanel('instance:node_cpu_utilisation:avg_rate1m{instance="$instance"}', 'Utilisation') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( @@ -145,7 +145,7 @@ local g = import 'grafana-builder/grafana.libsonnet'; ) .addPanel( g.panel('Memory Saturation (pages swapped per second)') + - g.queryPanel('instance:node_memory_swap_io_pages:sum_rate{instance="$instance"}', 'Swap IO') + + g.queryPanel('instance:node_memory_swap_io_pages:rate1m{instance="$instance"}', 'Swap IO') + { yaxes: g.yaxes('short') }, ) ) @@ -153,26 +153,32 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Disk') .addPanel( g.panel('Disk IO Utilisation') + - g.queryPanel('instance:node_disk_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + + g.queryPanel('instance:node_disk_io_time:sum_rate1m{instance="$instance"}', 'Utilisation') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( g.panel('Disk IO Saturation') + - g.queryPanel('instance:node_disk_saturation:sum_irate{instance="$instance"}', 'Saturation') + + 
g.queryPanel('instance:node_disk_io_time_weighted:sum_rate1m{instance="$instance"}', 'Saturation') + { yaxes: g.yaxes('percentunit') }, ) ) .addRow( g.row('Net') .addPanel( - g.panel('Net Utilisation (Transmitted)') + - g.queryPanel('instance:node_net_utilisation:sum_irate{instance="$instance"}', 'Utilisation') + + g.panel('Net Utilisation (Bytes Receive/Transmit)') + + g.queryPanel( + ['instance:node_network_receive_bytes:sum_rate1m{instance="$instance"}', '-instance:node_network_transmit_bytes:sum_rate1m{instance="$instance"}'], + ['Receive', 'Transmit'], + ) + { yaxes: g.yaxes('Bps') }, ) .addPanel( - g.panel('Net Saturation (Dropped)') + - g.queryPanel('instance:node_net_saturation:sum_irate{instance="$instance"}', 'Saturation') + - { yaxes: g.yaxes('Bps') }, + g.panel('Net Saturation (Drops Receive/Transmit)') + + g.queryPanel( + ['instance:node_network_receive_drop:sum_rate1m{instance="$instance"}', '-instance:node_network_transmit_drop:sum_rate1m{instance="$instance"}'], + ['Receive drops', 'Transmit drops'], + ) + + { yaxes: g.yaxes('rps') }, ) ) .addRow( diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index c4bc31a8..5422f443 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet @@ -17,7 +17,7 @@ }, { // CPU utilisation is % CPU is not idle. 
- record: 'instance:node_cpu_utilisation:avg1m', + record: 'instance:node_cpu_utilisation:avg_rate1m', expr: ||| 1 - avg without (cpu, mode) ( rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m]) @@ -48,7 +48,7 @@ ||| % $._config, }, { - record: 'instance:node_memory_swap_io_pages:sum_rate', + record: 'instance:node_memory_swap_io_pages:rate1m', expr: ||| ( rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) @@ -58,42 +58,54 @@ ||| % $._config, }, { - // Disk utilisation (ms spent, 1 second irate()) - record: 'instance:node_disk_utilisation:sum_irate', + // Disk utilisation (seconds spent, 1 second rate) + record: 'instance:node_disk_io_time:sum_rate1m', expr: ||| sum without (device) ( - irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) + rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) ) ||| % $._config, }, { - // Disk saturation (ms spent, by rate() it's bound by 1 second) - record: 'instance:node_disk_saturation:sum_irate', + // Disk saturation (weighted seconds spent, 1 second rate) + record: 'instance:node_disk_io_time_weighted:sum_rate1m', expr: ||| sum without (device) ( - irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) + rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) ) ||| % $._config, }, - // TODO: For the following two rules, consider configurable filtering to exclude more network + // TODO: For the following rules, consider configurable filtering to exclude more network // device names than just "lo". 
{ - record: 'instance:node_net_utilisation:sum_irate', + record: 'instance:node_network_receive_bytes:sum_rate1m', expr: ||| sum without (device) ( - irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) - + - irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) + rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) ) ||| % $._config, }, { - record: 'instance:node_net_saturation:sum_irate', + record: 'instance:node_network_transmit_bytes:sum_rate1m', expr: ||| sum without (device) ( - irate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) - + - irate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) + rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) + ) + ||| % $._config, + }, + { + record: 'instance:node_network_receive_drop:sum_rate1m', + expr: ||| + sum without (device) ( + rate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) + ) + ||| % $._config, + }, + { + record: 'instance:node_network_transmit_drop:sum_rate1m', + expr: ||| + sum without (device) ( + rate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) ) ||| % $._config, },