From 706511a49598db1c256a85b2b7dec4e6d754cabd Mon Sep 17 00:00:00 2001
From: beorn7
Date: Wed, 17 Jul 2019 23:54:31 +0200
Subject: [PATCH] Responses to review comments, round 3

Signed-off-by: beorn7
---
 docs/node-mixin/config.libsonnet          | 15 +++--
 docs/node-mixin/dashboards/node.libsonnet | 19 +++--
 docs/node-mixin/dashboards/use.libsonnet  | 76 ++++++++++++++---------
 docs/node-mixin/rules/rules.libsonnet     | 15 +++--
 4 files changed, 82 insertions(+), 43 deletions(-)

diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet
index 701d9bea..95070ca9 100644
--- a/docs/node-mixin/config.libsonnet
+++ b/docs/node-mixin/config.libsonnet
@@ -5,12 +5,17 @@
     // Select the metrics coming from the node exporter.
     nodeExporterSelector: 'job="node"',
 
-    // Select the fstype for filesystem-related queries.
-    // TODO: What is a good default selector here?
-    fsSelector: 'fstype=~"ext.|xfs|jfs|btrfs|vfat|ntfs"',
+    // Select the fstype for filesystem-related queries. The default
+    // 'fstype!=""' selects all filesystems. If you have unusual
+    // filesystems you don't want to include in dashboards and
+    // alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
+    fsSelector: 'fstype!=""',
 
-    // Select the device for disk-related queries.
-    diskDeviceSelector: 'device=~"(sd|xvd).+"',
+    // Select the device for disk-related queries. The default
+    // 'device!=""' selects all devices. If you have unusual devices
+    // you don't want to include in dashboards and alerting, you can
+    // exclude them here, e.g. 'device!~"loop.+"'.
+    diskDeviceSelector: 'device!=""',
 
     grafana_prefix: '',
   },
diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet
index 915cbe48..c3c97f37 100644
--- a/docs/node-mixin/dashboards/node.libsonnet
+++ b/docs/node-mixin/dashboards/node.libsonnet
@@ -22,7 +22,7 @@ local gauge = promgrafonnet.gauge;
       .addTarget(prometheus.target(
         // TODO: Consider using `${__interval}` as range and a 1m min step.
         |||
-          1 - avg by (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
+          1 - rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])
         ||| % $._config,
         legendFormat='{{cpu}}',
         intervalFactor=10,
@@ -64,15 +64,19 @@ local gauge = promgrafonnet.gauge;
       .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
       .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
 
+    // TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%.
+    // This needs to be added upstream in the promgrafonnet library and then changed here.
     local memoryGauge = gauge.new(
       'Memory Usage',
       |||
+        100 -
         (
           node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}
         /
          node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
-        ) * 100
+        * 100
+        )
       ||| % $._config,
     ).withLowerBeingBetter();

@@ -82,10 +86,11 @@ local gauge = promgrafonnet.gauge;
         datasource='$datasource',
         span=9,
       )
+      // TODO: Does it make sense to have those three in the same panel?
       // TODO: Consider using `${__interval}` as range and a 1m min step.
-      .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} read'))
-      .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} written'))
-      .addTarget(prometheus.target('sum by (instance, device) (rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='{{device}} io time'))
+      .addTarget(prometheus.target('rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} read'))
+      .addTarget(prometheus.target('rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} written'))
+      .addTarget(prometheus.target('rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}[1m])' % $._config, legendFormat='{{device}} io time'))
       + {
         seriesOverrides: [
           {
@@ -103,6 +108,8 @@ local gauge = promgrafonnet.gauge;
         ],
       };
 
+    // TODO: It would be nicer to have a gauge that gets a 0-1 range and displays it as a percentage 0%-100%.
+    // This needs to be added upstream in the promgrafonnet library and then changed here.
     // TODO: Should this be partitioned by mountpoint?
     local diskSpaceUsage = gauge.new(
       'Disk Space Usage',
@@ -158,7 +165,7 @@ local gauge = promgrafonnet.gauge;
       template.new(
         'instance',
         '$datasource',
-        'label_values(node_boot_time_seconds{%(nodeExporterSelector)s}, instance)' % $._config,
+        'label_values(node_exporter_build_info{%(nodeExporterSelector)s}, instance)' % $._config,
         refresh='time',
       )
     )
diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet
index 533f392b..e3739ac2 100644
--- a/docs/node-mixin/dashboards/use.libsonnet
+++ b/docs/node-mixin/dashboards/use.libsonnet
@@ -12,13 +12,13 @@ local g = import 'grafana-builder/grafana.libsonnet';
       g.panel('CPU Utilisation') +
       g.queryPanel(|||
         (
-          instance:node_cpu_utilisation:avg_rate1m
+          instance:node_cpu_utilisation:avg_rate1m{%(nodeExporterSelector)s}
         *
-          instance:node_num_cpu:sum
+          instance:node_num_cpu:sum{%(nodeExporterSelector)s}
         / ignoring (instance) group_left
-          sum without (instance) (instance:node_num_cpu:sum)
+          sum without (instance) (instance:node_num_cpu:sum{%(nodeExporterSelector)s})
         )
-      |||, '{{instance}}', legendLink) +
+      ||| % $._config, '{{instance}}', legendLink) +
       g.stack +
       { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
     )
@@ -27,11 +27,11 @@ local g = import 'grafana-builder/grafana.libsonnet';
       g.panel('CPU Saturation (load1 per CPU)') +
       g.queryPanel(|||
         (
-          instance:node_load1_per_cpu:ratio
+          instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
         / ignoring (instance) group_left
-          count without (instance) (instance:node_load1_per_cpu:ratio)
+          count without (instance) (instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})
         )
-      |||, '{{instance}}', legendLink) +
+      ||| % $._config, '{{instance}}', legendLink) +
       g.stack +
       // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
       { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@@ -41,13 +41,13 @@ local g = import 'grafana-builder/grafana.libsonnet';
     g.row('Memory')
     .addPanel(
       g.panel('Memory Utilisation') +
-      g.queryPanel('instance:node_memory_utilisation:ratio', '{{instance}}', legendLink) +
+      g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) +
       g.stack +
       { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
     )
     .addPanel(
       g.panel('Memory Saturation (Swap I/O)') +
-      g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate', '{{instance}}', legendLink) +
+      g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) +
       g.stack +
       { yaxes: g.yaxes('Bps') },
     )
@@ -60,11 +60,11 @@ local g = import 'grafana-builder/grafana.libsonnet';
       // 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
       g.queryPanel(|||
         (
-          instance:node_disk_io_time:sum_rate1m
+          instance:node_disk_io_time_seconds:sum_rate1m{%(nodeExporterSelector)s}
         / ignoring (instance) group_left
-          count without (instance) (instance:node_disk_io_time:sum_rate1m)
+          count without (instance) (instance:node_disk_io_time_seconds:sum_rate1m{%(nodeExporterSelector)s})
         )
-      |||, '{{instance}}', legendLink) +
+      ||| % $._config, '{{instance}}', legendLink) +
       g.stack +
       { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
     )
@@ -72,11 +72,11 @@ local g = import 'grafana-builder/grafana.libsonnet';
       g.panel('Disk IO Saturation') +
       g.queryPanel(|||
         (
-          instance:node_disk_io_time_weighted:sum_rate1m
+          instance:node_disk_io_time_weighted_seconds:sum_rate1m{%(nodeExporterSelector)s}
         / ignoring (instance) group_left
-          count without (instance) (instance:node_disk_io_time_weighted:sum_rate1m)
+          count without (instance) (instance:node_disk_io_time_weighted_seconds:sum_rate1m{%(nodeExporterSelector)s})
         )
-      |||, '{{instance}}', legendLink) +
+      ||| % $._config, '{{instance}}', legendLink) +
       g.stack +
       { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
     )
@@ -84,16 +84,30 @@ local g = import 'grafana-builder/grafana.libsonnet';
   )
   .addRow(
     g.row('Network')
     .addPanel(
-      g.panel('Net Utilisation (Transmitted)') +
-      g.queryPanel('instance:node_net_utilisation:sum_irate', '{{instance}}', legendLink) +
+      g.panel('Net Utilisation (Bytes Receive/Transmit)') +
+      g.queryPanel(
+        [
+          'instance:node_network_receive_bytes:sum_rate1m{%(nodeExporterSelector)s}' % $._config,
+          '-instance:node_network_transmit_bytes:sum_rate1m{%(nodeExporterSelector)s}' % $._config,
+        ],
+        ['{{instance}} Receive', '{{instance}} Transmit'],
+        legendLink,
+      ) +
       g.stack +
       { yaxes: g.yaxes('Bps') },
     )
     .addPanel(
-      g.panel('Net Saturation (Dropped)') +
-      g.queryPanel('instance:node_net_saturation:sum_irate', '{{instance}}', legendLink) +
+      g.panel('Net Saturation (Drops Receive/Transmit)') +
+      g.queryPanel(
+        [
+          'instance:node_network_receive_drop:sum_rate1m{%(nodeExporterSelector)s}' % $._config,
+          '-instance:node_network_transmit_drop:sum_rate1m{%(nodeExporterSelector)s}' % $._config,
+        ],
+        ['{{instance}} Receive', '{{instance}} Transmit'],
+        legendLink,
+      ) +
       g.stack +
-      { yaxes: g.yaxes('Bps') },
+      { yaxes: g.yaxes('rps') },
     )
   )
@@ -127,12 +141,12 @@ local g = import 'grafana-builder/grafana.libsonnet';
     g.row('CPU')
     .addPanel(
       g.panel('CPU Utilisation') +
-      g.queryPanel('instance:node_cpu_utilisation:avg_rate1m{instance="$instance"}', 'Utilisation') +
instance="$instance"}' % $._config, 'Utilisation') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( g.panel('CPU Saturation (Load1)') + - g.queryPanel('instance:node_cpu_saturation_load1:{instance="$instance"}', 'Saturation') + + g.queryPanel('instance:node_cpu_saturation_load1:{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + { yaxes: g.yaxes('percentunit') }, ) ) @@ -140,12 +154,12 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Memory') .addPanel( g.panel('Memory Utilisation') + - g.queryPanel('instance:node_memory_utilisation:ratio{instance="$instance"}', 'Memory') + + g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Memory') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( g.panel('Memory Saturation (pages swapped per second)') + - g.queryPanel('instance:node_memory_swap_io_pages:rate1m{instance="$instance"}', 'Swap IO') + + g.queryPanel('instance:node_memory_swap_io_pages:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Swap IO') + { yaxes: g.yaxes('short') }, ) ) @@ -153,12 +167,12 @@ local g = import 'grafana-builder/grafana.libsonnet'; g.row('Disk') .addPanel( g.panel('Disk IO Utilisation') + - g.queryPanel('instance:node_disk_io_time:sum_rate1m{instance="$instance"}', 'Utilisation') + + g.queryPanel('instance:node_disk_io_time_seconds:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') + { yaxes: g.yaxes('percentunit') }, ) .addPanel( g.panel('Disk IO Saturation') + - g.queryPanel('instance:node_disk_io_time_weighted:sum_rate1m{instance="$instance"}', 'Saturation') + + g.queryPanel('instance:node_disk_io_time_weighted_seconds:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + { yaxes: g.yaxes('percentunit') }, ) ) @@ -167,7 +181,10 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addPanel( g.panel('Net Utilisation (Bytes Receive/Transmit)') + g.queryPanel( - ['node_network_receive_bytes_total{instance="$instance"}', '-node_network_transmit_bytes_total{instance="$instance"}'], + [ + 'instance:node_network_receive_bytes:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + '-instance:node_network_transmit_bytes:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + ], ['Receive', 'Transmit'], ) + { yaxes: g.yaxes('Bps') }, @@ -175,7 +192,10 @@ local g = import 'grafana-builder/grafana.libsonnet'; .addPanel( g.panel('Net Saturation (Drops Receive/Transmit)') + g.queryPanel( - ['node_network_receive_drop_total{instance="$instance"}', '-node_network_transmit_drop_total{instance="$instance"}'], + [ + 'instance:node_network_receive_drop:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + '-instance:node_network_transmit_drop:sum_rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, + ], ['Receive drops', 'Transmit drops'], ) + { yaxes: g.yaxes('rps') }, diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet index 5422f443..d8c0faed 100644 --- a/docs/node-mixin/rules/rules.libsonnet +++ b/docs/node-mixin/rules/rules.libsonnet @@ -9,7 +9,7 @@ record: 'instance:node_num_cpu:sum', expr: ||| count without (cpu) ( - sum without (mode) ( + count without (mode) ( node_cpu_seconds_total{%(nodeExporterSelector)s} ) ) @@ -26,7 +26,9 @@ }, { // This is CPU saturation: 1min avg run queue length / number of CPUs. - // Can go over 1. >1 is bad. 
+        // Can go over 1.
+        // TODO: There are situations where a run queue >1/core is just normal and fine.
+        //       We need to clarify how to read this metric and if its usage is helpful at all.
         record: 'instance:node_load1_per_cpu:ratio',
         expr: |||
           (
@@ -59,7 +61,9 @@
       },
       {
         // Disk utilisation (seconds spent, 1 second rate)
-        record: 'instance:node_disk_io_time:sum_rate1m',
+        // TODO: This should probably not aggregate over all devices but
+        // keep them separate.
+        record: 'instance:node_disk_io_time_seconds:sum_rate1m',
         expr: |||
           sum without (device) (
             rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])
@@ -68,7 +72,9 @@
       },
       {
         // Disk saturation (weighted seconds spent, 1 second rate)
-        record: 'instance:node_disk_io_time_weighted:sum_rate1m',
+        // TODO: This should probably not aggregate over all devices but
+        // keep them separate.
+        record: 'instance:node_disk_io_time_weighted_seconds:sum_rate1m',
        expr: |||
           sum without (device) (
             rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m])
@@ -93,6 +99,7 @@
           )
         ||| % $._config,
       },
+      // TODO: Find out if those drops ever happen on modern switched networks.
       {
         record: 'instance:node_network_receive_drop:sum_rate1m',
         expr: |||
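
Usage note: with `fsSelector` and `diskDeviceSelector` now defaulting to match-all selectors, any site-specific filtering belongs in a downstream `_config` override rather than in this repository. A minimal sketch, assuming the conventional monitoring-mixins entry point; the `mixin.libsonnet` import path and the concrete selector values are illustrative, not taken from this patch:

    // overrides.libsonnet -- hypothetical downstream file.
    local nodeMixin = (import 'mixin.libsonnet') + {
      _config+:: {
        // Match whatever job name your Prometheus scrapes the node exporter under.
        nodeExporterSelector: 'job="node-exporter"',
        // Hide tmpfs mounts from filesystem panels and alerts.
        fsSelector: 'fstype!="tmpfs"',
        // Hide loop devices from disk panels and alerts.
        diskDeviceSelector: 'device!~"loop.+"',
      },
    };

    // Standard mixin outputs, per the monitoring-mixins convention.
    {
      prometheusRules: nodeMixin.prometheusRules,
      grafanaDashboards: nodeMixin.grafanaDashboards,
    }

This renders with, e.g., `jsonnet -J vendor overrides.libsonnet`. Because the selectors are spliced into longer label-matcher lists such as `{%(nodeExporterSelector)s, %(diskDeviceSelector)s, instance="$instance"}`, overrides must be non-empty matchers; an empty string would leave a dangling comma and produce invalid PromQL.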