diff --git a/node-mixin/dashboards/use.libsonnet b/node-mixin/dashboards/use.libsonnet
index 526002f6..9231a746 100644
--- a/node-mixin/dashboards/use.libsonnet
+++ b/node-mixin/dashboards/use.libsonnet
@@ -45,7 +45,7 @@ local g = import 'grafana-builder/grafana.libsonnet';
           // Full utilisation would be all disks on each node spending an average of
           // 1 sec per second doing I/O, normalize by node count for stacked charts
           g.queryPanel(|||
-            instance:node_disk_utilisation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s}))
+            instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))
           ||| % $._config, '{{instance}}', legendLink) +
           g.stack +
           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@@ -53,7 +53,7 @@ local g = import 'grafana-builder/grafana.libsonnet';
         .addPanel(
           g.panel('Disk IO Saturation') +
           g.queryPanel(|||
-            instance:node_disk_saturation:avg_irate / scalar(sum(up{%(nodeExporterSelector)s}))
+            instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))
           ||| % $._config, '{{instance}}', legendLink) +
           g.stack +
           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@@ -104,7 +104,7 @@ local g = import 'grafana-builder/grafana.libsonnet';
         g.row('Memory')
         .addPanel(
           g.panel('Memory Utilisation') +
-          g.queryPanel('instance:node_memory_utilisation:{instance="$instance"}', 'Memory') +
+          g.queryPanel('instance:node_memory_utilisation:ratio{instance="$instance"}', 'Memory') +
           { yaxes: g.yaxes('percentunit') },
         )
         .addPanel(
@@ -117,12 +117,12 @@ local g = import 'grafana-builder/grafana.libsonnet';
         g.row('Disk')
         .addPanel(
           g.panel('Disk IO Utilisation') +
-          g.queryPanel('instance:node_disk_utilisation:avg_irate{instance="$instance"}', 'Utilisation') +
+          g.queryPanel('instance:node_disk_utilisation:sum_irate{instance="$instance"}', 'Utilisation') +
           { yaxes: g.yaxes('percentunit') },
         )
         .addPanel(
           g.panel('Disk IO Saturation') +
-          g.queryPanel('instance:node_disk_saturation:avg_irate{instance="$instance"}', 'Saturation') +
+          g.queryPanel('instance:node_disk_saturation:sum_irate{instance="$instance"}', 'Saturation') +
           { yaxes: g.yaxes('percentunit') },
         )
       )
diff --git a/node-mixin/rules/rules.libsonnet b/node-mixin/rules/rules.libsonnet
index ad1cc09b..7c70540e 100644
--- a/node-mixin/rules/rules.libsonnet
+++ b/node-mixin/rules/rules.libsonnet
@@ -29,20 +29,9 @@
             // Can go over 100%. >100% is bad.
             record: 'instance:node_cpu_saturation_load1:',
             expr: |||
-              sum by (instance) (
-                node_load1{%(nodeExporterSelector)s}
-              )
+              sum by (instance) (node_load1{%(nodeExporterSelector)s})
               /
-              instance:node_num_cpu:sum
-            ||| % $._config,
-          },
-          {
-            // Available memory per node
-            record: 'instance:node_memory_bytes_available:sum',
-            expr: |||
-              sum by (instance) (
-                (node_memory_MemFree{%(nodeExporterSelector)s} + node_memory_Cached{%(nodeExporterSelector)s} + node_memory_Buffers{%(nodeExporterSelector)s})
-              )
+              instance:node_num_cpu:sum
             ||| % $._config,
           },
           {
@@ -58,17 +47,13 @@
             // Memory utilisation per node, normalized by per-node memory
             record: 'instance:node_memory_utilisation:ratio',
             expr: |||
-              (instance:node_memory_bytes_total:sum - instance:node_memory_bytes_available:sum)
-              /
-              scalar(sum(instance:node_memory_bytes_total:sum))
+              1 - (
+                node_memory_MemAvailable{%(nodeExporterSelector)s}
+                /
+                node_memory_MemTotal{%(nodeExporterSelector)s}
+              )
-            |||,
+            ||| % $._config,
           },
-          {
-            record: 'instance:node_memory_utilisation:',
-            expr: |||
-              1 - (instance:node_memory_bytes_available:sum / instance:node_memory_bytes_total:sum)
-            ||| % $._config,
-          },
           {
             record: 'instance:node_memory_swap_io_bytes:sum_rate',
             expr: |||
@@ -79,19 +64,19 @@
             ||| % $._config,
           },
           {
-            // Disk utilisation (ms spent, by rate() it's bound by 1 second)
-            record: 'instance:node_disk_utilisation:avg_irate',
+            // Disk utilisation (ms spent, 1 second irate())
+            record: 'instance:node_disk_utilisation:sum_irate',
             expr: |||
-              avg by (instance) (
+              sum by (instance) (
                 irate(node_disk_io_time_ms{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3
               )
             ||| % $._config,
           },
           {
             // Disk saturation (ms spent, by rate() it's bound by 1 second)
-            record: 'instance:node_disk_saturation:avg_irate',
+            record: 'instance:node_disk_saturation:sum_irate',
             expr: |||
-              avg by (instance) (
+              sum by (instance) (
                 irate(node_disk_io_time_weighted{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m]) / 1e3
               )
             ||| % $._config,
@@ -100,8 +85,8 @@
             record: 'instance:node_net_utilisation:sum_irate',
             expr: |||
               sum by (instance) (
-                (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]) +
-                irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device="eth0"}[1m]))
+                (irate(node_network_receive_bytes{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) +
+                irate(node_network_transmit_bytes{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]))
               )
             ||| % $._config,
           },
@@ -109,8 +94,8 @@
             record: 'instance:node_net_saturation:sum_irate',
             expr: |||
               sum by (instance) (
-                (irate(node_network_receive_drop{%(nodeExporterSelector)s,device="eth0"}[1m]) +
-                irate(node_network_transmit_drop{%(nodeExporterSelector)s,device="eth0"}[1m]))
+                (irate(node_network_receive_drop{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]) +
+                irate(node_network_transmit_drop{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m]))
               )
             ||| % $._config,
           },