mirror of
				https://github.com/prometheus/node_exporter.git
				synced 2025-08-20 18:33:52 -07:00 
			
		
		
		
	The `instance:node_memory_swap_io_pages:rate1m` rule was intended to measure the amount of memory pressure a system is under, but its name is a bit misleading (it specifically refers to swap), and the rate of `node_vmstat_pgmajfault` is a better metric for memory pressure (see #1524). This commit renames `instance:node_memory_swap_io_pages:rate1m` to `instance:node_vmstat_pgmajfault:rate1m`, and defines it as `rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[1m])`. The dashboards are updated accordingly. Signed-off-by: Benoît Knecht <benoit.knecht@fsfe.org>
		
			
				
	
	
		
			296 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			296 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| local g = import 'grafana-builder/grafana.libsonnet';
 | |
| 
 | |
| {
 | |
|   grafanaDashboards+:: {
 | |
|     'node-cluster-rsrc-use.json':
 | |
|       local legendLink = '%s/dashboard/file/node-rsrc-use.json' % $._config.grafana_prefix;
 | |
| 
 | |
|       g.dashboard('USE Method / Cluster')
 | |
|       .addRow(
 | |
|         g.row('CPU')
 | |
|         .addPanel(
 | |
|           g.panel('CPU Utilisation') +
 | |
|           g.queryPanel(|||
 | |
|             (
 | |
|               instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s}
 | |
|             *
 | |
|               instance:node_num_cpu:sum{%(nodeExporterSelector)s}
 | |
|             / ignoring (instance) group_left
 | |
|               sum without (instance) (instance:node_num_cpu:sum{%(nodeExporterSelector)s})
 | |
|             )
 | |
|           ||| % $._config, '{{instance}}', legendLink) +
 | |
|           g.stack +
 | |
|           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
 | |
|         )
 | |
|         .addPanel(
 | |
|           // TODO: Is this a useful panel? At least there should be some explanation how load
 | |
|           // average relates to the "CPU saturation" in the title.
 | |
|           g.panel('CPU Saturation (load1 per CPU)') +
 | |
|           g.queryPanel(|||
 | |
|             (
 | |
|               instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
 | |
|             / ignoring (instance) group_left
 | |
|               count without (instance) (instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})
 | |
|             )
 | |
|           ||| % $._config, '{{instance}}', legendLink) +
 | |
|           g.stack +
 | |
|           // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
 | |
|           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
 | |
|         )
 | |
|       )
 | |
|       .addRow(
 | |
|         g.row('Memory')
 | |
|         .addPanel(
 | |
|           g.panel('Memory Utilisation') +
 | |
|           g.queryPanel(|||
 | |
|             (
 | |
|               instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}
 | |
|             / ignoring (instance) group_left
 | |
|               count without (instance) (instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s})
 | |
|             )
 | |
|           ||| % $._config, '{{instance}}', legendLink) +
 | |
|           g.stack +
 | |
|           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
 | |
|         )
 | |
|         .addPanel(
 | |
|           g.panel('Memory Saturation (Major Page Faults)') +
 | |
|           g.queryPanel('instance:node_vmstat_pgmajfault:rate1m{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) +
 | |
|           g.stack +
 | |
|           { yaxes: g.yaxes('rps') },
 | |
|         )
 | |
|       )
 | |
|       .addRow(
 | |
|         g.row('Network')
 | |
|         .addPanel(
 | |
|           g.panel('Net Utilisation (Bytes Receive/Transmit)') +
 | |
|           g.queryPanel(
 | |
|             [
 | |
|               'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
 | |
|               'instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
 | |
|             ],
 | |
|             ['{{instance}} Receive', '{{instance}} Transmit'],
 | |
|             legendLink,
 | |
|           ) +
 | |
|           g.stack +
 | |
|           {
 | |
|             yaxes: g.yaxes({ format: 'Bps', min: null }),
 | |
|             seriesOverrides: [
 | |
|               {
 | |
|                 alias: '/ Receive/',
 | |
|                 stack: 'A',
 | |
|               },
 | |
|               {
 | |
|                 alias: '/ Transmit/',
 | |
|                 stack: 'B',
 | |
|                 transform: 'negative-Y',
 | |
|               },
 | |
|             ],
 | |
|           },
 | |
|         )
 | |
|         .addPanel(
 | |
|           g.panel('Net Saturation (Drops Receive/Transmit)') +
 | |
|           g.queryPanel(
 | |
|             [
 | |
|               'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
 | |
|               'instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s}' % $._config,
 | |
|             ],
 | |
|             ['{{instance}} Receive', '{{instance}} Transmit'],
 | |
|             legendLink,
 | |
|           ) +
 | |
|           g.stack +
 | |
|           {
 | |
|             yaxes: g.yaxes({ format: 'rps', min: null }),
 | |
|             seriesOverrides: [
 | |
|               {
 | |
|                 alias: '/ Receive/',
 | |
|                 stack: 'A',
 | |
|               },
 | |
|               {
 | |
|                 alias: '/ Transmit/',
 | |
|                 stack: 'B',
 | |
|                 transform: 'negative-Y',
 | |
|               },
 | |
|             ],
 | |
|           },
 | |
|         )
 | |
|       )
 | |
|       .addRow(
 | |
|         g.row('Disk IO')
 | |
|         .addPanel(
 | |
|           g.panel('Disk IO Utilisation') +
 | |
|           // Full utilisation would be all disks on each node spending an average of
 | |
|           // 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
 | |
|           // TODO: Does the partition by device make sense? Using the most utilized device per
 | |
|           // instance might make more sense.
 | |
|           g.queryPanel(|||
 | |
|             (
 | |
|               instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}
 | |
|             / ignoring (instance, device) group_left
 | |
|               count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s})
 | |
|             )
 | |
|           ||| % $._config, '{{instance}} {{device}}', legendLink) +
 | |
|           g.stack +
 | |
|           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
 | |
|         )
 | |
|         .addPanel(
 | |
|           g.panel('Disk IO Saturation') +
 | |
|           g.queryPanel(|||
 | |
|             (
 | |
|               instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}
 | |
|             / ignoring (instance, device) group_left
 | |
|               count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s})
 | |
|             )
 | |
|           ||| % $._config, '{{instance}} {{device}}', legendLink) +
 | |
|           g.stack +
 | |
|           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
 | |
|         )
 | |
|       )
 | |
|       .addRow(
 | |
|         g.row('Disk Space')
 | |
|         .addPanel(
 | |
|           g.panel('Disk Space Utilisation') +
 | |
|           g.queryPanel(|||
 | |
|             (
 | |
|               sum without (device) (
 | |
|                 max without (fstype, mountpoint) (
 | |
|                   node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
 | |
|                 )
 | |
|               ) 
 | |
|             / ignoring (instance) group_left
 | |
|               sum without (instance, device) (
 | |
|                 max without (fstype, mountpoint) (
 | |
|                   node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
 | |
|                 )
 | |
|               )
 | |
|             )  
 | |
|           ||| % $._config, '{{instance}}', legendLink) +
 | |
|           g.stack +
 | |
|           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
 | |
|         ),
 | |
|       ),
 | |
| 
 | |
|     'node-rsrc-use.json':
 | |
|       g.dashboard('USE Method / Node')
 | |
|       .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance')
 | |
|       .addRow(
 | |
|         g.row('CPU')
 | |
|         .addPanel(
 | |
|           g.panel('CPU Utilisation') +
 | |
|           g.queryPanel('instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') +
 | |
|           {
 | |
|             yaxes: g.yaxes('percentunit'),
 | |
|             legend+: { show: false },
 | |
|           },
 | |
|         )
 | |
|         .addPanel(
 | |
|           // TODO: Is this a useful panel? At least there should be some explanation how load
 | |
|           // average relates to the "CPU saturation" in the title.
 | |
|           g.panel('CPU Saturation (Load1 per CPU)') +
 | |
|           g.queryPanel('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') +
 | |
|           {
 | |
|             yaxes: g.yaxes('percentunit'),
 | |
|             legend+: { show: false },
 | |
|           },
 | |
|         )
 | |
|       )
 | |
|       .addRow(
 | |
|         g.row('Memory')
 | |
|         .addPanel(
 | |
|           g.panel('Memory Utilisation') +
 | |
|           g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Memory') +
 | |
|           { yaxes: g.yaxes('percentunit') },
 | |
|         )
 | |
|         .addPanel(
 | |
|           g.panel('Memory Saturation (Major Page Faults)') +
 | |
|           g.queryPanel('instance:node_vmstat_pgmajfault:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Major page faults') +
 | |
|           {
 | |
|             yaxes: g.yaxes('short'),
 | |
|             legend+: { show: false },
 | |
|           },
 | |
|         )
 | |
|       )
 | |
|       .addRow(
 | |
|         g.row('Net')
 | |
|         .addPanel(
 | |
|           g.panel('Net Utilisation (Bytes Receive/Transmit)') +
 | |
|           g.queryPanel(
 | |
|             [
 | |
|               'instance:node_network_receive_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
 | |
|               'instance:node_network_transmit_bytes_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
 | |
|             ],
 | |
|             ['Receive', 'Transmit'],
 | |
|           ) +
 | |
|           {
 | |
|             yaxes: g.yaxes({ format: 'Bps', min: null }),
 | |
|             seriesOverrides: [
 | |
|               {
 | |
|                 alias: '/Receive/',
 | |
|                 stack: 'A',
 | |
|               },
 | |
|               {
 | |
|                 alias: '/Transmit/',
 | |
|                 stack: 'B',
 | |
|                 transform: 'negative-Y',
 | |
|               },
 | |
|             ],
 | |
|           },
 | |
|         )
 | |
|         .addPanel(
 | |
|           g.panel('Net Saturation (Drops Receive/Transmit)') +
 | |
|           g.queryPanel(
 | |
|             [
 | |
|               'instance:node_network_receive_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
 | |
|               'instance:node_network_transmit_drop_excluding_lo:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config,
 | |
|             ],
 | |
|             ['Receive drops', 'Transmit drops'],
 | |
|           ) +
 | |
|           {
 | |
|             yaxes: g.yaxes({ format: 'rps', min: null }),
 | |
|             seriesOverrides: [
 | |
|               {
 | |
|                 alias: '/Receive/',
 | |
|                 stack: 'A',
 | |
|               },
 | |
|               {
 | |
|                 alias: '/Transmit/',
 | |
|                 stack: 'B',
 | |
|                 transform: 'negative-Y',
 | |
|               },
 | |
|             ],
 | |
|           },
 | |
|         )
 | |
|       )
 | |
|       .addRow(
 | |
|         g.row('Disk IO')
 | |
|         .addPanel(
 | |
|           g.panel('Disk IO Utilisation') +
 | |
|           g.queryPanel('instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, '{{device}}') +
 | |
|           { yaxes: g.yaxes('percentunit') },
 | |
|         )
 | |
|         .addPanel(
 | |
|           g.panel('Disk IO Saturation') +
 | |
|           g.queryPanel('instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s, instance="$instance"}' % $._config, '{{device}}') +
 | |
|           { yaxes: g.yaxes('percentunit') },
 | |
|         )
 | |
|       )
 | |
|       .addRow(
 | |
|         g.row('Disk Space')
 | |
|         .addPanel(
 | |
|           g.panel('Disk Space Utilisation') +
 | |
|           g.queryPanel(|||
 | |
|             1 -
 | |
|             (
 | |
|               max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"})
 | |
|             /
 | |
|               max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"})
 | |
|             )
 | |
|           ||| % $._config, '{{device}}') +
 | |
|           {
 | |
|             yaxes: g.yaxes('percentunit'),
 | |
|             legend+: { show: false },
 | |
|           },
 | |
|         ),
 | |
|       ),
 | |
|   },
 | |
| }
 |