mirror of
				https://github.com/prometheus/node_exporter.git
				synced 2025-08-20 18:33:52 -07:00 
			
		
		
		
	Replace load average with PSI metric
The load average metric is misleading as a representation of CPU saturation. Normal CPU utilization is a better representation of saturation. On newer Linux, there is a new Pressure Stall Information[0] metric that better represents CPU over-saturation. This is also useful as it can make single-core saturation more visible. [0]: https://www.kernel.org/doc/html/latest/accounting/psi.html Signed-off-by: Ben Kochie <superq@gmail.com>
This commit is contained in:
		
							parent
							
								
									aea88e4dc5
								
							
						
					
					
						commit
						d90b2d83d7
					
				|  | @ -16,7 +16,7 @@ | ||||||
|             ||| % $._config, |             ||| % $._config, | ||||||
|           }, |           }, | ||||||
|           { |           { | ||||||
|             // CPU utilisation is % CPU is not idle. |             // CPU utilisation is % CPU is not idle. This represents CPU saturation. | ||||||
|             record: 'instance:node_cpu_utilisation:rate%(rateInterval)s' % $._config, |             record: 'instance:node_cpu_utilisation:rate%(rateInterval)s' % $._config, | ||||||
|             expr: ||| |             expr: ||| | ||||||
|               1 - avg without (cpu, mode) ( |               1 - avg without (cpu, mode) ( | ||||||
|  | @ -25,17 +25,14 @@ | ||||||
|             ||| % $._config, |             ||| % $._config, | ||||||
|           }, |           }, | ||||||
|           { |           { | ||||||
|             // This is CPU saturation: 1min avg run queue length / number of CPUs. |             // CPU pressure represents over-saturation. This is the amount of CPU seconds | ||||||
|             // Can go over 1. |             // requested, but the kernel was not able to schedule. | ||||||
|             // TODO: There are situation where a run queue >1/core is just normal and fine. |             // NOTE: This is only available on Linux >= 4.19 when `CONFIG_PSI` is enabled. | ||||||
|             //       We need to clarify how to read this metric and if its usage is helpful at all. |             // See also: | ||||||
|             record: 'instance:node_load1_per_cpu:ratio', |             // - https://www.kernel.org/doc/html/latest/accounting/psi.html | ||||||
|  |             // - https://facebookmicrosites.github.io/psi/docs/overview | ||||||
|             expr: ||| |             expr: ||| | ||||||
|               ( |               rate(node_pressure_cpu_waiting_seconds_total{%(nodeExporterSelector)s}[%(rateInterval)s]) | ||||||
|                 node_load1{%(nodeExporterSelector)s} |  | ||||||
|               / |  | ||||||
|                 instance:node_num_cpu:sum{%(nodeExporterSelector)s} |  | ||||||
|               ) |  | ||||||
|             ||| % $._config, |             ||| % $._config, | ||||||
|           }, |           }, | ||||||
|           { |           { | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue