mirror of
				https://github.com/prometheus/node_exporter.git
				synced 2025-08-20 18:33:52 -07:00 
			
		
		
		
	Replace load average with PSI metric
The load average metric is misleading as a representation of CPU saturation. Normal CPU utilization is a more realistic representation of saturation. On newer Linux, there is a new Pressure Stall Information[0] metric that better represents CPU over-saturation. This is also useful as it can make single-core saturation more visible. [0]: https://www.kernel.org/doc/html/latest/accounting/psi.html Signed-off-by: Ben Kochie <superq@gmail.com>
This commit is contained in:
		
							parent
							
								
									aea88e4dc5
								
							
						
					
					
						commit
						d90b2d83d7
					
				|  | @ -16,7 +16,7 @@ | |||
|             ||| % $._config, | ||||
|           }, | ||||
|           { | ||||
|             // CPU utilisation is % CPU is not idle. | ||||
|             // CPU utilisation is % CPU is not idle. This represents CPU saturation. | ||||
|             record: 'instance:node_cpu_utilisation:rate%(rateInterval)s' % $._config, | ||||
|             expr: ||| | ||||
|               1 - avg without (cpu, mode) ( | ||||
|  | @ -25,17 +25,14 @@ | |||
|             ||| % $._config, | ||||
|           }, | ||||
|           { | ||||
|             // This is CPU saturation: 1min avg run queue length / number of CPUs. | ||||
|             // Can go over 1. | ||||
|             // TODO: There are situations where a run queue >1/core is just normal and fine. | ||||
|             //       We need to clarify how to read this metric and if its usage is helpful at all. | ||||
|             record: 'instance:node_load1_per_cpu:ratio', | ||||
|             // CPU pressure represents over-saturation. This is the amount of CPU seconds | ||||
|             // requested, but the kernel was not able to schedule. | ||||
|             // NOTE: This is only available on Linux >= 4.19 and when `CONFIG_PSI` is enabled. | ||||
|             // See also: | ||||
|             // - https://www.kernel.org/doc/html/latest/accounting/psi.html | ||||
|             // - https://facebookmicrosites.github.io/psi/docs/overview | ||||
|             expr: ||| | ||||
|               ( | ||||
|                 node_load1{%(nodeExporterSelector)s} | ||||
|               / | ||||
|                 instance:node_num_cpu:sum{%(nodeExporterSelector)s} | ||||
|               ) | ||||
|               rate(node_pressure_cpu_waiting_seconds_total{%(nodeExporterSelector)s}[%(rateInterval)s]) | ||||
|             ||| % $._config, | ||||
|           }, | ||||
|           { | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue