| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  | { | 
					
						
							|  |  |  |   prometheusRules+:: { | 
					
						
							|  |  |  |     groups+: [ | 
					
						
							|  |  |  |       { | 
					
						
							| 
									
										
										
										
											2018-08-06 01:46:28 -07:00
										 |  |  |         name: 'node-exporter.rules', | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |         rules: [ | 
					
						
							|  |  |  |           { | 
					
						
							|  |  |  |             // This rule gives the number of CPUs per node. | 
					
						
							|  |  |  |             record: 'instance:node_num_cpu:sum', | 
					
						
							|  |  |  |             expr: ||| | 
					
						
							| 
									
										
										
										
											2019-07-12 13:58:43 -07:00
										 |  |  |               count without (cpu) ( | 
					
						
							| 
									
										
										
										
											2019-07-17 14:54:31 -07:00
										 |  |  |                 count without (mode) ( | 
					
						
							| 
									
										
										
										
											2018-11-19 07:00:48 -08:00
										 |  |  |                   node_cpu_seconds_total{%(nodeExporterSelector)s} | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |                 ) | 
					
						
							|  |  |  |               ) | 
					
						
							|  |  |  |             ||| % $._config, | 
					
						
							|  |  |  |           }, | 
					
						
							|  |  |  |           { | 
					
						
							|  |  |  |             // CPU utilisation is the fraction of time the CPU is not idle. | 
					
						
							| 
									
										
										
										
											2019-07-16 12:18:17 -07:00
										 |  |  |             record: 'instance:node_cpu_utilisation:avg_rate1m', | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |             expr: ||| | 
					
						
							| 
									
										
										
										
											2019-07-12 13:58:43 -07:00
										 |  |  |               1 - avg without (cpu, mode) ( | 
					
						
							|  |  |  |                 rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m]) | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |               ) | 
					
						
							|  |  |  |             ||| % $._config, | 
					
						
							|  |  |  |           }, | 
					
						
							|  |  |  |           { | 
					
						
							| 
									
										
										
										
											2019-07-12 13:58:43 -07:00
										 |  |  |             // This is CPU saturation: 1min avg run queue length / number of CPUs. | 
					
						
							| 
									
										
										
										
											2019-07-17 14:54:31 -07:00
										 |  |  |             // Can go over 1. | 
					
						
							|  |  |  |             // TODO: There are situations where a run queue >1/core is just normal and fine. | 
					
						
							|  |  |  |             //       We need to clarify how to read this metric and if its usage is helpful at all. | 
					
						
							| 
									
										
										
										
											2019-07-12 13:58:43 -07:00
										 |  |  |             record: 'instance:node_load1_per_cpu:ratio', | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |             expr: ||| | 
					
						
							| 
									
										
										
										
											2019-07-10 11:07:20 -07:00
										 |  |  |               ( | 
					
						
							| 
									
										
										
										
											2019-07-12 13:58:43 -07:00
										 |  |  |                 node_load1{%(nodeExporterSelector)s} | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |               / | 
					
						
							| 
									
										
										
										
											2019-07-12 13:58:43 -07:00
										 |  |  |                 instance:node_num_cpu:sum{%(nodeExporterSelector)s} | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |               ) | 
					
						
							|  |  |  |             ||| % $._config, | 
					
						
							|  |  |  |           }, | 
					
						
							|  |  |  |           { | 
					
						
							|  |  |  |             // Memory utilisation per node, normalized by per-node memory | 
					
						
							|  |  |  |             record: 'instance:node_memory_utilisation:ratio', | 
					
						
							|  |  |  |             expr: ||| | 
					
						
							| 
									
										
										
										
											2018-05-10 02:35:48 -07:00
										 |  |  |               1 - ( | 
					
						
							| 
									
										
										
										
											2019-07-10 11:07:20 -07:00
										 |  |  |                 node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} | 
					
						
							|  |  |  |               / | 
					
						
							|  |  |  |                 node_memory_MemTotal_bytes{%(nodeExporterSelector)s} | 
					
						
							| 
									
										
										
										
											2018-05-10 02:35:48 -07:00
										 |  |  |               ) | 
					
						
							| 
									
										
										
										
											2018-07-13 06:01:01 -07:00
										 |  |  |             ||| % $._config, | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |           }, | 
					
						
							|  |  |  |           { | 
					
						
							| 
									
										
										
										
											2019-07-16 12:18:17 -07:00
										 |  |  |             record: 'instance:node_memory_swap_io_pages:rate1m', | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |             expr: ||| | 
					
						
							| 
									
										
										
										
											2019-07-12 13:58:43 -07:00
										 |  |  |               ( | 
					
						
							| 
									
										
										
										
											2019-07-10 11:07:20 -07:00
										 |  |  |                 rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m]) | 
					
						
							|  |  |  |               + | 
					
						
							|  |  |  |                 rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m]) | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |               ) | 
					
						
							|  |  |  |             ||| % $._config, | 
					
						
							|  |  |  |           }, | 
					
						
							|  |  |  |           { | 
					
						
							| 
									
										
										
										
											2019-07-16 12:18:17 -07:00
										 |  |  |             // Disk utilisation (seconds spent per second, rate over 1m) | 
					
						
							| 
									
										
										
										
											2019-07-17 14:54:31 -07:00
										 |  |  |             // TODO: This should probably not aggregate over all devices but | 
					
						
							|  |  |  |             //       keep them separate. | 
					
						
							|  |  |  |             record: 'instance:node_disk_io_time_seconds:sum_rate1m', | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |             expr: ||| | 
					
						
							| 
									
										
										
										
											2019-07-12 13:58:43 -07:00
										 |  |  |               sum without (device) ( | 
					
						
							| 
									
										
										
										
											2019-07-16 12:18:17 -07:00
										 |  |  |                 rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |               ) | 
					
						
							|  |  |  |             ||| % $._config, | 
					
						
							|  |  |  |           }, | 
					
						
							|  |  |  |           { | 
					
						
							| 
									
										
										
										
											2019-07-16 12:18:17 -07:00
										 |  |  |             // Disk saturation (weighted seconds spent per second, rate over 1m) | 
					
						
							| 
									
										
										
										
											2019-07-17 14:54:31 -07:00
										 |  |  |             // TODO: This should probably not aggregate over all devices but | 
					
						
							|  |  |  |             //       keep them separate. | 
					
						
							|  |  |  |             record: 'instance:node_disk_io_time_weighted_seconds:sum_rate1m', | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |             expr: ||| | 
					
						
							| 
									
										
										
										
											2019-07-12 13:58:43 -07:00
										 |  |  |               sum without (device) ( | 
					
						
							| 
									
										
										
										
											2019-07-16 12:18:17 -07:00
										 |  |  |                 rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[1m]) | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |               ) | 
					
						
							|  |  |  |             ||| % $._config, | 
					
						
							|  |  |  |           }, | 
					
						
							| 
									
										
										
										
											2019-07-16 12:18:17 -07:00
										 |  |  |           // TODO: For the following rules, consider configurable filtering to exclude more network | 
					
						
							| 
									
										
										
										
											2019-07-12 13:58:43 -07:00
										 |  |  |           // device names than just "lo". | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |           { | 
					
						
							| 
									
										
										
										
											2019-07-16 12:18:17 -07:00
										 |  |  |             record: 'instance:node_network_receive_bytes:sum_rate1m', | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |             expr: ||| | 
					
						
							| 
									
										
										
										
											2019-07-12 13:58:43 -07:00
										 |  |  |               sum without (device) ( | 
					
						
							| 
									
										
										
										
											2019-07-16 12:18:17 -07:00
										 |  |  |                 rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |               ) | 
					
						
							|  |  |  |             ||| % $._config, | 
					
						
							|  |  |  |           }, | 
					
						
							|  |  |  |           { | 
					
						
							| 
									
										
										
										
											2019-07-16 12:18:17 -07:00
										 |  |  |             record: 'instance:node_network_transmit_bytes:sum_rate1m', | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |             expr: ||| | 
					
						
							| 
									
										
										
										
											2019-07-12 13:58:43 -07:00
										 |  |  |               sum without (device) ( | 
					
						
							| 
									
										
										
										
											2019-07-16 12:18:17 -07:00
										 |  |  |                 rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m]) | 
					
						
							|  |  |  |               ) | 
					
						
							|  |  |  |             ||| % $._config, | 
					
						
							|  |  |  |           }, | 
					
						
							| 
									
										
										
										
											2019-07-17 14:54:31 -07:00
										 |  |  |           // TODO: Find out if those drops ever happen on modern switched networks. | 
					
						
							| 
									
										
										
										
											2019-07-16 12:18:17 -07:00
										 |  |  |           { | 
					
						
							|  |  |  |             record: 'instance:node_network_receive_drop:sum_rate1m', | 
					
						
							|  |  |  |             expr: ||| | 
					
						
							|  |  |  |               sum without (device) ( | 
					
						
							|  |  |  |                 rate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) | 
					
						
							|  |  |  |               ) | 
					
						
							|  |  |  |             ||| % $._config, | 
					
						
							|  |  |  |           }, | 
					
						
							|  |  |  |           { | 
					
						
							|  |  |  |             record: 'instance:node_network_transmit_drop:sum_rate1m', | 
					
						
							|  |  |  |             expr: ||| | 
					
						
							|  |  |  |               sum without (device) ( | 
					
						
							|  |  |  |                 rate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m]) | 
					
						
							| 
									
										
										
										
											2018-05-08 03:10:29 -07:00
										 |  |  |               ) | 
					
						
							|  |  |  |             ||| % $._config, | 
					
						
							|  |  |  |           }, | 
					
						
							|  |  |  |         ], | 
					
						
							|  |  |  |       }, | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  |   }, | 
					
						
							|  |  |  | } |