mirror of
				https://github.com/prometheus/node_exporter.git
				synced 2025-08-20 18:33:52 -07:00 
			
		
		
		
	Refactor USE method mixin dashboards with grafonnet-lib, add multi-cluster support.
Aiming for cleaner code and following standards used on younger mixins. Signed-off-by: ArthurSens <arthursens2005@gmail.com>
This commit is contained in:
		
							parent
							
								
									129b5f5b5f
								
							
						
					
					
						commit
						3731f93fd7
					
				|  | @ -52,8 +52,12 @@ | ||||||
|     fsSpaceAvailableCriticalThreshold: 5, |     fsSpaceAvailableCriticalThreshold: 5, | ||||||
|     fsSpaceAvailableWarningThreshold: 3, |     fsSpaceAvailableWarningThreshold: 3, | ||||||
| 
 | 
 | ||||||
|     grafana_prefix: '', |  | ||||||
| 
 |  | ||||||
|     rateInterval: '5m', |     rateInterval: '5m', | ||||||
|  |     // Opt-in for multi-cluster support. | ||||||
|  |     showMultiCluster: false, | ||||||
|  |     clusterLabel: 'cluster', | ||||||
|  | 
 | ||||||
|  |     dashboardNamePrefix: 'Node Exporter / ', | ||||||
|  |     dashboardTags: ['node-exporter-mixin'], | ||||||
|   }, |   }, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -200,7 +200,14 @@ local gauge = promgrafonnet.gauge; | ||||||
|           legendFormat='{{device}}', |           legendFormat='{{device}}', | ||||||
|         )); |         )); | ||||||
| 
 | 
 | ||||||
|       dashboard.new('Nodes', time_from='now-1h') |       dashboard.new( | ||||||
|  |         '%sNodes' % $._config.dashboardNamePrefix, | ||||||
|  |         time_from='now-1h', | ||||||
|  |         tags=($._config.dashboardTags), | ||||||
|  |         timezone='utc', | ||||||
|  |         refresh='30s', | ||||||
|  |         graphTooltip='shared_crosshair' | ||||||
|  |       ) | ||||||
|       .addTemplate( |       .addTemplate( | ||||||
|         { |         { | ||||||
|           current: { |           current: { | ||||||
|  |  | ||||||
|  | @ -1,275 +1,467 @@ | ||||||
| local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet'; | local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; | ||||||
|  | local dashboard = grafana.dashboard; | ||||||
|  | local row = grafana.row; | ||||||
|  | local prometheus = grafana.prometheus; | ||||||
|  | local template = grafana.template; | ||||||
|  | local graphPanel = grafana.graphPanel; | ||||||
|  | 
 | ||||||
|  | local c = import '../config.libsonnet'; | ||||||
|  | 
 | ||||||
|  | // Template variable of type 'datasource' named 'datasource'; the other | ||||||
|  | // definitions in this file refer to it as '$datasource'. | ||||||
|  | local datasourceTemplate = { | ||||||
|  |   type: 'datasource', | ||||||
|  |   name: 'datasource', | ||||||
|  |   query: 'prometheus', | ||||||
|  |   current: { text: 'Prometheus', value: 'Prometheus' }, | ||||||
|  |   label: null, | ||||||
|  |   hide: 0, | ||||||
|  |   refresh: 1, | ||||||
|  |   regex: '', | ||||||
|  |   options: [], | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | // Template variable 'cluster', populated from the values of the configured | ||||||
|  | // cluster label on node_time_seconds. It is passed hide='2' unless | ||||||
|  | // showMultiCluster is enabled in the config. | ||||||
|  | local clusterTemplate = | ||||||
|  |   local clusterQuery = 'label_values(node_time_seconds, %s)' % c._config.clusterLabel; | ||||||
|  |   local hideSetting = if c._config.showMultiCluster then '' else '2'; | ||||||
|  |   template.new( | ||||||
|  |     name='cluster', | ||||||
|  |     datasource='$datasource', | ||||||
|  |     query=clusterQuery, | ||||||
|  |     current='', | ||||||
|  |     hide=hideSetting, | ||||||
|  |     refresh=2, | ||||||
|  |     includeAll=false, | ||||||
|  |     sort=1 | ||||||
|  |   ); | ||||||
|  | 
 | ||||||
|  | // Base graph panel for CPU utilisation (no targets attached here): | ||||||
|  | // half-width, stacked, 'percentunit' axis, legend hidden. | ||||||
|  | local CPUUtilisation = | ||||||
|  |   local panel = graphPanel.new( | ||||||
|  |     'CPU Utilisation', | ||||||
|  |     span=6, | ||||||
|  |     datasource='$datasource', | ||||||
|  |     format='percentunit', | ||||||
|  |     fill=10, | ||||||
|  |     stack=true, | ||||||
|  |     legend_show=false, | ||||||
|  |   ); | ||||||
|  |   // Tooltip sort mode 2 (presumably "decreasing" in Grafana - confirm). | ||||||
|  |   panel + { tooltip+: { sort: 2 } }; | ||||||
|  | 
 | ||||||
|  | // Base graph panel for CPU saturation (no targets attached here): | ||||||
|  | // half-width, stacked, 'percentunit' axis, legend hidden. | ||||||
|  | local CPUSaturation = | ||||||
|  |   // TODO: Is this a useful panel? At least there should be some explanation how load | ||||||
|  |   // average relates to the "CPU saturation" in the title. | ||||||
|  |   local panel = graphPanel.new( | ||||||
|  |     'CPU Saturation (Load1 per CPU)', | ||||||
|  |     span=6, | ||||||
|  |     datasource='$datasource', | ||||||
|  |     format='percentunit', | ||||||
|  |     fill=10, | ||||||
|  |     stack=true, | ||||||
|  |     legend_show=false, | ||||||
|  |   ); | ||||||
|  |   // Tooltip sort mode 2 (presumably "decreasing" in Grafana - confirm). | ||||||
|  |   panel + { tooltip+: { sort: 2 } }; | ||||||
|  | 
 | ||||||
|  | // Base graph panel for memory utilisation (no targets attached here): | ||||||
|  | // half-width, stacked, 'percentunit' axis, legend hidden. | ||||||
|  | local memoryUtilisation = | ||||||
|  |   local panel = graphPanel.new( | ||||||
|  |     'Memory Utilisation', | ||||||
|  |     span=6, | ||||||
|  |     datasource='$datasource', | ||||||
|  |     format='percentunit', | ||||||
|  |     fill=10, | ||||||
|  |     stack=true, | ||||||
|  |     legend_show=false, | ||||||
|  |   ); | ||||||
|  |   // Tooltip sort mode 2 (presumably "decreasing" in Grafana - confirm). | ||||||
|  |   panel + { tooltip+: { sort: 2 } }; | ||||||
|  | 
 | ||||||
|  | // Base graph panel for memory saturation, expressed as the rate of major | ||||||
|  | // page faults (no targets attached here): half-width, stacked, legend hidden. | ||||||
|  | local memorySaturation = | ||||||
|  |   graphPanel.new( | ||||||
|  |     'Memory Saturation (Major Page Faults)', | ||||||
|  |     datasource='$datasource', | ||||||
|  |     span=6, | ||||||
|  |     // 'rps' is a per-second rate unit; the previous value 'rds' is not a | ||||||
|  |     // Grafana unit id, so the axis fell back to unformatted numbers. | ||||||
|  |     format='rps', | ||||||
|  |     stack=true, | ||||||
|  |     fill=10, | ||||||
|  |     legend_show=false, | ||||||
|  |   ) { tooltip+: { sort: 2 } }; | ||||||
|  | 
 | ||||||
|  | // Base graph panel for network throughput in bytes/sec (no targets attached | ||||||
|  | // here). Series whose alias matches "Receive" and "Transmit" are put in | ||||||
|  | // separate stacks, and Transmit is drawn on the negative Y axis. | ||||||
|  | local networkUtilisation = | ||||||
|  |   local receiveOverride = { alias: '/Receive/', stack: 'A' }; | ||||||
|  |   local transmitOverride = { alias: '/Transmit/', stack: 'B', transform: 'negative-Y' }; | ||||||
|  |   local panel = graphPanel.new( | ||||||
|  |     'Network Utilisation (Bytes Receive/Transmit)', | ||||||
|  |     span=6, | ||||||
|  |     datasource='$datasource', | ||||||
|  |     format='Bps', | ||||||
|  |     fill=10, | ||||||
|  |     stack=true, | ||||||
|  |     legend_show=false, | ||||||
|  |   ); | ||||||
|  |   panel | ||||||
|  |   .addSeriesOverride(receiveOverride) | ||||||
|  |   .addSeriesOverride(transmitOverride) | ||||||
|  |   + { tooltip+: { sort: 2 } }; | ||||||
|  | 
 | ||||||
|  | // Base graph panel for network saturation, expressed as the rate of dropped | ||||||
|  | // packets (no targets attached here). Receive and Transmit series go into | ||||||
|  | // separate stacks, and Transmit is drawn on the negative Y axis. | ||||||
|  | local networkSaturation = | ||||||
|  |   graphPanel.new( | ||||||
|  |     'Network Saturation (Drops Receive/Transmit)', | ||||||
|  |     datasource='$datasource', | ||||||
|  |     span=6, | ||||||
|  |     // Drops are packet counts, so use a packets/sec unit rather than bytes/sec. | ||||||
|  |     format='pps', | ||||||
|  |     stack=true, | ||||||
|  |     fill=10, | ||||||
|  |     legend_show=false, | ||||||
|  |   ) | ||||||
|  |   // No leading space in the alias regexes: they must match both the bare | ||||||
|  |   // 'Receive'/'Transmit' legends and the '{{instance}} Receive' style ones. | ||||||
|  |   .addSeriesOverride({ alias: '/Receive/', stack: 'A' }) | ||||||
|  |   .addSeriesOverride({ alias: '/Transmit/', stack: 'B', transform: 'negative-Y' }) | ||||||
|  |   { tooltip+: { sort: 2 } }; | ||||||
|  | 
 | ||||||
|  | // Base graph panel for disk IO utilisation (no targets attached here): | ||||||
|  | // half-width, stacked, 'percentunit' axis, legend hidden. | ||||||
|  | local diskIOUtilisation = | ||||||
|  |   local panel = graphPanel.new( | ||||||
|  |     'Disk IO Utilisation', | ||||||
|  |     span=6, | ||||||
|  |     datasource='$datasource', | ||||||
|  |     format='percentunit', | ||||||
|  |     fill=10, | ||||||
|  |     stack=true, | ||||||
|  |     legend_show=false, | ||||||
|  |   ); | ||||||
|  |   // Tooltip sort mode 2 (presumably "decreasing" in Grafana - confirm). | ||||||
|  |   panel + { tooltip+: { sort: 2 } }; | ||||||
|  | 
 | ||||||
|  | // Base graph panel for disk IO saturation (no targets attached here): | ||||||
|  | // half-width, stacked, 'percentunit' axis, legend hidden. | ||||||
|  | local diskIOSaturation = | ||||||
|  |   local panel = graphPanel.new( | ||||||
|  |     'Disk IO Saturation', | ||||||
|  |     span=6, | ||||||
|  |     datasource='$datasource', | ||||||
|  |     format='percentunit', | ||||||
|  |     fill=10, | ||||||
|  |     stack=true, | ||||||
|  |     legend_show=false, | ||||||
|  |   ); | ||||||
|  |   // Tooltip sort mode 2 (presumably "decreasing" in Grafana - confirm). | ||||||
|  |   panel + { tooltip+: { sort: 2 } }; | ||||||
|  | 
 | ||||||
|  | // Base graph panel for disk space utilisation (no targets attached here): | ||||||
|  | // full-width (span 12), stacked, 'percentunit' axis, legend hidden. | ||||||
|  | local diskSpaceUtilisation = | ||||||
|  |   local panel = graphPanel.new( | ||||||
|  |     'Disk Space Utilisation', | ||||||
|  |     span=12, | ||||||
|  |     datasource='$datasource', | ||||||
|  |     format='percentunit', | ||||||
|  |     fill=10, | ||||||
|  |     stack=true, | ||||||
|  |     legend_show=false, | ||||||
|  |   ); | ||||||
|  |   // Tooltip sort mode 2 (presumably "decreasing" in Grafana - confirm). | ||||||
|  |   panel + { tooltip+: { sort: 2 } }; | ||||||
| 
 | 
 | ||||||
| { | { | ||||||
|   grafanaDashboards+:: { |   grafanaDashboards+:: { | ||||||
|     'node-cluster-rsrc-use.json': |                          'node-rsrc-use.json': | ||||||
|       local legendLink = '%s/dashboard/file/node-rsrc-use.json' % $._config.grafana_prefix; |  | ||||||
| 
 | 
 | ||||||
|       g.dashboard('USE Method / Cluster') |                            dashboard.new( | ||||||
|       .addRow( |                              '%sUSE Method / Node' % $._config.dashboardNamePrefix, | ||||||
|         g.row('CPU') |                              time_from='now-1h', | ||||||
|         .addPanel( |                              tags=($._config.dashboardTags), | ||||||
|           g.panel('CPU Utilisation') + |                              timezone='utc', | ||||||
|           g.queryPanel(||| |                              refresh='30s', | ||||||
|             ( |                              graphTooltip='shared_crosshair' | ||||||
|               instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s} |                            ) | ||||||
|             * |                            .addTemplate(datasourceTemplate) | ||||||
|               instance:node_num_cpu:sum{%(nodeExporterSelector)s} |                            .addTemplate(clusterTemplate) | ||||||
|             ) |                            .addTemplate( | ||||||
|             / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s})) |                              template.new( | ||||||
|           ||| % $._config, '{{instance}}', legendLink) + |                                'instance', | ||||||
|           g.stack + |                                '$datasource', | ||||||
|           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |                                'label_values(node_exporter_build_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}, instance)' % $._config, | ||||||
|         ) |                                refresh='time', | ||||||
|         .addPanel( |                                sort=1 | ||||||
|           // TODO: Is this a useful panel? At least there should be some explanation how load |                              ) | ||||||
|           // average relates to the "CPU saturation" in the title. |                            ) | ||||||
|           g.panel('CPU Saturation (load1 per CPU)') + |                            .addRow( | ||||||
|           g.queryPanel(||| |                              row.new('CPU') | ||||||
|             instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s} |                              .addPanel(CPUUtilisation.addTarget(prometheus.target('instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Utilisation'))) | ||||||
|             / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})) |                              .addPanel(CPUSaturation.addTarget(prometheus.target('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Saturation'))) | ||||||
|           ||| % $._config, '{{instance}}', legendLink) + |                            ) | ||||||
|           g.stack + |                            .addRow( | ||||||
|           // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios. |                              row.new('Memory') | ||||||
|           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |                              .addPanel(memoryUtilisation.addTarget(prometheus.target('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Utilisation'))) | ||||||
|         ) |                              .addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Major page Faults'))) | ||||||
|       ) |                            ) | ||||||
|       .addRow( |                            .addRow( | ||||||
|         g.row('Memory') |                              row.new('Network') | ||||||
|         .addPanel( |                              .addPanel( | ||||||
|           g.panel('Memory Utilisation') + |                                networkUtilisation | ||||||
|           g.queryPanel(||| |                                .addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Receive')) | ||||||
|             instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s} |                                .addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Transmit')) | ||||||
|             / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s})) |                              ) | ||||||
|           ||| % $._config, '{{instance}}', legendLink) + |                              .addPanel( | ||||||
|           g.stack + |                                networkSaturation | ||||||
|           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |                                .addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Receive')) | ||||||
|         ) |                                .addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Transmit')) | ||||||
|         .addPanel( |                              ) | ||||||
|           g.panel('Memory Saturation (Major Page Faults)') + |                            ) | ||||||
|           g.queryPanel('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s}' % $._config, '{{instance}}', legendLink) + |                            .addRow( | ||||||
|           g.stack + |                              row.new('Disk IO') | ||||||
|           { yaxes: g.yaxes('rps') }, |                              .addPanel(diskIOUtilisation.addTarget(prometheus.target('instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{device}}'))) | ||||||
|         ) |                              .addPanel(diskIOSaturation.addTarget(prometheus.target('instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{device}}'))) | ||||||
|       ) |                            ) | ||||||
|       .addRow( |                            .addRow( | ||||||
|         g.row('Network') |                              row.new('Disk Space') | ||||||
|         .addPanel( |                              .addPanel( | ||||||
|           g.panel('Net Utilisation (Bytes Receive/Transmit)') + |                                diskSpaceUtilisation.addTarget(prometheus.target( | ||||||
|           g.queryPanel( |                                  ||| | ||||||
|             [ |                                    sort_desc(1 - | ||||||
|               'instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}' % $._config, |                                      ( | ||||||
|               'instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}' % $._config, |                                       max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"}) | ||||||
|             ], |                                       / | ||||||
|             ['{{instance}} Receive', '{{instance}} Transmit'], |                                       max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"}) | ||||||
|             legendLink, |                                      ) != 0 | ||||||
|           ) + |                                    ) | ||||||
|           g.stack + |                                  ||| % $._config, legendFormat='{{device}}' | ||||||
|           { |                                )) | ||||||
|             yaxes: g.yaxes({ format: 'Bps', min: null }), |                              ) | ||||||
|             seriesOverrides: [ |                            ), | ||||||
|               { |  | ||||||
|                 alias: '/ Receive/', |  | ||||||
|                 stack: 'A', |  | ||||||
|               }, |  | ||||||
|               { |  | ||||||
|                 alias: '/ Transmit/', |  | ||||||
|                 stack: 'B', |  | ||||||
|                 transform: 'negative-Y', |  | ||||||
|               }, |  | ||||||
|             ], |  | ||||||
|           }, |  | ||||||
|         ) |  | ||||||
|         .addPanel( |  | ||||||
|           g.panel('Net Saturation (Drops Receive/Transmit)') + |  | ||||||
|           g.queryPanel( |  | ||||||
|             [ |  | ||||||
|               'instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}' % $._config, |  | ||||||
|               'instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}' % $._config, |  | ||||||
|             ], |  | ||||||
|             ['{{instance}} Receive', '{{instance}} Transmit'], |  | ||||||
|             legendLink, |  | ||||||
|           ) + |  | ||||||
|           g.stack + |  | ||||||
|           { |  | ||||||
|             yaxes: g.yaxes({ format: 'rps', min: null }), |  | ||||||
|             seriesOverrides: [ |  | ||||||
|               { |  | ||||||
|                 alias: '/ Receive/', |  | ||||||
|                 stack: 'A', |  | ||||||
|               }, |  | ||||||
|               { |  | ||||||
|                 alias: '/ Transmit/', |  | ||||||
|                 stack: 'B', |  | ||||||
|                 transform: 'negative-Y', |  | ||||||
|               }, |  | ||||||
|             ], |  | ||||||
|           }, |  | ||||||
|         ) |  | ||||||
|       ) |  | ||||||
|       .addRow( |  | ||||||
|         g.row('Disk IO') |  | ||||||
|         .addPanel( |  | ||||||
|           g.panel('Disk IO Utilisation') + |  | ||||||
|           // Full utilisation would be all disks on each node spending an average of |  | ||||||
|           // 1 second per second doing I/O, normalize by metric cardinality for stacked charts. |  | ||||||
|           // TODO: Does the partition by device make sense? Using the most utilized device per |  | ||||||
|           // instance might make more sense. |  | ||||||
|           g.queryPanel(||| |  | ||||||
|             instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s} |  | ||||||
|             / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s})) |  | ||||||
|           ||| % $._config, '{{instance}} {{device}}', legendLink) + |  | ||||||
|           g.stack + |  | ||||||
|           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |  | ||||||
|         ) |  | ||||||
|         .addPanel( |  | ||||||
|           g.panel('Disk IO Saturation') + |  | ||||||
|           g.queryPanel(||| |  | ||||||
|             instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s} |  | ||||||
|             / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s})) |  | ||||||
|           ||| % $._config, '{{instance}} {{device}}', legendLink) + |  | ||||||
|           g.stack + |  | ||||||
|           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |  | ||||||
|         ) |  | ||||||
|       ) |  | ||||||
|       .addRow( |  | ||||||
|         g.row('Disk Space') |  | ||||||
|         .addPanel( |  | ||||||
|           g.panel('Disk Space Utilisation') + |  | ||||||
|           g.queryPanel(||| |  | ||||||
|             sum without (device) ( |  | ||||||
|               max without (fstype, mountpoint) ( |  | ||||||
|                 node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s} |  | ||||||
|               ) |  | ||||||
|             )  |  | ||||||
|             / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}))) |  | ||||||
|           ||| % $._config, '{{instance}}', legendLink) + |  | ||||||
|           g.stack + |  | ||||||
|           { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) }, |  | ||||||
|         ), |  | ||||||
|       ), |  | ||||||
| 
 | 
 | ||||||
|     'node-rsrc-use.json': |                          'node-cluster-rsrc-use.json': | ||||||
|       g.dashboard('USE Method / Node') |                            dashboard.new( | ||||||
|       .addTemplate('instance', 'up{%(nodeExporterSelector)s}' % $._config, 'instance') |                              '%sUSE Method / Cluster' % $._config.dashboardNamePrefix, | ||||||
|       .addRow( |                              time_from='now-1h', | ||||||
|         g.row('CPU') |                              tags=($._config.dashboardTags), | ||||||
|         .addPanel( |                              timezone='utc', | ||||||
|           g.panel('CPU Utilisation') + |                              refresh='30s', | ||||||
|           g.queryPanel('instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Utilisation') + |                              graphTooltip='shared_crosshair' | ||||||
|           { |                            ) | ||||||
|             yaxes: g.yaxes('percentunit'), |                            .addTemplate(datasourceTemplate) | ||||||
|             legend+: { show: false }, |                            .addTemplate(clusterTemplate) | ||||||
|           }, |                            .addRow( | ||||||
|         ) |                              row.new('CPU') | ||||||
|         .addPanel( |                              .addPanel( | ||||||
|           // TODO: Is this a useful panel? At least there should be some explanation how load |                                CPUUtilisation | ||||||
|           // average relates to the "CPU saturation" in the title. |                                .addTarget(prometheus.target( | ||||||
|           g.panel('CPU Saturation (Load1 per CPU)') + |                                  ||| | ||||||
|           g.queryPanel('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Saturation') + |                                    (( | ||||||
|           { |                                      instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} | ||||||
|             yaxes: g.yaxes('percentunit'), |                                      * | ||||||
|             legend+: { show: false }, |                                      instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} | ||||||
|           }, |                                    ) != 0 ) | ||||||
|         ) |                                    / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) | ||||||
|       ) |                                  ||| % $._config, legendFormat='{{ instance }}' | ||||||
|       .addRow( |                                )) | ||||||
|         g.row('Memory') |                              ) | ||||||
|         .addPanel( |                              .addPanel( | ||||||
|           g.panel('Memory Utilisation') + |                                CPUSaturation | ||||||
|           g.queryPanel('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Memory') + |                                .addTarget(prometheus.target( | ||||||
|           { yaxes: g.yaxes('percentunit') }, |                                  ||| | ||||||
|         ) |                                    ( | ||||||
|         .addPanel( |                                      instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} | ||||||
|           g.panel('Memory Saturation (Major Page Faults)') + |                                      / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) | ||||||
|           g.queryPanel('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance"}' % $._config, 'Major page faults') + |                                    )  != 0 | ||||||
|           { |                                  ||| % $._config, legendFormat='{{instance}}' | ||||||
|             yaxes: g.yaxes('short'), |                                )) | ||||||
|             legend+: { show: false }, |                              ) | ||||||
|           }, |                            ) | ||||||
|         ) |                            .addRow( | ||||||
|       ) |                              row.new('Memory') | ||||||
|       .addRow( |                              .addPanel( | ||||||
|         g.row('Net') |                                memoryUtilisation | ||||||
|         .addPanel( |                                .addTarget(prometheus.target( | ||||||
|           g.panel('Net Utilisation (Bytes Receive/Transmit)') + |                                  ||| | ||||||
|           g.queryPanel( |                                    ( | ||||||
|             [ |                                      instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} | ||||||
|               'instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance"}' % $._config, |                                      / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) | ||||||
|               'instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance"}' % $._config, |                                    ) != 0 | ||||||
|             ], |                                  ||| % $._config, legendFormat='{{instance}}', | ||||||
|             ['Receive', 'Transmit'], |                                )) | ||||||
|           ) + |                              ) | ||||||
|           { |                              .addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{instance}}'))) | ||||||
|             yaxes: g.yaxes({ format: 'Bps', min: null }), |                            ) | ||||||
|             seriesOverrides: [ |                            .addRow( | ||||||
|               { |                              row.new('Network') | ||||||
|                 alias: '/Receive/', |                              .addPanel( | ||||||
|                 stack: 'A', |                                networkUtilisation | ||||||
|               }, |                                .addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive')) | ||||||
|               { |                                .addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit')) | ||||||
|                 alias: '/Transmit/', |                              ) | ||||||
|                 stack: 'B', |                              .addPanel( | ||||||
|                 transform: 'negative-Y', |                                networkSaturation | ||||||
|               }, |                                .addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive')) | ||||||
|             ], |                                .addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit')) | ||||||
|           }, |                              ) | ||||||
|         ) |                            ) | ||||||
|         .addPanel( |                            .addRow( | ||||||
|           g.panel('Net Saturation (Drops Receive/Transmit)') + |                              row.new('Disk IO') | ||||||
|           g.queryPanel( |                              .addPanel( | ||||||
|             [ |                                diskIOUtilisation | ||||||
|               'instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance"}' % $._config, |                                .addTarget(prometheus.target( | ||||||
|               'instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance"}' % $._config, |                                  ||| | ||||||
|             ], |                                    ( | ||||||
|             ['Receive drops', 'Transmit drops'], |                                      instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} | ||||||
|           ) + |                                      / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) | ||||||
|           { |                                    ) != 0 | ||||||
|             yaxes: g.yaxes({ format: 'rps', min: null }), |                                  ||| % $._config, legendFormat='{{instance}} {{device}}' | ||||||
|             seriesOverrides: [ |                                )) | ||||||
|               { |                              ) | ||||||
|                 alias: '/Receive/', |                              .addPanel( | ||||||
|                 stack: 'A', |                                diskIOSaturation | ||||||
|               }, |                                .addTarget(prometheus.target( | ||||||
|               { |                                  ||| | ||||||
|                 alias: '/Transmit/', |                                    ( | ||||||
|                 stack: 'B', |                                      instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} | ||||||
|                 transform: 'negative-Y', |                                      / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) | ||||||
|               }, |                                    ) != 0 | ||||||
|             ], |                                  ||| % $._config, legendFormat='{{instance}} {{device}}' | ||||||
|           }, |                                )) | ||||||
|         ) |                              ) | ||||||
|       ) |                            ) | ||||||
|       .addRow( |                            .addRow( | ||||||
|         g.row('Disk IO') |                              row.new('Disk Space') | ||||||
|         .addPanel( |                              .addPanel( | ||||||
|           g.panel('Disk IO Utilisation') + |                                diskSpaceUtilisation | ||||||
|           g.queryPanel('instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance"}' % $._config, '{{device}}') + |                                .addTarget(prometheus.target( | ||||||
|           { yaxes: g.yaxes('percentunit') }, |                                  ||| | ||||||
|         ) |                                    sum without (device) ( | ||||||
|         .addPanel( |                                      max without (fstype, mountpoint) (( | ||||||
|           g.panel('Disk IO Saturation') + |                                        node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(clusterLabel)s="$cluster"} | ||||||
|           g.queryPanel('instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance"}' % $._config, '{{device}}') + |                                        - | ||||||
|           { yaxes: g.yaxes('percentunit') }, |                                        node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(clusterLabel)s="$cluster"} | ||||||
|         ) |                                      ) != 0) | ||||||
|       ) |                                    ) | ||||||
|       .addRow( |                                    / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(clusterLabel)s="$cluster"}))) | ||||||
|         g.row('Disk Space') |                                  ||| % $._config, legendFormat='{{instance}}' | ||||||
|         .addPanel( |                                )) | ||||||
|           g.panel('Disk Space Utilisation') + |                              ) | ||||||
|           g.queryPanel(||| |                            ), | ||||||
|             1 - |                        } + | ||||||
|             ( |                        if $._config.showMultiCluster then { | ||||||
|               max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"}) |                          'node-multicluster-rsrc-use.json': | ||||||
|             / |                            dashboard.new( | ||||||
|               max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"}) |                              '%sUSE Method / Multi-cluster' % $._config.dashboardNamePrefix, | ||||||
|             ) |                              time_from='now-1h', | ||||||
|           ||| % $._config, '{{device}}') + |                              tags=($._config.dashboardTags), | ||||||
|           { |                              timezone='utc', | ||||||
|             yaxes: g.yaxes('percentunit'), |                              refresh='30s', | ||||||
|             legend+: { show: false }, |                              graphTooltip='shared_crosshair' | ||||||
|           }, |                            ) | ||||||
|         ), |                            .addTemplate(datasourceTemplate) | ||||||
|       ), |                            .addRow( | ||||||
|   }, |                              row.new('CPU') | ||||||
|  |                              .addPanel( | ||||||
|  |                                CPUUtilisation | ||||||
|  |                                .addTarget(prometheus.target( | ||||||
|  |                                  ||| | ||||||
|  |                                    sum( | ||||||
|  |                                      (( | ||||||
|  |                                        instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s} | ||||||
|  |                                        * | ||||||
|  |                                        instance:node_num_cpu:sum{%(nodeExporterSelector)s} | ||||||
|  |                                      ) != 0) | ||||||
|  |                                      / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s})) | ||||||
|  |                                    ) by (%(clusterLabel)s) | ||||||
|  |                                  ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config | ||||||
|  |                                )) | ||||||
|  |                              ) | ||||||
|  |                              .addPanel( | ||||||
|  |                                CPUSaturation | ||||||
|  |                                .addTarget(prometheus.target( | ||||||
|  |                                  ||| | ||||||
|  |                                    sum(( | ||||||
|  |                                      instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s} | ||||||
|  |                                      / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})) | ||||||
|  |                                    ) != 0) by (%(clusterLabel)s) | ||||||
|  |                                  ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config | ||||||
|  |                                )) | ||||||
|  |                              ) | ||||||
|  |                            ) | ||||||
|  |                            .addRow( | ||||||
|  |                              row.new('Memory') | ||||||
|  |                              .addPanel( | ||||||
|  |                                memoryUtilisation | ||||||
|  |                                .addTarget(prometheus.target( | ||||||
|  |                                  ||| | ||||||
|  |                                    sum(( | ||||||
|  |                                        instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s} | ||||||
|  |                                        / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s})) | ||||||
|  |                                    ) != 0) by (%(clusterLabel)s) | ||||||
|  |                                  ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config | ||||||
|  |                                )) | ||||||
|  |                              ) | ||||||
|  |                              .addPanel( | ||||||
|  |                                memorySaturation | ||||||
|  |                                .addTarget(prometheus.target( | ||||||
|  |                                  ||| | ||||||
|  |                                    sum(( | ||||||
|  |                                        instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s} | ||||||
|  |                                    ) != 0) by (%(clusterLabel)s) | ||||||
|  |                                  ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config | ||||||
|  |                                )) | ||||||
|  |                              ) | ||||||
|  |                            ) | ||||||
|  |                            .addRow( | ||||||
|  |                              row.new('Network') | ||||||
|  |                              .addPanel( | ||||||
|  |                                networkUtilisation | ||||||
|  |                                .addTarget(prometheus.target( | ||||||
|  |                                  ||| | ||||||
|  |                                    sum(( | ||||||
|  |                                        instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} | ||||||
|  |                                    ) != 0) by (%(clusterLabel)s) | ||||||
|  |                                  ||| % $._config, legendFormat='{{%(clusterLabel)s}} Receive' % $._config | ||||||
|  |                                )) | ||||||
|  |                                .addTarget(prometheus.target( | ||||||
|  |                                  ||| | ||||||
|  |                                    sum(( | ||||||
|  |                                        instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} | ||||||
|  |                                    ) != 0) by (%(clusterLabel)s) | ||||||
|  |                                  ||| % $._config, legendFormat='{{%(clusterLabel)s}} Transmit' % $._config | ||||||
|  |                                )) | ||||||
|  |                              ) | ||||||
|  |                              .addPanel( | ||||||
|  |                                networkSaturation | ||||||
|  |                                .addTarget(prometheus.target( | ||||||
|  |                                  ||| | ||||||
|  |                                    sum(( | ||||||
|  |                                        instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} | ||||||
|  |                                    ) != 0) by (%(clusterLabel)s) | ||||||
|  |                                  ||| % $._config, legendFormat='{{%(clusterLabel)s}} Receive' % $._config | ||||||
|  |                                )) | ||||||
|  |                                .addTarget(prometheus.target( | ||||||
|  |                                  ||| | ||||||
|  |                                    sum(( | ||||||
|  |                                        instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} | ||||||
|  |                                    ) != 0) by (%(clusterLabel)s) | ||||||
|  |                                  ||| % $._config, legendFormat='{{%(clusterLabel)s}} Transmit' % $._config | ||||||
|  |                                )) | ||||||
|  |                              ) | ||||||
|  |                            ) | ||||||
|  |                            .addRow( | ||||||
|  |                              row.new('Disk IO') | ||||||
|  |                              .addPanel( | ||||||
|  |                                diskIOUtilisation | ||||||
|  |                                .addTarget(prometheus.target( | ||||||
|  |                                  ||| | ||||||
|  |                                    sum(( | ||||||
|  |                                        instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s} | ||||||
|  |                                        / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s})) | ||||||
|  |                                    ) != 0) by (%(clusterLabel)s, device) | ||||||
|  |                                  ||| % $._config, legendFormat='{{%(clusterLabel)s}} {{device}}' % $._config | ||||||
|  |                                )) | ||||||
|  |                              ) | ||||||
|  |                              .addPanel( | ||||||
|  |                                diskIOSaturation | ||||||
|  |                                .addTarget(prometheus.target( | ||||||
|  |                                  ||| | ||||||
|  |                                    sum(( | ||||||
|  |                                      instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s} | ||||||
|  |                                      / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s})) | ||||||
|  |                                    ) != 0) by (%(clusterLabel)s, device) | ||||||
|  |                                  ||| % $._config, legendFormat='{{%(clusterLabel)s}} {{device}}' % $._config | ||||||
|  |                                )) | ||||||
|  |                              ) | ||||||
|  |                            ) | ||||||
|  |                            .addRow( | ||||||
|  |                              row.new('Disk Space') | ||||||
|  |                              .addPanel( | ||||||
|  |                                diskSpaceUtilisation | ||||||
|  |                                .addTarget(prometheus.target( | ||||||
|  |                                  ||| | ||||||
|  |                                    sum ( | ||||||
|  |                                      sum without (device) ( | ||||||
|  |                                        max without (fstype, mountpoint, instance, pod) (( | ||||||
|  |                                          node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s} | ||||||
|  |                                        ) != 0) | ||||||
|  |                                      ) | ||||||
|  |                                      / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}))) | ||||||
|  |                                    ) by (%(clusterLabel)s) | ||||||
|  |                                  ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config | ||||||
|  |                                )) | ||||||
|  |                              ) | ||||||
|  |                            ), | ||||||
|  |                        } else {}, | ||||||
| } | } | ||||||
|  |  | ||||||
		Loading…
	
		Reference in a new issue