node_exporter/docs/node-mixin/lib/prom-mixin.libsonnet
Tom d0c1d00d18
Some checks failed
golangci-lint / lint (push) Has been cancelled
Migrate dashboards to new grafonnet library (#3147)
Migrated away from deprecated Grafonnet library. This replaces panels using Angular JS which are disabled by default in Grafana 11 and will be unsupported in Grafana 12.

Fixes #3046

---------

Signed-off-by: Tom <12222103+critchtionary@users.noreply.github.com>
2024-12-19 16:49:22 +01:00

536 lines
22 KiB
Plaintext

local grafana = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.panel.row;
local prometheus = grafana.query.prometheus;
local variable = dashboard.variable;
local timeSeriesPanel = grafana.panel.timeSeries;
local tsOptions = timeSeriesPanel.options;
local tsStandardOptions = timeSeriesPanel.standardOptions;
local tsQueryOptions = timeSeriesPanel.queryOptions;
local tsCustom = timeSeriesPanel.fieldConfig.defaults.custom;
local gaugePanel = grafana.panel.gauge;
local gaugeStep = gaugePanel.standardOptions.threshold.step;
local table = grafana.panel.table;
local tableStep = table.standardOptions.threshold.step;
local tableOverride = table.standardOptions.override;
local tableTransformation = table.queryOptions.transformation;
{
new(config=null, platform=null, uid=null):: {
// Dashboard variable named 'datasource' of type 'prometheus'; the other variables
// and the panels below take their datasource from it.
local prometheusDatasourceVariable = variable.datasource.new('datasource', 'prometheus'),
// Base of the 'cluster' query variable. The actual query is added per platform
// by clusterVariable.
local clusterVariablePrototype =
  // The variable is only shown on the dashboard when config.showMultiCluster is set.
  local visibility =
    if config.showMultiCluster then
      variable.query.generalOptions.showOnDashboard.withLabelAndValue()
    else
      variable.query.generalOptions.showOnDashboard.withNothing();
  variable.query.new('cluster')
  + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable)
  + visibility
  + variable.query.refresh.onTime()
  + variable.query.generalOptions.withLabel('Cluster'),
// 'cluster' variable: label values of the configured cluster label on node_uname_info.
// The Darwin dashboard only considers hosts with sysname="Darwin"; every other
// platform uses the complementary sysname!="Darwin" selector.
local clusterVariable =
  // Computed once so both platforms use exactly the same label name. The Darwin
  // branch previously passed it with a stray leading space.
  local clusterLabelName = '%(clusterLabel)s' % config;
  local unameQuery =
    if platform == 'Darwin' then
      'node_uname_info{%(nodeExporterSelector)s, sysname="Darwin"}' % config
    else
      'node_uname_info{%(nodeExporterSelector)s, sysname!="Darwin"}' % config;
  clusterVariablePrototype
  + variable.query.queryTypes.withLabelValues(clusterLabelName, unameQuery),
// Base of the 'instance' query variable. The actual query is added per platform
// by instanceVariable.
local instanceVariablePrototype =
  local base = variable.query.new('instance');
  local source = variable.query.withDatasourceFromVariable(prometheusDatasourceVariable);
  local refreshPolicy = variable.query.refresh.onTime();
  local displayLabel = variable.query.generalOptions.withLabel('Instance');
  base + source + refreshPolicy + displayLabel,
// 'instance' variable: values of the 'instance' label on node_uname_info within the
// selected $cluster. Darwin dashboards select sysname="Darwin"; all others exclude it.
local instanceVariable =
  local unameQuery =
    if platform == 'Darwin' then
      'node_uname_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster", sysname="Darwin"}' % config
    else
      'node_uname_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster", sysname!="Darwin"}' % config;
  instanceVariablePrototype
  + variable.query.queryTypes.withLabelValues('instance', unameQuery),
// 'CPU Usage' time series: stacked per-CPU series on a fixed 0..1 'percentunit' axis.
local idleCPU =
  // Per CPU: 1 minus the summed rate of idle/iowait/steal time, divided by the
  // number of CPUs (count of mode="idle" series).
  local busyFractionQuery =
    |||
      (
      (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode=~"idle|iowait|steal", instance="$instance", %(clusterLabel)s="$cluster"}[$__rate_interval])))
      / ignoring(cpu) group_left
      count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance", %(clusterLabel)s="$cluster"})
      )
    ||| % config;
  local busyTarget =
    prometheus.new('$datasource', busyFractionQuery)
    + prometheus.withLegendFormat('{{cpu}}')
    + prometheus.withIntervalFactor(5);
  timeSeriesPanel.new('CPU Usage')
  + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable)
  + tsStandardOptions.withUnit('percentunit')
  + tsCustom.stacking.withMode('normal')
  + tsStandardOptions.withMax(1)
  + tsStandardOptions.withMin(0)
  + tsOptions.tooltip.withMode('multi')
  + tsCustom.withFillOpacity(10)
  + tsCustom.withShowPoints('never')
  + tsQueryOptions.withTargets([busyTarget]),
// 'Load Average' time series: 1m/5m/15m load plus the logical core count for reference.
local systemLoad =
  // Builds one Prometheus target; `expr` is a format string expanded with config.
  local loadTarget(expr, legend) =
    prometheus.new('$datasource', expr % config) + prometheus.withLegendFormat(legend);
  timeSeriesPanel.new('Load Average')
  + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable)
  + tsStandardOptions.withUnit('short')
  + tsStandardOptions.withMin(0)
  + tsCustom.withFillOpacity(0)
  + tsCustom.withShowPoints('never')
  + tsOptions.tooltip.withMode('multi')
  + tsQueryOptions.withTargets([
    loadTarget('node_load1{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}', '1m load average'),
    loadTarget('node_load5{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}', '5m load average'),
    loadTarget('node_load15{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}', '15m load average'),
    loadTarget('count(node_cpu_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", mode="idle"})', 'logical cores'),
  ]),
// Shared base of the 'Memory Usage' time series panel. Stacking mode and targets
// are added per platform by memoryGraph.
local memoryGraphPanelPrototype =
  local axis = tsStandardOptions.withUnit('bytes') + tsStandardOptions.withMin(0);
  local tooltip = tsOptions.tooltip.withMode('multi');
  local drawing = tsCustom.withFillOpacity(10) + tsCustom.withShowPoints('never');
  timeSeriesPanel.new('Memory Usage')
  + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable)
  + axis
  + tooltip
  + drawing,
// 'Memory Usage' time series with platform-specific metrics.
// Linux stacks used/buffers/cached/free; Darwin and AIX plot unstacked series.
// Evaluates to null for any platform other than 'Linux', 'Darwin' or 'AIX'.
local memoryGraph =
  // Builds one Prometheus target; `expr` is a format string expanded with config.
  local series(expr, legend) =
    prometheus.new('$datasource', expr % config) + prometheus.withLegendFormat(legend);
  if platform == 'Linux' then
    memoryGraphPanelPrototype
    + tsCustom.stacking.withMode('normal')
    + tsQueryOptions.withTargets([
      // Used = total - free - buffers - cached.
      series(
        |||
          (
          node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}
          -
          node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}
          -
          node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}
          -
          node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}
          )
        |||,
        'memory used'
      ),
      series('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}', 'memory buffers'),
      series('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}', 'memory cached'),
      series('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}', 'memory free'),
    ])
  else if platform == 'Darwin' then
    // not useful to stack
    memoryGraphPanelPrototype
    + tsCustom.stacking.withMode('none')
    + tsQueryOptions.withTargets([
      series('node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}', 'Physical Memory'),
      // Used = internal - purgeable + wired + compressed.
      series(
        |||
          (
          node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} -
          node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} +
          node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} +
          node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}
          )
        |||,
        'Memory Used'
      ),
      // App memory = internal - purgeable.
      series(
        |||
          (
          node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} -
          node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}
          )
        |||,
        'App Memory'
      ),
      series('node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}', 'Wired Memory'),
      series('node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}', 'Compressed'),
    ])
  else if platform == 'AIX' then
    memoryGraphPanelPrototype
    + tsCustom.stacking.withMode('none')
    + tsQueryOptions.withTargets([
      series('node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}', 'Physical Memory'),
      // Used = total - available.
      series(
        |||
          (
          node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} -
          node_memory_available_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}
          )
        |||,
        'Memory Used'
      ),
    ]),
// Shared base of the 'Memory Usage' gauge: 0..100 percent, with the colour changing
// at 80 and at 90. The query is added per platform by memoryGauge.
// NOTE: avg() is used (in the gauge queries) to circumvent a label change caused by a node_exporter rollout.
local memoryGaugePanelPrototype =
  local colourSteps = [
    gaugeStep.withColor('rgba(50, 172, 45, 0.97)'),
    gaugeStep.withColor('rgba(237, 129, 40, 0.89)') + gaugeStep.withValue(80),
    gaugeStep.withColor('rgba(245, 54, 54, 0.9)') + gaugeStep.withValue(90),
  ];
  gaugePanel.new('Memory Usage')
  + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable)
  + gaugePanel.standardOptions.thresholds.withSteps(colourSteps)
  + gaugePanel.standardOptions.withMax(100)
  + gaugePanel.standardOptions.withMin(0)
  + gaugePanel.standardOptions.withUnit('percent'),
// 'Memory Usage' gauge showing used memory as a percentage, using platform-specific metrics.
// Evaluates to null for any platform other than 'Linux', 'Darwin' or 'AIX'.
local memoryGauge =
  // Pick the query first; it stays null for an unsupported platform.
  local usedPercentQuery =
    if platform == 'Linux' then
      // 100 - available / total * 100
      |||
        100 -
        (
        avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) /
        avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"})
        * 100
        )
      ||| % config
    else if platform == 'Darwin' then
      // (internal - purgeable + wired + compressed) / total * 100
      |||
        (
        (
        avg(node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) -
        avg(node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) +
        avg(node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) +
        avg(node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"})
        ) /
        avg(node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"})
        )
        *
        100
      ||| % config
    else if platform == 'AIX' then
      // 100 - available / total * 100
      |||
        100 -
        (
        avg(node_memory_available_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) /
        avg(node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"})
        * 100
        )
      ||| % config;
  if usedPercentQuery != null then
    memoryGaugePanelPrototype
    + gaugePanel.queryOptions.withTargets([
      prometheus.new('$datasource', usedPercentQuery),
    ]),
// 'Disk I/O' time series: per-device read/write byte rates and I/O time rate.
// The panel has no global unit; units are assigned per series by name-based overrides.
local diskIO =
  // Builds one per-device target; `expr` is a format string expanded with config.
  local deviceTarget(expr, legend) =
    prometheus.new('$datasource', expr % config)
    + prometheus.withLegendFormat(legend)
    + prometheus.withIntervalFactor(1);
  // Assigns `unit` to every series whose name matches `pattern`.
  local unitForSeries(pattern, unit) =
    tsStandardOptions.override.byRegexp.new(pattern)
    + tsStandardOptions.override.byRegexp.withPropertiesFromOptions(
      tsStandardOptions.withUnit(unit)
    );
  timeSeriesPanel.new('Disk I/O')
  + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable)
  + tsStandardOptions.withMin(0)
  + tsCustom.withFillOpacity(0)
  + tsCustom.withShowPoints('never')
  + tsOptions.tooltip.withMode('multi')
  + tsQueryOptions.withTargets([
    // TODO: Does it make sense to have those three in the same panel?
    deviceTarget(
      'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(diskDeviceSelector)s}[$__rate_interval])',
      '{{device}} read'
    ),
    deviceTarget(
      'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(diskDeviceSelector)s}[$__rate_interval])',
      '{{device}} written'
    ),
    deviceTarget(
      'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(diskDeviceSelector)s}[$__rate_interval])',
      '{{device}} io time'
    ),
  ])
  + tsStandardOptions.withOverrides([
    unitForSeries('/ read| written/', 'Bps'),
    unitForSeries('/ io time/', 'percentunit'),
  ]),
// 'Disk Space Usage' table: one row per mountpoint with Size, Available, Used and
// 'Used, %' (rendered as a gauge cell on a 0..1 scale).
// Query A returns filesystem size and query B returns available bytes; the
// transformations merge both, derive the used columns, rename and sort.
local diskSpaceUsage =
  // Instant, table-formatted target; `expr` is a format string expanded with config.
  local instantTableTarget(expr) =
    prometheus.new('$datasource', expr % config)
    + prometheus.withLegendFormat('')
    + prometheus.withInstant()
    + prometheus.withFormat('table');
  // Fixed pixel width for the named column.
  local columnWidth(name, width) =
    tableOverride.byName.new(name)
    + tableOverride.byName.withProperty('custom.width', width);
  // groupBy settings that keep the last non-null value of a field.
  local lastNotNull = {
    aggregations: [
      'lastNotNull',
    ],
    operation: 'aggregate',
  };
  // calculateField transformation producing `alias` = left <operator> right.
  local binaryField(alias, left, operator, right) =
    tableTransformation.withId('calculateField')
    + tableTransformation.withOptions(
      {
        alias: alias,
        binary: {
          left: left,
          operator: operator,
          reducer: 'sum',
          right: right,
        },
        mode: 'binary',
        reduce: {
          reducer: 'sum',
        },
      }
    );
  table.new('Disk Space Usage')
  + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable)
  + table.standardOptions.withUnit('decbytes')
  // Thresholds are fractions (0.8 / 0.9), matching the 0..1 range of 'Used, %'.
  // Built entirely with the table panel's own step builder (the values previously
  // came from the gauge panel's builder).
  + table.standardOptions.thresholds.withSteps(
    [
      tableStep.withColor('green'),
      tableStep.withColor('yellow') + tableStep.withValue(0.8),
      tableStep.withColor('red') + tableStep.withValue(0.9),
    ]
  )
  + table.queryOptions.withTargets([
    // A: filesystem size per mountpoint.
    instantTableTarget(
      |||
        max by (mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(fsSelector)s, %(fsMountpointSelector)s})
      |||
    ),
    // B: available bytes per mountpoint.
    instantTableTarget(
      |||
        max by (mountpoint) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(fsSelector)s, %(fsMountpointSelector)s})
      |||
    ),
  ])
  + table.standardOptions.withOverrides([
    columnWidth('Mounted on', 260),
    columnWidth('Size', 93),
    columnWidth('Used', 72),
    columnWidth('Available', 88),
    tableOverride.byName.new('Used, %')
    + tableOverride.byName.withProperty('unit', 'percentunit')
    + tableOverride.byName.withPropertiesFromOptions(
      table.fieldConfig.defaults.custom.withCellOptions(
        { type: 'gauge' },
      )
    )
    + tableOverride.byName.withProperty('max', 1)
    + tableOverride.byName.withProperty('min', 0),
  ])
  + table.queryOptions.withTransformations([
    tableTransformation.withId('groupBy')
    + tableTransformation.withOptions(
      {
        fields: {
          'Value #A': lastNotNull,
          'Value #B': lastNotNull,
          mountpoint: {
            aggregations: [],
            operation: 'groupby',
          },
        },
      }
    ),
    tableTransformation.withId('merge'),
    // Used = Size - Available.
    binaryField('Used', 'Value #A (lastNotNull)', '-', 'Value #B (lastNotNull)'),
    // Used, % = Used / Size (a 0..1 fraction).
    binaryField('Used, %', 'Used', '/', 'Value #A (lastNotNull)'),
    tableTransformation.withId('organize')
    + tableTransformation.withOptions(
      {
        excludeByName: {},
        indexByName: {},
        renameByName: {
          'Value #A (lastNotNull)': 'Size',
          'Value #B (lastNotNull)': 'Available',
          mountpoint: 'Mounted on',
        },
      }
    ),
    tableTransformation.withId('sortBy')
    + tableTransformation.withOptions(
      {
        fields: {},
        sort: [
          {
            field: 'Mounted on',
          },
        ],
      }
    ),
  ]),
// 'Network Received' time series: per-device receive rate in bits/s
// (byte rate * 8), excluding the 'lo' device.
local networkReceived =
  local receiveTarget =
    prometheus.new(
      '$datasource',
      'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", device!="lo"}[$__rate_interval]) * 8' % config
    )
    + prometheus.withLegendFormat('{{device}}')
    + prometheus.withIntervalFactor(1);
  timeSeriesPanel.new('Network Received')
  + timeSeriesPanel.panelOptions.withDescription('Network received (bits/s)')
  + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable)
  + tsStandardOptions.withUnit('bps')
  + tsStandardOptions.withMin(0)
  + tsCustom.withFillOpacity(0)
  + tsCustom.withShowPoints('never')
  + tsOptions.tooltip.withMode('multi')
  + tsQueryOptions.withTargets([receiveTarget]),
// 'Network Transmitted' time series: per-device transmit rate in bits/s
// (byte rate * 8), excluding the 'lo' device.
local networkTransmitted =
  timeSeriesPanel.new('Network Transmitted')
  + timeSeriesPanel.panelOptions.withDescription('Network transmitted (bits/s)')
  + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable)
  + tsStandardOptions.withUnit('bps')
  + tsStandardOptions.withMin(0)
  + tsCustom.withFillOpacity(0)
  // Hide points like every other time series panel here, including the matching
  // 'Network Received' panel; this was the only one that did not set it.
  + tsCustom.withShowPoints('never')
  + tsOptions.tooltip.withMode('multi')
  + tsQueryOptions.withTargets([
    prometheus.new('$datasource', 'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", device!="lo"}[$__rate_interval]) * 8' % config)
    + prometheus.withLegendFormat('{{device}}')
    + prometheus.withIntervalFactor(1),
  ]),
// 'CPU' row holding the CPU usage and load average panels.
local cpuRow =
  local cpuPanels = [idleCPU, systemLoad];
  row.new('CPU') + row.withPanels(cpuPanels),
// 'Memory' row with explicit positions instead of grid placement:
// the graph takes 18 columns and the gauge the remaining 6.
local memoryRow =
  // Position for a panel at y=9 with height 7, just below the row header at y=8.
  local placeAt(x, width) =
    row.gridPos.withX(x) + row.gridPos.withY(9) + row.gridPos.withH(7) + row.gridPos.withW(width);
  [
    row.new('Memory') + row.gridPos.withY(8),
    memoryGraph + placeAt(0, 18),
    memoryGauge + placeAt(18, 6),
  ],
// 'Disk' row holding the disk I/O and disk space usage panels.
local diskRow =
  local diskPanels = [diskIO, diskSpaceUsage];
  row.new('Disk') + row.withPanels(diskPanels),
// 'Network' row holding the received and transmitted panels.
local networkRow =
  local networkPanels = [networkReceived, networkTransmitted];
  row.new('Network') + row.withPanels(networkPanels),
// Full panel list: grid for the CPU row, then the explicitly positioned memory row,
// then a grid for the disk and network rows starting at y=18.
local panels =
  local cpuGrid =
    grafana.util.grid.makeGrid([cpuRow], panelWidth=12, panelHeight=7);
  local diskAndNetworkGrid =
    grafana.util.grid.makeGrid([diskRow, networkRow], panelWidth=12, panelHeight=7, startY=18);
  cpuGrid + memoryRow + diskAndNetworkGrid,
// Dashboard variables, in order: datasource, cluster, instance.
local variables = [prometheusDatasourceVariable, clusterVariable, instanceVariable],
// The resulting dashboard. All supported platforms share the same settings and
// differ only in the title; any other platform yields null.
dashboard:
  // Title format per platform; expanded with config.dashboardNamePrefix.
  local titleFormat =
    if platform == 'Linux' then
      '%sNodes'
    else if platform == 'Darwin' then
      '%sMacOS'
    else if platform == 'AIX' then
      '%sAIX';
  if titleFormat != null then
    dashboard.new(
      titleFormat % config.dashboardNamePrefix,
    )
    + dashboard.time.withFrom('now-1h')
    + dashboard.withTags(config.dashboardTags)
    + dashboard.withTimezone('utc')
    + dashboard.withRefresh('30s')
    // The dashboard uid is the MD5 of the caller-supplied uid.
    + dashboard.withUid(std.md5(uid))
    + dashboard.graphTooltip.withSharedCrosshair()
    + dashboard.withVariables(variables)
    + dashboard.withPanels(panels),
},
}