Merge pull request #1530 from prometheus/beorn7/mixin

Fix the normalization for the cluster-wide dashboards
This commit is contained in:
Björn Rabenstein 2019-11-01 13:38:41 +01:00 committed by GitHub
commit 44774994fe
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 35 deletions

View file

@ -2,7 +2,12 @@
_config+:: {
// Selectors are inserted between {} in Prometheus queries.
// Select the metrics coming from the node exporter.
// Select the metrics coming from the node exporter. Note that all
// the selected metrics are shown stacked on top of each other in
// the 'USE Method / Cluster' dashboard. Consider disabling that
// dashboard if mixing up all those metrics in the same dashboard
// doesn't make sense (e.g. because they are coming from different
// clusters).
nodeExporterSelector: 'job="node"',
// Select the fstype for filesystem-related queries. If left

View file

@ -15,9 +15,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s}
*
instance:node_num_cpu:sum{%(nodeExporterSelector)s}
/ ignoring (instance) group_left
sum without (instance) (instance:node_num_cpu:sum{%(nodeExporterSelector)s})
)
/ scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s}))
||| % $._config, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@ -27,11 +26,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
// average relates to the "CPU saturation" in the title.
g.panel('CPU Saturation (load1 per CPU)') +
g.queryPanel(|||
(
instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
/ ignoring (instance) group_left
count without (instance) (instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})
)
instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
/ scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}))
||| % $._config, '{{instance}}', legendLink) +
g.stack +
// TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
@ -43,11 +39,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
.addPanel(
g.panel('Memory Utilisation') +
g.queryPanel(|||
(
instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}
/ ignoring (instance) group_left
count without (instance) (instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s})
)
instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}
/ scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}))
||| % $._config, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@ -123,11 +116,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
// TODO: Does the partition by device make sense? Using the most utilized device per
// instance might make more sense.
g.queryPanel(|||
(
instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}
/ ignoring (instance, device) group_left
count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s})
)
instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}
/ scalar(count(instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}))
||| % $._config, '{{instance}} {{device}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@ -135,11 +125,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
.addPanel(
g.panel('Disk IO Saturation') +
g.queryPanel(|||
(
instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}
/ ignoring (instance, device) group_left
count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s})
)
instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}
/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}))
||| % $._config, '{{instance}} {{device}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@ -150,19 +137,12 @@ local g = import 'grafana-builder/grafana.libsonnet';
.addPanel(
g.panel('Disk Space Utilisation') +
g.queryPanel(|||
(
sum without (device) (
max without (fstype, mountpoint) (
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
)
)
/ ignoring (instance) group_left
sum without (instance, device) (
max without (fstype, mountpoint) (
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
)
sum without (device) (
max without (fstype, mountpoint) (
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
)
)
)
/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s})))
||| % $._config, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },