From 97ef11376219a1e3ee2c5f21f105bdeb26ef43d0 Mon Sep 17 00:00:00 2001 From: beorn7 Date: Wed, 14 Aug 2019 22:24:24 +0200 Subject: [PATCH] Make the severity of "critical" alerts configurable This addresses the blissful scenario where single-node failures are unproblematic. No reason to wake somebody up if a node is about to screw itself up by filling the disk. Signed-off-by: beorn7 --- docs/node-mixin/alerts/alerts.libsonnet | 8 ++++---- docs/node-mixin/config.libsonnet | 13 +++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 7b9fb890..4423f892 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -37,7 +37,7 @@ ||| % $._config, 'for': '1h', labels: { - severity: 'critical', + severity: '%(nodeCriticalSeverity)s' % $._config, }, annotations: { summary: 'Filesystem is predicted to run out of space within the next 4 hours.', @@ -73,7 +73,7 @@ ||| % $._config, 'for': '1h', labels: { - severity: 'critical', + severity: '%(nodeCriticalSeverity)s' % $._config, }, annotations: { summary: 'Filesystem has less than 3% space left.', @@ -113,7 +113,7 @@ ||| % $._config, 'for': '1h', labels: { - severity: 'critical', + severity: '%(nodeCriticalSeverity)s' % $._config, }, annotations: { summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.', @@ -149,7 +149,7 @@ ||| % $._config, 'for': '1h', labels: { - severity: 'critical', + severity: '%(nodeCriticalSeverity)s' % $._config, }, annotations: { summary: 'Filesystem has less than 3% inodes left.', diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index 95070ca9..8cf9860f 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet @@ -17,6 +17,19 @@ // them here, e.g. 'device!="tmpfs"'. 
diskDeviceSelector: '', + // Some of the alerts are meant to fire if a critical failure of a + // node is imminent (e.g. the disk is about to fill up). In a + // true “cloud native” setup, failures of a single node should be + // tolerated. Hence, even imminent failure of a single node is no + // reason to create a paging alert. However, in practice there are + // still many situations where operators like to get paged in time + // before a node runs out of disk space. nodeCriticalSeverity can + // be set to the desired severity for this kind of alert. This + // can even be templated to depend on labels of the node, e.g. you + // could make this critical for traditional database masters but + // just a warning for K8s nodes. + nodeCriticalSeverity: 'critical', + grafana_prefix: '', }, }