2018-05-08 03:10:29 -07:00
|
|
|
{
|
|
|
|
_config+:: {
|
|
|
|
// Selectors are inserted between {} in Prometheus queries.
|
2019-07-16 10:34:27 -07:00
|
|
|
|
2019-10-30 14:52:36 -07:00
|
|
|
// Select the metrics coming from the node exporter. Note that all
|
|
|
|
// the selected metrics are shown stacked on top of each other in
|
|
|
|
// the 'USE Method / Cluster' dashboard. Consider disabling that
|
|
|
|
// dashboard if mixing up all those metrics in the same dashboard
|
|
|
|
// doesn't make sense (e.g. because they are coming from different
|
|
|
|
// clusters).
|
2019-07-16 12:18:17 -07:00
|
|
|
nodeExporterSelector: 'job="node"',
|
2018-05-08 03:10:29 -07:00
|
|
|
|
2019-07-17 14:54:31 -07:00
|
|
|
// Select the fstype for filesystem-related queries. If left
|
|
|
|
// empty, all filesystems are selected. If you have unusual
|
|
|
|
// filesystems you don't want to include in dashboards and
|
|
|
|
// alerting, you can exclude them here, e.g. 'fstype!="tmpfs"'.
|
2019-09-12 04:57:19 -07:00
|
|
|
fsSelector: 'fstype!=""',
|
2018-05-08 03:10:29 -07:00
|
|
|
|
2019-07-17 14:54:31 -07:00
|
|
|
// Select the device for disk-related queries. If left empty, all
|
|
|
|
// devices are selected. If you have unusual devices you don't
|
|
|
|
// want to include in dashboards and alerting, you can exclude
|
|
|
|
// them here, e.g. 'device!="tmpfs"'.
|
2019-09-12 04:57:19 -07:00
|
|
|
diskDeviceSelector: 'device!=""',
|
2019-07-16 10:34:27 -07:00
|
|
|
|
2019-08-14 13:24:24 -07:00
|
|
|
// Some of the alerts are meant to fire if a critical failure of a
|
|
|
|
// node is imminent (e.g. the disk is about to run full). In a
|
|
|
|
// true “cloud native” setup, failures of a single node should be
|
|
|
|
// tolerated. Hence, even imminent failure of a single node is no
|
|
|
|
// reason to create a paging alert. However, in practice there are
|
|
|
|
// still many situations where operators like to get paged in time
|
|
|
|
// before a node runs out of disk space. nodeCriticalSeverity can
|
|
|
|
// be set to the desired severity for this kind of alert. This
|
|
|
|
// can even be templated to depend on labels of the node, e.g. you
|
|
|
|
// could make this critical for traditional database masters but
|
|
|
|
// just a warning for K8s nodes.
|
|
|
|
nodeCriticalSeverity: 'critical',
|
|
|
|
|
2020-03-02 07:24:51 -08:00
|
|
|
// Available disk space (%) thresholds on which to trigger the
|
|
|
|
// 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
|
|
|
|
// usage grows in a way that it is predicted to run out in 4h or 1d
|
|
|
|
// and if the provided thresholds have been reached right now.
|
|
|
|
// In some cases you'll want to adjust these, e.g. by default Kubernetes
|
|
|
|
// runs the image garbage collection when the disk usage reaches 85%
|
|
|
|
// of its available space. In that case, you'll want to reduce the
|
|
|
|
// critical threshold below to something like 14 or 15, otherwise
|
|
|
|
// the alert could fire under normal node usage.
|
|
|
|
fsSpaceFillingUpWarningThreshold: 40,
|
|
|
|
fsSpaceFillingUpCriticalThreshold: 20,
|
|
|
|
|
2018-05-08 03:10:29 -07:00
|
|
|
grafana_prefix: '',
|
|
|
|
},
|
|
|
|
}
|