From cf8c6891cc610e54f70383addd4bb6079f0add35 Mon Sep 17 00:00:00 2001 From: PrometheusBot Date: Tue, 26 Nov 2024 12:17:19 +0100 Subject: [PATCH 01/18] Update common Prometheus files (#3188) Signed-off-by: prombot --- .github/workflows/golangci-lint.yml | 2 +- Makefile.common | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index 30514699..e645ba30 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -36,4 +36,4 @@ jobs: uses: golangci/golangci-lint-action@971e284b6050e8a5849b72094c50ab08da042db8 # v6.1.1 with: args: --verbose - version: v1.61.0 + version: v1.62.0 diff --git a/Makefile.common b/Makefile.common index 09e5bff8..fc47bdbb 100644 --- a/Makefile.common +++ b/Makefile.common @@ -61,7 +61,7 @@ PROMU_URL := https://github.com/prometheus/promu/releases/download/v$(PROMU_ SKIP_GOLANGCI_LINT := GOLANGCI_LINT := GOLANGCI_LINT_OPTS ?= -GOLANGCI_LINT_VERSION ?= v1.61.0 +GOLANGCI_LINT_VERSION ?= v1.62.0 # golangci-lint only supports linux, darwin and windows platforms on i386/amd64/arm64. # windows isn't included here because of the path separator being different. ifeq ($(GOHOSTOS),$(filter $(GOHOSTOS),linux darwin)) From 226c80c83cd1f17c59560ea99c757d6f9a6f741a Mon Sep 17 00:00:00 2001 From: Ben Kochie Date: Tue, 5 Nov 2024 10:36:13 +0100 Subject: [PATCH 02/18] Add filesystem include flags Add support for allow lists of filesystem mount points and filesystem types. This allows for less messy regexps when you want to target only specific lists of mount points or filesystem types. Signed-off-by: Ben Kochie --- README.md | 4 +- collector/filesystem_aix.go | 4 +- collector/filesystem_bsd.go | 4 +- collector/filesystem_common.go | 132 ++++++++++++++++++++++---------- collector/filesystem_freebsd.go | 4 +- collector/filesystem_linux.go | 6 +- collector/filesystem_netbsd.go | 4 +- collector/filesystem_openbsd.go | 4 +- 8 files changed, 108 insertions(+), 54 deletions(-) diff --git a/README.md b/README.md index 44a9d2b1..ff5d27ea 100644 --- a/README.md +++ b/README.md @@ -99,8 +99,8 @@ cpu | flags | --collector.cpu.info.flags-include | N/A diskstats | device | --collector.diskstats.device-include | --collector.diskstats.device-exclude ethtool | device | --collector.ethtool.device-include | --collector.ethtool.device-exclude ethtool | metrics | --collector.ethtool.metrics-include | N/A -filesystem | fs-types | N/A | --collector.filesystem.fs-types-exclude -filesystem | mount-points | N/A | --collector.filesystem.mount-points-exclude +filesystem | fs-types | --collector.filesystem.fs-types-include | --collector.filesystem.fs-types-exclude +filesystem | mount-points | --collector.filesystem.mount-points-include | --collector.filesystem.mount-points-exclude hwmon | chip | --collector.hwmon.chip-include | --collector.hwmon.chip-exclude hwmon | sensor | --collector.hwmon.sensor-include | --collector.hwmon.sensor-exclude interrupts | name | --collector.interrupts.name-include | --collector.interrupts.name-exclude diff --git a/collector/filesystem_aix.go b/collector/filesystem_aix.go index 863a26a0..a1314a0f 100644 --- a/collector/filesystem_aix.go +++ b/collector/filesystem_aix.go @@ -32,12 +32,12 @@ func (c *filesystemCollector) GetStats() (stats []filesystemStats, err error) { return nil, err } for _, stat := range fsStat { - if c.excludedMountPointsPattern.MatchString(stat.MountPoint) { + if c.mountPointFilter.ignored(stat.MountPoint) { c.logger.Debug("Ignoring mount 
point", "mountpoint", stat.MountPoint) continue } fstype := stat.TypeString() - if c.excludedFSTypesPattern.MatchString(fstype) { + if c.fsTypeFilter.ignored(fstype) { c.logger.Debug("Ignoring fs type", "type", fstype) continue } diff --git a/collector/filesystem_bsd.go b/collector/filesystem_bsd.go index 2db3fb8f..d586c9ca 100644 --- a/collector/filesystem_bsd.go +++ b/collector/filesystem_bsd.go @@ -48,14 +48,14 @@ func (c *filesystemCollector) GetStats() (stats []filesystemStats, err error) { stats = []filesystemStats{} for i := 0; i < int(count); i++ { mountpoint := C.GoString(&mnt[i].f_mntonname[0]) - if c.excludedMountPointsPattern.MatchString(mountpoint) { + if c.mountPointFilter.ignored(mountpoint) { c.logger.Debug("Ignoring mount point", "mountpoint", mountpoint) continue } device := C.GoString(&mnt[i].f_mntfromname[0]) fstype := C.GoString(&mnt[i].f_fstypename[0]) - if c.excludedFSTypesPattern.MatchString(fstype) { + if c.fsTypeFilter.ignored(fstype) { c.logger.Debug("Ignoring fs type", "type", fstype) continue } diff --git a/collector/filesystem_common.go b/collector/filesystem_common.go index 0ec55e8a..41d439c8 100644 --- a/collector/filesystem_common.go +++ b/collector/filesystem_common.go @@ -19,8 +19,8 @@ package collector import ( "errors" + "fmt" "log/slog" - "regexp" "github.com/alecthomas/kingpin/v2" "github.com/prometheus/client_golang/prometheus" @@ -36,7 +36,7 @@ var ( mountPointsExcludeSet bool mountPointsExclude = kingpin.Flag( "collector.filesystem.mount-points-exclude", - "Regexp of mount points to exclude for filesystem collector.", + "Regexp of mount points to exclude for filesystem collector. (mutually exclusive to mount-points-include)", ).Default(defMountPointsExcluded).PreAction(func(c *kingpin.ParseContext) error { mountPointsExcludeSet = true return nil @@ -45,11 +45,15 @@ var ( "collector.filesystem.ignored-mount-points", "Regexp of mount points to ignore for filesystem collector.", ).Hidden().String() + mountPointsInclude = kingpin.Flag( + "collector.filesystem.mount-points-include", + "Regexp of mount points to include for filesystem collector. (mutually exclusive to mount-points-exclude)", + ).String() fsTypesExcludeSet bool fsTypesExclude = kingpin.Flag( "collector.filesystem.fs-types-exclude", - "Regexp of filesystem types to exclude for filesystem collector.", + "Regexp of filesystem types to exclude for filesystem collector. (mutually exclusive to fs-types-include)", ).Default(defFSTypesExcluded).PreAction(func(c *kingpin.ParseContext) error { fsTypesExcludeSet = true return nil @@ -58,13 +62,17 @@ var ( "collector.filesystem.ignored-fs-types", "Regexp of filesystem types to ignore for filesystem collector.", ).Hidden().String() + fsTypesInclude = kingpin.Flag( + "collector.filesystem.fs-types-include", + "Regexp of filesystem types to exclude for filesystem collector. (mutually exclusive to fs-types-exclude)", + ).String() filesystemLabelNames = []string{"device", "mountpoint", "fstype", "device_error"} ) type filesystemCollector struct { - excludedMountPointsPattern *regexp.Regexp - excludedFSTypesPattern *regexp.Regexp + mountPointFilter deviceFilter + fsTypeFilter deviceFilter sizeDesc, freeDesc, availDesc *prometheus.Desc filesDesc, filesFreeDesc *prometheus.Desc roDesc, deviceErrorDesc *prometheus.Desc @@ -89,29 +97,7 @@ func init() { // NewFilesystemCollector returns a new Collector exposing filesystems stats. 
func NewFilesystemCollector(logger *slog.Logger) (Collector, error) { - if *oldMountPointsExcluded != "" { - if !mountPointsExcludeSet { - logger.Warn("--collector.filesystem.ignored-mount-points is DEPRECATED and will be removed in 2.0.0, use --collector.filesystem.mount-points-exclude") - *mountPointsExclude = *oldMountPointsExcluded - } else { - return nil, errors.New("--collector.filesystem.ignored-mount-points and --collector.filesystem.mount-points-exclude are mutually exclusive") - } - } - - if *oldFSTypesExcluded != "" { - if !fsTypesExcludeSet { - logger.Warn("--collector.filesystem.ignored-fs-types is DEPRECATED and will be removed in 2.0.0, use --collector.filesystem.fs-types-exclude") - *fsTypesExclude = *oldFSTypesExcluded - } else { - return nil, errors.New("--collector.filesystem.ignored-fs-types and --collector.filesystem.fs-types-exclude are mutually exclusive") - } - } - - subsystem := "filesystem" - logger.Info("Parsed flag --collector.filesystem.mount-points-exclude", "flag", *mountPointsExclude) - mountPointPattern := regexp.MustCompile(*mountPointsExclude) - logger.Info("Parsed flag --collector.filesystem.fs-types-exclude", "flag", *fsTypesExclude) - filesystemsTypesPattern := regexp.MustCompile(*fsTypesExclude) + const subsystem = "filesystem" sizeDesc := prometheus.NewDesc( prometheus.BuildFQName(namespace, subsystem, "size_bytes"), @@ -162,18 +148,28 @@ func NewFilesystemCollector(logger *slog.Logger) (Collector, error) { nil, ) + mountPointFilter, err := newMountPointsFilter(logger) + if err != nil { + return nil, fmt.Errorf("unable to parse mount points filter flags: %w", err) + } + + fsTypeFilter, err := newFSTypeFilter(logger) + if err != nil { + return nil, fmt.Errorf("unable to parse fs types filter flags: %w", err) + } + return &filesystemCollector{ - excludedMountPointsPattern: mountPointPattern, - excludedFSTypesPattern: filesystemsTypesPattern, - sizeDesc: sizeDesc, - freeDesc: freeDesc, - availDesc: availDesc, - filesDesc: filesDesc, - filesFreeDesc: filesFreeDesc, - roDesc: roDesc, - deviceErrorDesc: deviceErrorDesc, - mountInfoDesc: mountInfoDesc, - logger: logger, + mountPointFilter: mountPointFilter, + fsTypeFilter: fsTypeFilter, + sizeDesc: sizeDesc, + freeDesc: freeDesc, + availDesc: availDesc, + filesDesc: filesDesc, + filesFreeDesc: filesFreeDesc, + roDesc: roDesc, + deviceErrorDesc: deviceErrorDesc, + mountInfoDesc: mountInfoDesc, + logger: logger, }, nil } @@ -230,3 +226,61 @@ func (c *filesystemCollector) Update(ch chan<- prometheus.Metric) error { } return nil } + +func newMountPointsFilter(logger *slog.Logger) (deviceFilter, error) { + if *oldMountPointsExcluded != "" { + if !mountPointsExcludeSet { + logger.Warn("--collector.filesystem.ignored-mount-points is DEPRECATED and will be removed in 2.0.0, use --collector.filesystem.mount-points-exclude") + *mountPointsExclude = *oldMountPointsExcluded + } else { + return deviceFilter{}, errors.New("--collector.filesystem.ignored-mount-points and --collector.filesystem.mount-points-exclude are mutually exclusive") + } + } + + if *mountPointsInclude != "" && !mountPointsExcludeSet { + logger.Debug("mount-points-exclude flag not set when mount-points-include flag is set, assuming include is desired") + *mountPointsExclude = "" + } + + if *mountPointsExclude != "" && *mountPointsInclude != "" { + return deviceFilter{}, errors.New("--collector.filesystem.mount-points-exclude and --collector.filesystem.mount-points-include are mutually exclusive") + } + + if *mountPointsExclude != "" { + 
logger.Info("Parsed flag --collector.filesystem.mount-points-exclude", "flag", *mountPointsExclude) + } + if *mountPointsInclude != "" { + logger.Info("Parsed flag --collector.filesystem.mount-points-include", "flag", *mountPointsInclude) + } + + return newDeviceFilter(*mountPointsExclude, *mountPointsInclude), nil +} + +func newFSTypeFilter(logger *slog.Logger) (deviceFilter, error) { + if *oldFSTypesExcluded != "" { + if !fsTypesExcludeSet { + logger.Warn("--collector.filesystem.ignored-fs-types is DEPRECATED and will be removed in 2.0.0, use --collector.filesystem.fs-types-exclude") + *fsTypesExclude = *oldFSTypesExcluded + } else { + return deviceFilter{}, errors.New("--collector.filesystem.ignored-fs-types and --collector.filesystem.fs-types-exclude are mutually exclusive") + } + } + + if *fsTypesInclude != "" && !fsTypesExcludeSet { + logger.Debug("fs-types-exclude flag not set when fs-types-include flag is set, assuming include is desired") + *fsTypesExclude = "" + } + + if *fsTypesExclude != "" && *fsTypesInclude != "" { + return deviceFilter{}, errors.New("--collector.filesystem.fs-types-exclude and --collector.filesystem.fs-types-include are mutually exclusive") + } + + if *fsTypesExclude != "" { + logger.Info("Parsed flag --collector.filesystem.fs-types-exclude", "flag", *fsTypesExclude) + } + if *fsTypesInclude != "" { + logger.Info("Parsed flag --collector.filesystem.fs-types-include", "flag", *fsTypesInclude) + } + + return newDeviceFilter(*fsTypesExclude, *fsTypesInclude), nil +} diff --git a/collector/filesystem_freebsd.go b/collector/filesystem_freebsd.go index 2a6026d3..502ae0a8 100644 --- a/collector/filesystem_freebsd.go +++ b/collector/filesystem_freebsd.go @@ -39,14 +39,14 @@ func (c *filesystemCollector) GetStats() ([]filesystemStats, error) { stats := []filesystemStats{} for _, fs := range buf { mountpoint := unix.ByteSliceToString(fs.Mntonname[:]) - if c.excludedMountPointsPattern.MatchString(mountpoint) { + if c.mountPointFilter.ignored(mountpoint) { c.logger.Debug("Ignoring mount point", "mountpoint", mountpoint) continue } device := unix.ByteSliceToString(fs.Mntfromname[:]) fstype := unix.ByteSliceToString(fs.Fstypename[:]) - if c.excludedFSTypesPattern.MatchString(fstype) { + if c.fsTypeFilter.ignored(fstype) { c.logger.Debug("Ignoring fs type", "type", fstype) continue } diff --git a/collector/filesystem_linux.go b/collector/filesystem_linux.go index 19c39717..3a7bda4d 100644 --- a/collector/filesystem_linux.go +++ b/collector/filesystem_linux.go @@ -73,12 +73,12 @@ func (c *filesystemCollector) GetStats() ([]filesystemStats, error) { go func() { for _, labels := range mps { - if c.excludedMountPointsPattern.MatchString(labels.mountPoint) { + if c.mountPointFilter.ignored(labels.mountPoint) { c.logger.Debug("Ignoring mount point", "mountpoint", labels.mountPoint) continue } - if c.excludedFSTypesPattern.MatchString(labels.fsType) { - c.logger.Debug("Ignoring fs", "type", labels.fsType) + if c.fsTypeFilter.ignored(labels.fsType) { + c.logger.Debug("Ignoring fs type", "type", labels.fsType) continue } diff --git a/collector/filesystem_netbsd.go b/collector/filesystem_netbsd.go index ba6a9f55..c7395893 100644 --- a/collector/filesystem_netbsd.go +++ b/collector/filesystem_netbsd.go @@ -97,14 +97,14 @@ func (c *filesystemCollector) GetStats() (stats []filesystemStats, err error) { stats = []filesystemStats{} for _, v := range mnt { mountpoint := unix.ByteSliceToString(v.F_mntonname[:]) - if c.excludedMountPointsPattern.MatchString(mountpoint) { + if 
c.mountPointFilter.ignored(mountpoint) { c.logger.Debug("msg", "Ignoring mount point", "mountpoint", mountpoint) continue } device := unix.ByteSliceToString(v.F_mntfromname[:]) fstype := unix.ByteSliceToString(v.F_fstypename[:]) - if c.excludedFSTypesPattern.MatchString(fstype) { + if c.fsTypeFilter.ignored(fstype) { c.logger.Debug("msg", "Ignoring fs type", "type", fstype) continue } diff --git a/collector/filesystem_openbsd.go b/collector/filesystem_openbsd.go index d7489364..fa9f8eb8 100644 --- a/collector/filesystem_openbsd.go +++ b/collector/filesystem_openbsd.go @@ -41,14 +41,14 @@ func (c *filesystemCollector) GetStats() (stats []filesystemStats, err error) { stats = []filesystemStats{} for _, v := range mnt { mountpoint := unix.ByteSliceToString(v.F_mntonname[:]) - if c.excludedMountPointsPattern.MatchString(mountpoint) { + if c.mountPointFilter.ignored(mountpoint) { c.logger.Debug("Ignoring mount point", "mountpoint", mountpoint) continue } device := unix.ByteSliceToString(v.F_mntfromname[:]) fstype := unix.ByteSliceToString(v.F_fstypename[:]) - if c.excludedFSTypesPattern.MatchString(fstype) { + if c.fsTypeFilter.ignored(fstype) { c.logger.Debug("Ignoring fs type", "type", fstype) continue } From 2604d19246c98af97a1dc802363c6bc904ee3cef Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 17 Dec 2024 14:04:22 +0100 Subject: [PATCH 03/18] build(deps): bump github.com/safchain/ethtool from 0.4.1 to 0.5.9 (#3195) Bumps [github.com/safchain/ethtool](https://github.com/safchain/ethtool) from 0.4.1 to 0.5.9. - [Release notes](https://github.com/safchain/ethtool/releases) - [Commits](https://github.com/safchain/ethtool/compare/v0.4.1...v0.5.9) --- updated-dependencies: - dependency-name: github.com/safchain/ethtool dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 4 ++-- go.sum | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/go.mod b/go.mod index e3162464..f531d53c 100644 --- a/go.mod +++ b/go.mod @@ -27,9 +27,9 @@ require ( github.com/prometheus/common v0.60.1 github.com/prometheus/exporter-toolkit v0.13.1 github.com/prometheus/procfs v0.15.1 - github.com/safchain/ethtool v0.4.1 + github.com/safchain/ethtool v0.5.9 golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 - golang.org/x/sys v0.26.0 + golang.org/x/sys v0.27.0 howett.net/plist v1.0.1 ) diff --git a/go.sum b/go.sum index 92307274..5e456fd3 100644 --- a/go.sum +++ b/go.sum @@ -87,8 +87,8 @@ github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0leargg github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= -github.com/safchain/ethtool v0.4.1 h1:S6mEleTADqgynileXoiapt/nKnatyR6bmIHoF+h2ADo= -github.com/safchain/ethtool v0.4.1/go.mod h1:XLLnZmy4OCRTkksP/UiMjij96YmIsBfmBQcs7H6tA48= +github.com/safchain/ethtool v0.5.9 h1://6RvaOKFf3nQ0rl5+8zBbE4/72455VC9Jq61pfq67E= +github.com/safchain/ethtool v0.5.9/go.mod h1:w8oSsZeowyRaM7xJJBAbubzzrOkwO8TBgPSEqPP/5mg= github.com/siebenmann/go-kstat v0.0.0-20210513183136-173c9b0a9973 h1:GfSdC6wKfTGcgCS7BtzF5694Amne1pGCSTY252WhlEY= github.com/siebenmann/go-kstat v0.0.0-20210513183136-173c9b0a9973/go.mod h1:G81aIFAMS9ECrwBYR9YxhlPjWgrItd+Kje78O6+uqm8= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -114,9 +114,8 @@ golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20211031064116-611d5d643895/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= -golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= +golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= From a38a5d7b489c72d77ef7f144d00f004473d977b6 Mon Sep 17 00:00:00 2001 From: Jan Breitkopf <103336573+kyrbrbik@users.noreply.github.com> Date: Tue, 17 Dec 2024 14:11:26 +0100 Subject: [PATCH 04/18] alerts: exclude iowait from NodeCPUHighUsage alert (#3203) Signed-off-by: Jan Breitkopf --- docs/node-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index c3464809..80b73436 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -312,7 +312,7 @@ { alert: 'NodeCPUHighUsage', expr: ||| - sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, 
mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d + sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!~"idle|iowait"}[2m]))) * 100 > %(cpuHighUsageThreshold)d ||| % $._config, 'for': '15m', labels: { From 983e6345d5f63c0e9ad1afb02a2bea426fde242a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 17 Dec 2024 18:22:08 +0100 Subject: [PATCH 05/18] build(deps): bump github.com/prometheus/exporter-toolkit (#3211) Bumps [github.com/prometheus/exporter-toolkit](https://github.com/prometheus/exporter-toolkit) from 0.13.1 to 0.13.2. - [Release notes](https://github.com/prometheus/exporter-toolkit/releases) - [Changelog](https://github.com/prometheus/exporter-toolkit/blob/master/CHANGELOG.md) - [Commits](https://github.com/prometheus/exporter-toolkit/compare/v0.13.1...v0.13.2) --- updated-dependencies: - dependency-name: github.com/prometheus/exporter-toolkit dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 18 +++++++++--------- go.sum | 39 ++++++++++++++++++++------------------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/go.mod b/go.mod index f531d53c..7d4f8471 100644 --- a/go.mod +++ b/go.mod @@ -24,12 +24,12 @@ require ( github.com/prometheus-community/go-runit v0.1.0 github.com/prometheus/client_golang v1.20.5 github.com/prometheus/client_model v0.6.1 - github.com/prometheus/common v0.60.1 - github.com/prometheus/exporter-toolkit v0.13.1 + github.com/prometheus/common v0.61.0 + github.com/prometheus/exporter-toolkit v0.13.2 github.com/prometheus/procfs v0.15.1 github.com/safchain/ethtool v0.5.9 golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 - golang.org/x/sys v0.27.0 + golang.org/x/sys v0.28.0 howett.net/plist v1.0.1 ) @@ -51,11 +51,11 @@ require ( github.com/xhit/go-str2duration/v2 v2.1.0 // indirect go.uber.org/atomic v1.7.0 // indirect go.uber.org/multierr v1.6.0 // indirect - golang.org/x/crypto v0.28.0 // indirect - golang.org/x/net v0.29.0 // indirect - golang.org/x/oauth2 v0.23.0 // indirect - golang.org/x/sync v0.8.0 // indirect - golang.org/x/text v0.19.0 // indirect - google.golang.org/protobuf v1.34.2 // indirect + golang.org/x/crypto v0.31.0 // indirect + golang.org/x/net v0.32.0 // indirect + golang.org/x/oauth2 v0.24.0 // indirect + golang.org/x/sync v0.10.0 // indirect + golang.org/x/text v0.21.0 // indirect + google.golang.org/protobuf v1.35.2 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect ) diff --git a/go.sum b/go.sum index 5e456fd3..5787f76d 100644 --- a/go.sum +++ b/go.sum @@ -79,10 +79,10 @@ github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+ github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= -github.com/prometheus/common v0.60.1 h1:FUas6GcOw66yB/73KC+BOZoFJmbo/1pojoILArPAaSc= -github.com/prometheus/common v0.60.1/go.mod h1:h0LYf1R1deLSKtD4Vdg8gy4RuOvENW2J/h19V5NADQw= -github.com/prometheus/exporter-toolkit v0.13.1 h1:Evsh0gWQo2bdOHlnz9+0Nm7/OFfIwhE2Ws4A2jIlR04= -github.com/prometheus/exporter-toolkit v0.13.1/go.mod h1:ujdv2YIOxtdFxxqtloLpbqmxd5J0Le6IITUvIRSWjj0= +github.com/prometheus/common v0.61.0 
h1:3gv/GThfX0cV2lpO7gkTUwZru38mxevy90Bj8YFSRQQ= +github.com/prometheus/common v0.61.0/go.mod h1:zr29OCN/2BsJRaFwG8QOBr41D6kkchKbpeNH7pAjb/s= +github.com/prometheus/exporter-toolkit v0.13.2 h1:Z02fYtbqTMy2i/f+xZ+UK5jy/bl1Ex3ndzh06T/Q9DQ= +github.com/prometheus/exporter-toolkit v0.13.2/go.mod h1:tCqnfx21q6qN1KA4U3Bfb8uWzXfijIrJz3/kTIqMV7g= github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= @@ -94,32 +94,33 @@ github.com/siebenmann/go-kstat v0.0.0-20210513183136-173c9b0a9973/go.mod h1:G81a github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc= github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU= go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= -golang.org/x/crypto v0.28.0 h1:GBDwsMXVQi34v5CCYUm2jkJvu4cbtru2U4TN2PSyQnw= -golang.org/x/crypto v0.28.0/go.mod h1:rmgy+3RHxRZMyY0jjAJShp2zgEdOqj2AO7U0pYmeQ7U= +golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk= golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY= -golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= -golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= -golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs= -golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= -golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= -golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI= +golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs= +golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE= +golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20211031064116-611d5d643895/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.27.0 
h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= -golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= -google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= -google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= +golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= +google.golang.org/protobuf v1.35.2 h1:8Ar7bF+apOIoThw1EdZl0p1oWvMqTHmpA2fRTyZO8io= +google.golang.org/protobuf v1.35.2/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= From ff97e35a710fbd12ca3e7fbeaf96d55b0c480bf2 Mon Sep 17 00:00:00 2001 From: Ben Kochie Date: Sun, 10 Nov 2024 11:58:03 +0100 Subject: [PATCH 06/18] Enable goimports linter Enable `goimports` golangci-lint linter and fix issues. Signed-off-by: Ben Kochie --- .golangci.yml | 10 +--------- collector/cpufreq_linux.go | 5 +++-- collector/interrupts_common.go | 3 ++- collector/runit.go | 3 ++- collector/selinux_linux.go | 3 ++- collector/softirqs_common.go | 3 ++- collector/uname.go | 3 ++- 7 files changed, 14 insertions(+), 16 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 1f1b0f63..8731aefd 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,17 +1,9 @@ linters: enable: - depguard + - goimports - misspell - revive - disable: - # Disable soon to deprecated[1] linters that lead to false - # positives when build tags disable certain files[2] - # 1: https://github.com/golangci/golangci-lint/issues/1841 - # 2: https://github.com/prometheus/node_exporter/issues/1545 - - deadcode - - unused - - structcheck - - varcheck issues: exclude-rules: diff --git a/collector/cpufreq_linux.go b/collector/cpufreq_linux.go index d6b3e42d..79e2308c 100644 --- a/collector/cpufreq_linux.go +++ b/collector/cpufreq_linux.go @@ -18,10 +18,11 @@ package collector import ( "fmt" - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/procfs/sysfs" "log/slog" "strings" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/procfs/sysfs" ) type cpuFreqCollector struct { diff --git a/collector/interrupts_common.go b/collector/interrupts_common.go index 08baa106..80cd7171 100644 --- a/collector/interrupts_common.go +++ b/collector/interrupts_common.go @@ -18,9 +18,10 @@ package collector import ( + "log/slog" + "github.com/alecthomas/kingpin/v2" "github.com/prometheus/client_golang/prometheus" - "log/slog" ) type interruptsCollector struct { diff --git a/collector/runit.go b/collector/runit.go index 8065d90c..2813bfd0 100644 --- a/collector/runit.go +++ b/collector/runit.go @@ -17,10 +17,11 @@ package collector import ( + "log/slog" + "github.com/alecthomas/kingpin/v2" "github.com/prometheus-community/go-runit/runit" "github.com/prometheus/client_golang/prometheus" - "log/slog" ) var runitServiceDir = 
kingpin.Flag("collector.runit.servicedir", "Path to runit service directory.").Default("/etc/service").String() diff --git a/collector/selinux_linux.go b/collector/selinux_linux.go index 9000fe0d..f10b43fa 100644 --- a/collector/selinux_linux.go +++ b/collector/selinux_linux.go @@ -17,9 +17,10 @@ package collector import ( + "log/slog" + "github.com/opencontainers/selinux/go-selinux" "github.com/prometheus/client_golang/prometheus" - "log/slog" ) type selinuxCollector struct { diff --git a/collector/softirqs_common.go b/collector/softirqs_common.go index cd9ccb94..73a9f7be 100644 --- a/collector/softirqs_common.go +++ b/collector/softirqs_common.go @@ -18,9 +18,10 @@ package collector import ( "fmt" + "log/slog" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/procfs" - "log/slog" ) type softirqsCollector struct { diff --git a/collector/uname.go b/collector/uname.go index 6b4f06e5..c3e96b0b 100644 --- a/collector/uname.go +++ b/collector/uname.go @@ -18,8 +18,9 @@ package collector import ( - "github.com/prometheus/client_golang/prometheus" "log/slog" + + "github.com/prometheus/client_golang/prometheus" ) var unameDesc = prometheus.NewDesc( From d0c1d00d18867297326bc7b1c21de36ae50e6a92 Mon Sep 17 00:00:00 2001 From: Tom <12222103+critchtionary@users.noreply.github.com> Date: Thu, 19 Dec 2024 15:49:22 +0000 Subject: [PATCH 07/18] Migrate dashboards to new grafonnet library (#3147) Migrated away from deprecated Grafonnet library. This replaces panels using Angular JS which are disabled by default in Grafana 11 and will be unsupported in Grafana 12. Fixes #3046 --------- Signed-off-by: Tom <12222103+critchtionary@users.noreply.github.com> --- docs/node-mixin/dashboards/use.libsonnet | 825 ++++++++++---------- docs/node-mixin/jsonnetfile.json | 15 +- docs/node-mixin/lib/prom-mixin.libsonnet | 925 +++++++++++------------ 3 files changed, 871 insertions(+), 894 deletions(-) diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet index fce411ab..f9d9e07c 100644 --- a/docs/node-mixin/dashboards/use.libsonnet +++ b/docs/node-mixin/dashboards/use.libsonnet @@ -1,469 +1,480 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local grafana = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = grafana.template; -local graphPanel = grafana.graphPanel; +local variable = dashboard.variable; +local row = grafana.panel.row; +local prometheus = grafana.query.prometheus; + +local timeSeriesPanel = grafana.panel.timeSeries; +local tsOptions = timeSeriesPanel.options; +local tsStandardOptions = timeSeriesPanel.standardOptions; +local tsQueryOptions = timeSeriesPanel.queryOptions; +local tsCustom = timeSeriesPanel.fieldConfig.defaults.custom; +local tsLegend = tsOptions.legend; local c = import '../config.libsonnet'; -local datasourceTemplate = { - current: { - text: 'default', - value: 'default', - }, - hide: 0, - label: 'Data Source', - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', -}; +local datasource = variable.datasource.new( + 'datasource', 'prometheus' +); + +local tsCommonPanelOptions = + variable.query.withDatasourceFromVariable(datasource) + + tsCustom.stacking.withMode('normal') + + tsCustom.withFillOpacity(100) + + tsCustom.withShowPoints('never') + + tsLegend.withShowLegend(false) + + 
tsOptions.tooltip.withMode('multi') + + tsOptions.tooltip.withSort('desc'); local CPUUtilisation = - graphPanel.new( + timeSeriesPanel.new( 'CPU Utilisation', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; + ) + + tsCommonPanelOptions + + tsStandardOptions.withUnit('percentunit'); local CPUSaturation = // TODO: Is this a useful panel? At least there should be some explanation how load // average relates to the "CPU saturation" in the title. - graphPanel.new( + timeSeriesPanel.new( 'CPU Saturation (Load1 per CPU)', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; + ) + + tsCommonPanelOptions + + tsStandardOptions.withUnit('percentunit'); local memoryUtilisation = - graphPanel.new( + timeSeriesPanel.new( 'Memory Utilisation', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; + ) + + tsCommonPanelOptions + + tsStandardOptions.withUnit('percentunit'); local memorySaturation = - graphPanel.new( + timeSeriesPanel.new( 'Memory Saturation (Major Page Faults)', - datasource='$datasource', - span=6, - format='rds', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; + ) + + tsCommonPanelOptions + + tsStandardOptions.withUnit('rds'); + +local networkOverrides = tsStandardOptions.withOverrides( + [ + tsStandardOptions.override.byRegexp.new('/Transmit/') + + tsStandardOptions.override.byRegexp.withPropertiesFromOptions( + tsCustom.withTransform('negative-Y') + ), + ] +); local networkUtilisation = - graphPanel.new( + timeSeriesPanel.new( 'Network Utilisation (Bytes Receive/Transmit)', - datasource='$datasource', - span=6, - format='Bps', - stack=true, - fill=10, - legend_show=false, ) - .addSeriesOverride({ alias: '/Receive/', stack: 'A' }) - .addSeriesOverride({ alias: '/Transmit/', stack: 'B', transform: 'negative-Y' }) - { tooltip+: { sort: 2 } }; + + tsCommonPanelOptions + + tsStandardOptions.withUnit('Bps') + + networkOverrides; local networkSaturation = - graphPanel.new( + timeSeriesPanel.new( 'Network Saturation (Drops Receive/Transmit)', - datasource='$datasource', - span=6, - format='Bps', - stack=true, - fill=10, - legend_show=false, ) - .addSeriesOverride({ alias: '/ Receive/', stack: 'A' }) - .addSeriesOverride({ alias: '/ Transmit/', stack: 'B', transform: 'negative-Y' }) - { tooltip+: { sort: 2 } }; + + tsCommonPanelOptions + + tsStandardOptions.withUnit('Bps') + + networkOverrides; local diskIOUtilisation = - graphPanel.new( + timeSeriesPanel.new( 'Disk IO Utilisation', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; + ) + + tsCommonPanelOptions + + tsStandardOptions.withUnit('percentunit'); local diskIOSaturation = - graphPanel.new( + timeSeriesPanel.new( 'Disk IO Saturation', - datasource='$datasource', - span=6, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; + ) + + tsCommonPanelOptions + + tsStandardOptions.withUnit('percentunit'); local diskSpaceUtilisation = - graphPanel.new( + timeSeriesPanel.new( 'Disk Space Utilisation', - datasource='$datasource', - span=12, - format='percentunit', - stack=true, - fill=10, - legend_show=false, - ) { tooltip+: { sort: 2 } }; + ) + + tsCommonPanelOptions + + tsStandardOptions.withUnit('percentunit'); { - 
_clusterTemplate:: template.new( - name='cluster', - datasource='$datasource', - query='label_values(node_time_seconds, %s)' % $._config.clusterLabel, - current='', - hide=if $._config.showMultiCluster then '' else '2', - refresh=2, - includeAll=false, - sort=1 - ), + _clusterVariable:: + variable.query.new('cluster') + + variable.query.withDatasourceFromVariable(datasource) + + variable.query.queryTypes.withLabelValues( + $._config.clusterLabel, + 'node_time_seconds', + ) + + (if $._config.showMultiCluster then variable.query.generalOptions.showOnDashboard.withLabelAndValue() else variable.query.generalOptions.showOnDashboard.withNothing()) + + variable.query.refresh.onTime() + + variable.query.selectionOptions.withIncludeAll(false) + + variable.query.withSort(asc=true), grafanaDashboards+:: { 'node-rsrc-use.json': - dashboard.new( '%sUSE Method / Node' % $._config.dashboardNamePrefix, - time_from='now-1h', - tags=($._config.dashboardTags), - timezone='utc', - refresh='30s', - graphTooltip='shared_crosshair', - uid=std.md5('node-rsrc-use.json') ) - .addTemplate(datasourceTemplate) - .addTemplate($._clusterTemplate) - .addTemplate( - template.new( + + dashboard.time.withFrom('now-1h') + + dashboard.withTags($._config.dashboardTags) + + dashboard.withTimezone('utc') + + dashboard.withRefresh('30s') + + dashboard.graphTooltip.withSharedCrosshair() + + dashboard.withUid(std.md5('node-rsrc-use.json')) + + dashboard.withVariables([ + datasource, + $._clusterVariable, + variable.query.new('instance') + + variable.query.withDatasourceFromVariable(datasource) + + variable.query.queryTypes.withLabelValues( 'instance', - '$datasource', - 'label_values(node_exporter_build_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}, instance)' % $._config, - refresh='time', - sort=1 - ) - ) - .addRow( - row.new('CPU') - .addPanel(CPUUtilisation.addTarget(prometheus.target('instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Utilisation'))) - .addPanel(CPUSaturation.addTarget(prometheus.target('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Saturation'))) - ) - .addRow( - row.new('Memory') - .addPanel(memoryUtilisation.addTarget(prometheus.target('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Utilisation'))) - .addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Major page Faults'))) - ) - .addRow( - row.new('Network') - .addPanel( - networkUtilisation - .addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Receive')) - .addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Transmit')) - ) - .addPanel( - networkSaturation - .addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, 
legendFormat='Receive')) - .addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Transmit')) - ) - ) - .addRow( - row.new('Disk IO') - .addPanel(diskIOUtilisation.addTarget(prometheus.target('instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{device}}'))) - .addPanel(diskIOSaturation.addTarget(prometheus.target('instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{device}}'))) - ) - .addRow( - row.new('Disk Space') - .addPanel( - diskSpaceUtilisation.addTarget(prometheus.target( - ||| - sort_desc(1 - - ( - max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"}) - / - max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"}) - ) != 0 - ) - ||| % $._config, legendFormat='{{device}}' - )) + 'node_exporter_build_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}' % $._config, ) + + variable.query.refresh.onTime() + + variable.query.withSort(asc=true), + ]) + + dashboard.withPanels( + grafana.util.grid.makeGrid([ + row.new('CPU') + + row.withPanels([ + CPUUtilisation + tsQueryOptions.withTargets([prometheus.new('$datasource', 'instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Utilisation')]), + CPUSaturation + tsQueryOptions.withTargets([prometheus.new('$datasource', 'instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Saturation')]), + ]), + row.new('Memory') + + row.withPanels([ + memoryUtilisation + tsQueryOptions.withTargets([prometheus.new('$datasource', 'instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Utilisation')]), + memorySaturation + tsQueryOptions.withTargets([prometheus.new('$datasource', 'instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Major page Faults')]), + ]), + row.new('Network') + + row.withPanels([ + networkUtilisation + tsQueryOptions.withTargets([ + prometheus.new('$datasource', 'instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Receive'), + prometheus.new('$datasource', 'instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Transmit'), + ]), + networkSaturation + tsQueryOptions.withTargets([ + prometheus.new('$datasource', 'instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + 
prometheus.withLegendFormat('Receive'), + prometheus.new('$datasource', 'instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Transmit'), + ]), + ]), + row.new('Disk IO') + + row.withPanels([ + diskIOUtilisation + tsQueryOptions.withTargets([prometheus.new('$datasource', 'instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('{{device}}')]), + diskIOSaturation + tsQueryOptions.withTargets([prometheus.new('$datasource', 'instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('{{device}}')]), + ]), + ], panelWidth=12, panelHeight=7) + + grafana.util.grid.makeGrid([ + row.new('Disk Space') + + row.withPanels([ + diskSpaceUtilisation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + sort_desc(1 - + ( + max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"}) + / + max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"}) + ) != 0 + ) + ||| % $._config + ) + prometheus.withLegendFormat('{{device}}'), + ]), + ]), + ], panelWidth=24, panelHeight=7, startY=34), ), - 'node-cluster-rsrc-use.json': dashboard.new( '%sUSE Method / Cluster' % $._config.dashboardNamePrefix, - time_from='now-1h', - tags=($._config.dashboardTags), - timezone='utc', - refresh='30s', - graphTooltip='shared_crosshair', - uid=std.md5('node-cluster-rsrc-use.json') ) - .addTemplate(datasourceTemplate) - .addTemplate($._clusterTemplate) - .addRow( - row.new('CPU') - .addPanel( - CPUUtilisation - .addTarget(prometheus.target( - ||| - (( - instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - * - instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - ) != 0 ) - / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ||| % $._config, legendFormat='{{ instance }}' - )) - ) - .addPanel( - CPUSaturation - .addTarget(prometheus.target( - ||| - ( - instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ) != 0 - ||| % $._config, legendFormat='{{instance}}' - )) - ) - ) - .addRow( - row.new('Memory') - .addPanel( - memoryUtilisation - .addTarget(prometheus.target( - ||| - ( - instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ) != 0 - ||| % $._config, legendFormat='{{instance}}', - )) - ) - .addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{instance}}'))) - ) - .addRow( - row.new('Network') - .addPanel( - networkUtilisation - .addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, 
%(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive')) - .addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit')) - ) - .addPanel( - networkSaturation - .addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive')) - .addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit')) - ) - ) - .addRow( - row.new('Disk IO') - .addPanel( - diskIOUtilisation - .addTarget(prometheus.target( - ||| - ( - instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} - / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ) != 0 - ||| % $._config, legendFormat='{{instance}} {{device}}' - )) - ) - .addPanel( - diskIOSaturation - .addTarget(prometheus.target( - ||| - ( + + dashboard.time.withFrom('now-1h') + + dashboard.withTags($._config.dashboardTags) + + dashboard.withTimezone('utc') + + dashboard.withRefresh('30s') + + dashboard.graphTooltip.withSharedCrosshair() + + dashboard.withUid(std.md5('node-cluster-rsrc-use.json')) + + dashboard.withVariables([ + datasource, + $._clusterVariable, + variable.query.withDatasourceFromVariable(datasource) + + variable.query.refresh.onTime() + + variable.query.withSort(asc=true), + ]) + + dashboard.withPanels( + grafana.util.grid.makeGrid([ + row.new('CPU') + + row.withPanels([ + CPUUtilisation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + (( + instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} + * + instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} + ) != 0 ) + / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) + ||| % $._config + ) + prometheus.withLegendFormat('{{ instance }}'), + ]), + CPUSaturation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + ( + instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} + / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) + ) != 0 + ||| % $._config + ) + prometheus.withLegendFormat('{{ instance }}'), + ]), + ]), + row.new('Memory') + + row.withPanels([ + memoryUtilisation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + ( + instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} + / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) + ) != 0 + ||| % $._config + ) + prometheus.withLegendFormat('{{ instance }}'), + ]), + memorySaturation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + 'instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}' % $._config + ) + prometheus.withLegendFormat('{{ instance }}'), + ]), + ]), + row.new('Network') + + row.withPanels([ + networkUtilisation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + 
'instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config + ) + prometheus.withLegendFormat('{{ instance }} Receive'), + prometheus.new( + '$datasource', + 'instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config + ) + prometheus.withLegendFormat('{{ instance }} Transmit'), + ]), + networkSaturation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + 'instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config + ) + prometheus.withLegendFormat('{{ instance }} Receive'), + prometheus.new( + '$datasource', + 'instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config + ) + prometheus.withLegendFormat('{{ instance }} Transmit'), + ]), + ]), + row.new('Disk IO') + + row.withPanels([ + diskIOUtilisation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} + / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) + ||| % $._config + ) + prometheus.withLegendFormat('{{ instance }} {{device}}'), + ]), + diskIOSaturation + tsQueryOptions.withTargets([prometheus.new( + '$datasource', + ||| instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"})) - ) != 0 - ||| % $._config, legendFormat='{{instance}} {{device}}' - )) - ) - ) - .addRow( - row.new('Disk Space') - .addPanel( - diskSpaceUtilisation - .addTarget(prometheus.target( - ||| - sum without (device) ( - max without (fstype, mountpoint) (( - node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"} - - - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"} - ) != 0) - ) - / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"}))) - ||| % $._config, legendFormat='{{instance}}' - )) - ) + ||| % $._config + ) + prometheus.withLegendFormat('{{ instance }} {{device}}')]), + ]), + ], panelWidth=12, panelHeight=7) + + grafana.util.grid.makeGrid([ + row.new('Disk Space') + + row.withPanels([ + diskSpaceUtilisation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + sum without (device) ( + max without (fstype, mountpoint) (( + node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"} + - + node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"} + ) != 0) + ) + / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"}))) + ||| % $._config + ) + prometheus.withLegendFormat('{{ instance }}'), + ]), + ]), + ], panelWidth=24, panelHeight=7, startY=34), ), } + if 
$._config.showMultiCluster then { 'node-multicluster-rsrc-use.json': dashboard.new( '%sUSE Method / Multi-cluster' % $._config.dashboardNamePrefix, - time_from='now-1h', - tags=($._config.dashboardTags), - timezone='utc', - refresh='30s', - graphTooltip='shared_crosshair', - uid=std.md5('node-multicluster-rsrc-use.json') ) - .addTemplate(datasourceTemplate) - .addRow( - row.new('CPU') - .addPanel( - CPUUtilisation - .addTarget(prometheus.target( - ||| - sum( - (( - instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s} - * - instance:node_num_cpu:sum{%(nodeExporterSelector)s} - ) != 0) - / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s})) - ) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) - .addPanel( - CPUSaturation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s} - / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})) - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) - ) - .addRow( - row.new('Memory') - .addPanel( - memoryUtilisation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s} - / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s})) - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) - .addPanel( - memorySaturation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) - ) - .addRow( - row.new('Network') - .addPanel( - networkUtilisation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} Receive' % $._config - )) - .addTarget(prometheus.target( - ||| - sum(( - instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} Transmit' % $._config - )) - ) - .addPanel( - networkSaturation - .addTarget(prometheus.target( - ||| - sum(( - instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} Receive' % $._config - )) - .addTarget(prometheus.target( - ||| - sum(( - instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} - ) != 0) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} Transmit' % $._config - )) - ) - ) - .addRow( - row.new('Disk IO') - .addPanel( - diskIOUtilisation - .addTarget(prometheus.target( - ||| - sum(( - instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s} - / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s})) - ) != 0) by (%(clusterLabel)s, device) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} {{device}}' % $._config - )) - ) - .addPanel( - diskIOSaturation - .addTarget(prometheus.target( - ||| - sum(( - instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s} - / 
scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s})) - ) != 0) by (%(clusterLabel)s, device) - ||| % $._config, legendFormat='{{%(clusterLabel)s}} {{device}}' % $._config - )) - ) - ) - .addRow( - row.new('Disk Space') - .addPanel( - diskSpaceUtilisation - .addTarget(prometheus.target( - ||| - sum ( - sum without (device) ( - max without (fstype, mountpoint, instance, pod) (( - node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - ) != 0) - ) - / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}))) - ) by (%(clusterLabel)s) - ||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config - )) - ) + + dashboard.time.withFrom('now-1h') + + dashboard.withTags($._config.dashboardTags) + + dashboard.withTimezone('utc') + + dashboard.withRefresh('30s') + + dashboard.graphTooltip.withSharedCrosshair() + + dashboard.withUid(std.md5('node-multicluster-rsrc-use.json')) + + dashboard.withVariables([ + datasource, + variable.query.withDatasourceFromVariable(datasource) + + variable.query.refresh.onTime() + + variable.query.withSort(asc=true), + ]) + + dashboard.withPanels( + grafana.util.grid.makeGrid([ + row.new('CPU') + + row.withPanels([ + CPUUtilisation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + sum( + (( + instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s} + * + instance:node_num_cpu:sum{%(nodeExporterSelector)s} + ) != 0) + / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s})) + ) by (%(clusterLabel)s) + ||| % $._config + ) + prometheus.withLegendFormat('{{%(clusterLabel)s}}'), + ]), + CPUSaturation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + sum(( + instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s} + / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})) + ) != 0) by (%(clusterLabel)s) + ||| % $._config + ) + prometheus.withLegendFormat('{{%(clusterLabel)s}}'), + ]), + ]), + row.new('Memory') + + row.withPanels([ + memoryUtilisation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + sum(( + instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s} + / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s})) + ) != 0) by (%(clusterLabel)s) + ||| % $._config + ) + prometheus.withLegendFormat('{{%(clusterLabel)s}}'), + ]), + memorySaturation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + sum(( + instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s} + ) != 0) by (%(clusterLabel)s) + ||| + % $._config + ) + prometheus.withLegendFormat('{{%(clusterLabel)s}}'), + ]), + ]), + row.new('Network') + + row.withPanels([ + networkUtilisation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + sum(( + instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} + ) != 0) by (%(clusterLabel)s) + ||| % $._config + ) + prometheus.withLegendFormat('{{%(clusterLabel)s}} Receive'), + prometheus.new( + '$datasource', + ||| + sum(( + instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} + ) != 0) by (%(clusterLabel)s) + ||| % $._config + ) + prometheus.withLegendFormat('{{%(clusterLabel)s}} 
Transmit'), + ]), + networkSaturation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + sum(( + instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} + ) != 0) by (%(clusterLabel)s) + ||| % $._config + ) + prometheus.withLegendFormat('{{%(clusterLabel)s}} Receive'), + prometheus.new( + '$datasource', + ||| + sum(( + instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s} + ) != 0) by (%(clusterLabel)s) + ||| % $._config + ) + prometheus.withLegendFormat('{{%(clusterLabel)s}} Transmit'), + ]), + ]), + row.new('Disk IO') + + row.withPanels([ + diskIOUtilisation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + sum(( + instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s} + / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s})) + ) != 0) by (%(clusterLabel)s, device) + ||| % $._config + ) + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{device}}'), + ]), + diskIOSaturation + tsQueryOptions.withTargets([prometheus.new( + '$datasource', + ||| + sum(( + instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s} + / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s})) + ) != 0) by (%(clusterLabel)s, device) + ||| % $._config + ) + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{device}}')]), + ]), + + ], panelWidth=12, panelHeight=7) + + grafana.util.grid.makeGrid([ + row.new('Disk Space') + + row.withPanels([ + diskSpaceUtilisation + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + sum ( + sum without (device) ( + max without (fstype, mountpoint, instance, pod) (( + node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} + ) != 0) + ) + / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}))) + ) by (%(clusterLabel)s) + ||| % $._config + ) + prometheus.withLegendFormat('{{%(clusterLabel)s}}'), + ]), + ]), + ], panelWidth=24, panelHeight=7, startY=34), ), } else {}, } diff --git a/docs/node-mixin/jsonnetfile.json b/docs/node-mixin/jsonnetfile.json index 721d4833..2d56d912 100644 --- a/docs/node-mixin/jsonnetfile.json +++ b/docs/node-mixin/jsonnetfile.json @@ -4,20 +4,11 @@ { "source": { "git": { - "remote": "https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet" + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-latest" } }, - "version": "master" - }, - { - "source": { - "git": { - "remote": "https://github.com/grafana/grafonnet-lib.git", - "subdir": "grafonnet-7.0" - } - }, - "version": "master" + "version": "main" } ], "legacyImports": false diff --git a/docs/node-mixin/lib/prom-mixin.libsonnet b/docs/node-mixin/lib/prom-mixin.libsonnet index bd01cfd4..f18c273c 100644 --- a/docs/node-mixin/lib/prom-mixin.libsonnet +++ b/docs/node-mixin/lib/prom-mixin.libsonnet @@ -1,560 +1,535 @@ -local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local grafana = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; local dashboard = grafana.dashboard; -local row = grafana.row; -local prometheus = grafana.prometheus; -local template = 
grafana.template; -local graphPanel = grafana.graphPanel; -local grafana70 = import 'github.com/grafana/grafonnet-lib/grafonnet-7.0/grafana.libsonnet'; -local gaugePanel = grafana70.panel.gauge; -local table = grafana70.panel.table; +local row = grafana.panel.row; +local prometheus = grafana.query.prometheus; +local variable = dashboard.variable; + +local timeSeriesPanel = grafana.panel.timeSeries; +local tsOptions = timeSeriesPanel.options; +local tsStandardOptions = timeSeriesPanel.standardOptions; +local tsQueryOptions = timeSeriesPanel.queryOptions; +local tsCustom = timeSeriesPanel.fieldConfig.defaults.custom; + +local gaugePanel = grafana.panel.gauge; +local gaugeStep = gaugePanel.standardOptions.threshold.step; + +local table = grafana.panel.table; +local tableStep = table.standardOptions.threshold.step; +local tableOverride = table.standardOptions.override; +local tableTransformation = table.queryOptions.transformation; { new(config=null, platform=null, uid=null):: { - local prometheusDatasourceTemplate = { - current: { - text: 'default', - value: 'default', - }, - hide: 0, - label: 'Data Source', - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, + local prometheusDatasourceVariable = variable.datasource.new( + 'datasource', 'prometheus' + ), - local clusterTemplatePrototype = - template.new( - 'cluster', - '$datasource', - '', - hide=if config.showMultiCluster then '' else '2', - refresh='time', - label='Cluster', - ), - local clusterTemplate = - if platform == 'Darwin' then - clusterTemplatePrototype - { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, sysname="Darwin"}, %(clusterLabel)s)' % config } - else - clusterTemplatePrototype - { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, sysname!="Darwin"}, %(clusterLabel)s)' % config }, + local clusterVariablePrototype = + variable.query.new('cluster') + + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable) + + (if config.showMultiCluster then variable.query.generalOptions.showOnDashboard.withLabelAndValue() else variable.query.generalOptions.showOnDashboard.withNothing()) + + variable.query.refresh.onTime() + + variable.query.generalOptions.withLabel('Cluster'), - local instanceTemplatePrototype = - template.new( - 'instance', - '$datasource', - '', - refresh='time', - label='Instance', - ), - local instanceTemplate = + local clusterVariable = if platform == 'Darwin' then - instanceTemplatePrototype - { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster", sysname="Darwin"}, instance)' % config } + clusterVariablePrototype + + variable.query.queryTypes.withLabelValues( + ' %(clusterLabel)s' % config, + 'node_uname_info{%(nodeExporterSelector)s, sysname="Darwin"}' % config, + ) else - instanceTemplatePrototype - { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster", sysname!="Darwin"}, instance)' % config }, + clusterVariablePrototype + + variable.query.queryTypes.withLabelValues( + '%(clusterLabel)s' % config, + 'node_uname_info{%(nodeExporterSelector)s, sysname!="Darwin"}' % config, + ), + + local instanceVariablePrototype = + variable.query.new('instance') + + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable) + + variable.query.refresh.onTime() + + variable.query.generalOptions.withLabel('Instance'), + + local instanceVariable = + if platform == 'Darwin' then + instanceVariablePrototype + + 
variable.query.queryTypes.withLabelValues( + 'instance', + 'node_uname_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster", sysname="Darwin"}' % config, + ) + else + instanceVariablePrototype + + variable.query.queryTypes.withLabelValues( + 'instance', + 'node_uname_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster", sysname!="Darwin"}' % config, + ), local idleCPU = - graphPanel.new( - 'CPU Usage', - datasource='$datasource', - span=6, - format='percentunit', - max=1, - min=0, - stack=true, - ) - .addTarget(prometheus.target( - ||| - ( - (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode=~"idle|iowait|steal", instance="$instance", %(clusterLabel)s="$cluster"}[$__rate_interval]))) - / ignoring(cpu) group_left - count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance", %(clusterLabel)s="$cluster"}) - ) - ||| % config, - legendFormat='{{cpu}}', - intervalFactor=5, - )), - - local systemLoad = - graphPanel.new( - 'Load Average', - datasource='$datasource', - span=6, - format='short', - min=0, - fill=0, - ) - .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='1m load average')) - .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='5m load average')) - .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='15m load average')) - .addTarget(prometheus.target('count(node_cpu_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", mode="idle"})' % config, legendFormat='logical cores')), - - local memoryGraphPanelPrototype = - graphPanel.new( - 'Memory Usage', - datasource='$datasource', - span=9, - format='bytes', - min=0, - ), - local memoryGraph = - if platform == 'Linux' then - memoryGraphPanelPrototype { stack: true } - .addTarget(prometheus.target( + timeSeriesPanel.new('CPU Usage') + + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable) + + tsStandardOptions.withUnit('percentunit') + + tsCustom.stacking.withMode('normal') + + tsStandardOptions.withMax(1) + + tsStandardOptions.withMin(0) + + tsOptions.tooltip.withMode('multi') + + tsCustom.withFillOpacity(10) + + tsCustom.withShowPoints('never') + + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', ||| ( - node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - - - node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - - - node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - - - node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode=~"idle|iowait|steal", instance="$instance", %(clusterLabel)s="$cluster"}[$__rate_interval]))) + / ignoring(cpu) group_left + count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance", %(clusterLabel)s="$cluster"}) ) ||| % config, - legendFormat='memory used' - )) - .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='memory buffers')) - 
.addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='memory cached')) - .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='memory free')) + ) + + prometheus.withLegendFormat('{{cpu}}') + + prometheus.withIntervalFactor(5), + ]), + + local systemLoad = + timeSeriesPanel.new('Load Average') + + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable) + + tsStandardOptions.withUnit('short') + + tsStandardOptions.withMin(0) + + tsCustom.withFillOpacity(0) + + tsCustom.withShowPoints('never') + + tsOptions.tooltip.withMode('multi') + + tsQueryOptions.withTargets([ + prometheus.new('$datasource', 'node_load1{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config) + prometheus.withLegendFormat('1m load average'), + prometheus.new('$datasource', 'node_load5{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config) + prometheus.withLegendFormat('5m load average'), + prometheus.new('$datasource', 'node_load15{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config) + prometheus.withLegendFormat('15m load average'), + prometheus.new('$datasource', 'count(node_cpu_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", mode="idle"})' % config) + prometheus.withLegendFormat('logical cores'), + ]), + + local memoryGraphPanelPrototype = + timeSeriesPanel.new('Memory Usage') + + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable) + + tsStandardOptions.withUnit('bytes') + + tsStandardOptions.withMin(0) + + tsOptions.tooltip.withMode('multi') + + tsCustom.withFillOpacity(10) + + tsCustom.withShowPoints('never'), + + local memoryGraph = + if platform == 'Linux' then + memoryGraphPanelPrototype + + tsCustom.stacking.withMode('normal') + + tsQueryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + ( + node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + - + node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + - + node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + - + node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + ) + ||| % config, + ) + prometheus.withLegendFormat('memory used'), + prometheus.new('$datasource', 'node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config) + prometheus.withLegendFormat('memory buffers'), + prometheus.new('$datasource', 'node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config) + prometheus.withLegendFormat('memory cached'), + prometheus.new('$datasource', 'node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config) + prometheus.withLegendFormat('memory free'), + ]) else if platform == 'Darwin' then // not useful to stack - memoryGraphPanelPrototype { stack: false } - .addTarget(prometheus.target('node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='Physical Memory')) - .addTarget(prometheus.target( - ||| - ( - 
node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - - node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + - node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + - node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - ) - ||| % config, legendFormat='Memory Used' - )) - .addTarget(prometheus.target( - ||| - ( - node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - - node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - ) - ||| % config, legendFormat='App Memory' - )) - .addTarget(prometheus.target('node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='Wired Memory')) - .addTarget(prometheus.target('node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='Compressed')) + memoryGraphPanelPrototype + + tsCustom.stacking.withMode('none') + + tsQueryOptions.withTargets([ + prometheus.new('$datasource', 'node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config) + prometheus.withLegendFormat('Physical Memory'), + prometheus.new( + '$datasource', + ||| + ( + node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - + node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + + node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + + node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + ) + ||| % config + ) + prometheus.withLegendFormat( + 'Memory Used' + ), + prometheus.new( + '$datasource', + ||| + ( + node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - + node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + ) + ||| % config + ) + prometheus.withLegendFormat( + 'App Memory' + ), + prometheus.new('$datasource', 'node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config) + prometheus.withLegendFormat('Wired Memory'), + prometheus.new('$datasource', 'node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config) + prometheus.withLegendFormat('Compressed'), + ]) + else if platform == 'AIX' then - memoryGraphPanelPrototype { stack: false } - .addTarget(prometheus.target('node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='Physical Memory')) - .addTarget(prometheus.target( - ||| - ( - node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - - node_memory_available_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - ) - ||| % config, legendFormat='Memory Used' - )), + memoryGraphPanelPrototype + + tsCustom.stacking.withMode('none') + + tsQueryOptions.withTargets([ + prometheus.new('$datasource', 'node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config) + 
prometheus.withLegendFormat('Physical Memory'), + prometheus.new( + '$datasource', + ||| + ( + node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - + node_memory_available_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + ) + ||| % config + ) + prometheus.withLegendFormat('Memory Used'), + ]), // NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout. local memoryGaugePanelPrototype = - gaugePanel.new( - title='Memory Usage', - datasource='$datasource', - ) - .addThresholdStep('rgba(50, 172, 45, 0.97)') - .addThresholdStep('rgba(237, 129, 40, 0.89)', 80) - .addThresholdStep('rgba(245, 54, 54, 0.9)', 90) - .setFieldConfig(max=100, min=0, unit='percent') - + { - span: 3, - }, + gaugePanel.new('Memory Usage') + + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable) + + gaugePanel.standardOptions.thresholds.withSteps([ + gaugeStep.withColor('rgba(50, 172, 45, 0.97)'), + gaugeStep.withColor('rgba(237, 129, 40, 0.89)') + gaugeStep.withValue(80), + gaugeStep.withColor('rgba(245, 54, 54, 0.9)') + gaugeStep.withValue(90), + ]) + + gaugePanel.standardOptions.withMax(100) + + gaugePanel.standardOptions.withMin(0) + + gaugePanel.standardOptions.withUnit('percent'), local memoryGauge = if platform == 'Linux' then memoryGaugePanelPrototype - - .addTarget(prometheus.target( - ||| - 100 - - ( - avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) / - avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) - * 100 - ) - ||| % config, - )) + + gaugePanel.queryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + 100 - + ( + avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) / + avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) + * 100 + ) + ||| % config, + ), + ]) else if platform == 'Darwin' then memoryGaugePanelPrototype - .addTarget(prometheus.target( - ||| - ( - ( - avg(node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) - - avg(node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) + - avg(node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) + - avg(node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) - ) / - avg(node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) - ) - * - 100 - ||| % config - )) + + gaugePanel.queryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + ( + ( + avg(node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) - + avg(node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) + + avg(node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) + + avg(node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) + ) / + avg(node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) + ) + * + 100 + ||| % config + ), + ]) + else if platform == 'AIX' then memoryGaugePanelPrototype - .addTarget(prometheus.target( - ||| - 100 - 
- ( - avg(node_memory_available_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) / - avg(node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) - * 100 - ) - ||| % config - )), + + gaugePanel.queryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + 100 - + ( + avg(node_memory_available_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) / + avg(node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) + * 100 + ) + ||| % config + ), + ]), local diskIO = - graphPanel.new( - 'Disk I/O', - datasource='$datasource', - span=6, - min=0, - fill=0, - ) - // TODO: Does it make sense to have those three in the same panel? - .addTarget(prometheus.target( - 'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(diskDeviceSelector)s}[$__rate_interval])' % config, - legendFormat='{{device}} read', - intervalFactor=1, - )) - .addTarget(prometheus.target( - 'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(diskDeviceSelector)s}[$__rate_interval])' % config, - legendFormat='{{device}} written', - intervalFactor=1, - )) - .addTarget(prometheus.target( - 'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(diskDeviceSelector)s}[$__rate_interval])' % config, - legendFormat='{{device}} io time', - intervalFactor=1, - )) + - { - seriesOverrides: [ - { - alias: '/ read| written/', - yaxis: 1, - }, - { - alias: '/ io time/', - yaxis: 2, - }, - ], - yaxes: [ - self.yaxe(format='Bps'), - self.yaxe(format='percentunit'), - ], - }, + timeSeriesPanel.new('Disk I/O') + + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable) + + tsStandardOptions.withMin(0) + + tsCustom.withFillOpacity(0) + + tsCustom.withShowPoints('never') + + tsOptions.tooltip.withMode('multi') + + tsQueryOptions.withTargets([ + // TODO: Does it make sense to have those three in the same panel? 
+ prometheus.new('$datasource', 'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(diskDeviceSelector)s}[$__rate_interval])' % config) + + prometheus.withLegendFormat('{{device}} read') + + prometheus.withIntervalFactor(1), + prometheus.new('$datasource', 'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(diskDeviceSelector)s}[$__rate_interval])' % config) + + prometheus.withLegendFormat('{{device}} written') + + prometheus.withIntervalFactor(1), + prometheus.new('$datasource', 'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(diskDeviceSelector)s}[$__rate_interval])' % config) + + prometheus.withLegendFormat('{{device}} io time') + + prometheus.withIntervalFactor(1), + ]) + + tsStandardOptions.withOverrides( + [ + tsStandardOptions.override.byRegexp.new('/ read| written/') + + tsStandardOptions.override.byRegexp.withPropertiesFromOptions( + tsStandardOptions.withUnit('Bps') + ), + tsStandardOptions.override.byRegexp.new('/ io time/') + + tsStandardOptions.override.byRegexp.withPropertiesFromOptions(tsStandardOptions.withUnit('percentunit')), + ] + ), local diskSpaceUsage = - table.new( - title='Disk Space Usage', - datasource='$datasource', - ) - .setFieldConfig(unit='decbytes') - .addThresholdStep(color='green', value=null) - .addThresholdStep(color='yellow', value=0.8) - .addThresholdStep(color='red', value=0.9) - .addTarget(prometheus.target( - ||| - max by (mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(fsSelector)s, %(fsMountpointSelector)s}) - ||| % config, - legendFormat='', - instant=true, - format='table' - )) - .addTarget(prometheus.target( - ||| - max by (mountpoint) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(fsSelector)s, %(fsMountpointSelector)s}) - ||| % config, - legendFormat='', - instant=true, - format='table' - )) - .addOverride( - matcher={ - id: 'byName', - options: 'Mounted on', - }, - properties=[ - { - id: 'custom.width', - value: 260, - }, - ], - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Size', - }, - properties=[ - - { - id: 'custom.width', - value: 93, - }, - - ], - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Used', - }, - properties=[ - { - id: 'custom.width', - value: 72, - }, - ], - ) - .addOverride( - matcher={ - id: 'byName', - options: 'Available', - }, - properties=[ - { - id: 'custom.width', - value: 88, - }, - ], - ) - - .addOverride( - matcher={ - id: 'byName', - options: 'Used, %', - }, - properties=[ - { - id: 'unit', - value: 'percentunit', - }, - { - id: 'custom.displayMode', - value: 'gradient-gauge', - }, - { - id: 'max', - value: 1, - }, - { - id: 'min', - value: 0, - }, + table.new('Disk Space Usage') + + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable) + + table.standardOptions.withUnit('decbytes') + + table.standardOptions.thresholds.withSteps( + [ + tableStep.withColor('green'), + tableStep.withColor('yellow') + gaugeStep.withValue(0.8), + tableStep.withColor('red') + gaugeStep.withValue(0.9), ] ) - + { span: 6 } - + { - transformations: [ + + table.queryOptions.withTargets([ + prometheus.new( + '$datasource', + ||| + max by (mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(fsSelector)s, 
%(fsMountpointSelector)s}) + ||| % config + ) + + prometheus.withLegendFormat('') + + prometheus.withInstant() + + prometheus.withFormat('table'), + prometheus.new( + '$datasource', + ||| + max by (mountpoint) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(fsSelector)s, %(fsMountpointSelector)s}) + ||| % config + ) + + prometheus.withLegendFormat('') + + prometheus.withInstant() + + prometheus.withFormat('table'), + ]) + + table.standardOptions.withOverrides([ + tableOverride.byName.new('Mounted on') + + tableOverride.byName.withProperty('custom.width', 260), + tableOverride.byName.new('Size') + + tableOverride.byName.withProperty('custom.width', 93), + tableOverride.byName.new('Used') + + tableOverride.byName.withProperty('custom.width', 72), + tableOverride.byName.new('Available') + + tableOverride.byName.withProperty('custom.width', 88), + tableOverride.byName.new('Used, %') + + tableOverride.byName.withProperty('unit', 'percentunit') + + tableOverride.byName.withPropertiesFromOptions( + table.fieldConfig.defaults.custom.withCellOptions( + { type: 'gauge' }, + ) + ) + + tableOverride.byName.withProperty('max', 1) + + tableOverride.byName.withProperty('min', 0), + ]) + + table.queryOptions.withTransformations([ + tableTransformation.withId('groupBy') + + tableTransformation.withOptions( { - id: 'groupBy', - options: { - fields: { - 'Value #A': { - aggregations: [ - 'lastNotNull', - ], - operation: 'aggregate', - }, - 'Value #B': { - aggregations: [ - 'lastNotNull', - ], - operation: 'aggregate', - }, - mountpoint: { - aggregations: [], - operation: 'groupby', - }, + fields: { + 'Value #A': { + aggregations: [ + 'lastNotNull', + ], + operation: 'aggregate', + }, + 'Value #B': { + aggregations: [ + 'lastNotNull', + ], + operation: 'aggregate', + }, + mountpoint: { + aggregations: [], + operation: 'groupby', }, }, - }, + } + ), + tableTransformation.withId('merge'), + tableTransformation.withId('calculateField') + + tableTransformation.withOptions( { - id: 'merge', - options: {}, - }, - { - id: 'calculateField', - options: { - alias: 'Used', - binary: { - left: 'Value #A (lastNotNull)', - operator: '-', - reducer: 'sum', - right: 'Value #B (lastNotNull)', - }, - mode: 'binary', - reduce: { - reducer: 'sum', - }, + alias: 'Used', + binary: { + left: 'Value #A (lastNotNull)', + operator: '-', + reducer: 'sum', + right: 'Value #B (lastNotNull)', }, - }, - { - id: 'calculateField', - options: { - alias: 'Used, %', - binary: { - left: 'Used', - operator: '/', - reducer: 'sum', - right: 'Value #A (lastNotNull)', - }, - mode: 'binary', - reduce: { - reducer: 'sum', - }, + mode: 'binary', + reduce: { + reducer: 'sum', }, - }, + } + ), + tableTransformation.withId('calculateField') + + tableTransformation.withOptions( { - id: 'organize', - options: { - excludeByName: {}, - indexByName: {}, - renameByName: { - 'Value #A (lastNotNull)': 'Size', - 'Value #B (lastNotNull)': 'Available', - mountpoint: 'Mounted on', + alias: 'Used, %', + binary: { + left: 'Used', + operator: '/', + reducer: 'sum', + right: 'Value #A (lastNotNull)', + }, + mode: 'binary', + reduce: { + reducer: 'sum', + }, + } + ), + tableTransformation.withId('organize') + + tableTransformation.withOptions( + { + excludeByName: {}, + indexByName: {}, + renameByName: { + 'Value #A (lastNotNull)': 'Size', + 'Value #B (lastNotNull)': 'Available', + mountpoint: 'Mounted on', + }, + } + ), + tableTransformation.withId('sortBy') + + tableTransformation.withOptions( + { + fields: {}, + 
sort: [ + { + field: 'Mounted on', }, - }, - }, - { - id: 'sortBy', - options: { - fields: {}, - sort: [ - { - field: 'Mounted on', - }, - ], - }, - }, - ], - }, + ], + } + ), + ]), local networkReceived = - graphPanel.new( - 'Network Received', - description='Network received (bits/s)', - datasource='$datasource', - span=6, - format='bps', - min=0, - fill=0, - ) - .addTarget(prometheus.target( - 'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", device!="lo"}[$__rate_interval]) * 8' % config, - legendFormat='{{device}}', - intervalFactor=1, - )), + timeSeriesPanel.new('Network Received') + + timeSeriesPanel.panelOptions.withDescription('Network received (bits/s)') + + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable) + + tsStandardOptions.withUnit('bps') + + tsStandardOptions.withMin(0) + + tsCustom.withFillOpacity(0) + + tsCustom.withShowPoints('never') + + tsOptions.tooltip.withMode('multi') + + tsQueryOptions.withTargets([ + prometheus.new('$datasource', 'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", device!="lo"}[$__rate_interval]) * 8' % config) + + prometheus.withLegendFormat('{{device}}') + + prometheus.withIntervalFactor(1), + ]), local networkTransmitted = - graphPanel.new( - 'Network Transmitted', - description='Network transmitted (bits/s)', - datasource='$datasource', - span=6, - format='bps', - min=0, - fill=0, - ) - .addTarget(prometheus.target( - 'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", device!="lo"}[$__rate_interval]) * 8' % config, - legendFormat='{{device}}', - intervalFactor=1, - )), + timeSeriesPanel.new('Network Transmitted') + + timeSeriesPanel.panelOptions.withDescription('Network transmitted (bits/s)') + + variable.query.withDatasourceFromVariable(prometheusDatasourceVariable) + + tsStandardOptions.withUnit('bps') + + tsStandardOptions.withMin(0) + + tsCustom.withFillOpacity(0) + + tsOptions.tooltip.withMode('multi') + + tsQueryOptions.withTargets([ + prometheus.new('$datasource', 'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", device!="lo"}[$__rate_interval]) * 8' % config) + + prometheus.withLegendFormat('{{device}}') + + prometheus.withIntervalFactor(1), + ]), local cpuRow = row.new('CPU') - .addPanel(idleCPU) - .addPanel(systemLoad), + + row.withPanels([ + idleCPU, + systemLoad, + ]), - local memoryRow = - row.new('Memory') - .addPanel(memoryGraph) - .addPanel(memoryGauge), + local memoryRow = [ + row.new('Memory') + row.gridPos.withY(8), + memoryGraph + row.gridPos.withX(0) + row.gridPos.withY(9) + row.gridPos.withH(7) + row.gridPos.withW(18), + memoryGauge + row.gridPos.withX(18) + row.gridPos.withY(9) + row.gridPos.withH(7) + row.gridPos.withW(6), + ], local diskRow = row.new('Disk') - .addPanel(diskIO) - .addPanel(diskSpaceUsage), + + row.withPanels([ + diskIO, + diskSpaceUsage, + ]), local networkRow = row.new('Network') - .addPanel(networkReceived) - .addPanel(networkTransmitted), + + row.withPanels([ + networkReceived, + networkTransmitted, + ]), - local rows = - [ + local panels = + grafana.util.grid.makeGrid([ cpuRow, - memoryRow, + ], panelWidth=12, panelHeight=7) + + memoryRow + + grafana.util.grid.makeGrid([ diskRow, networkRow, - ], + ], panelWidth=12, panelHeight=7, startY=18), - local templates = + local variables = [ - prometheusDatasourceTemplate, - 
clusterTemplate, - instanceTemplate, + prometheusDatasourceVariable, + clusterVariable, + instanceVariable, ], - dashboard: if platform == 'Linux' then dashboard.new( '%sNodes' % config.dashboardNamePrefix, - time_from='now-1h', - tags=(config.dashboardTags), - timezone='utc', - refresh='30s', - uid=std.md5(uid), - graphTooltip='shared_crosshair' ) - .addTemplates(templates) - .addRows(rows) + + dashboard.time.withFrom('now-1h') + + dashboard.withTags(config.dashboardTags) + + dashboard.withTimezone('utc') + + dashboard.withRefresh('30s') + + dashboard.withUid(std.md5(uid)) + + dashboard.graphTooltip.withSharedCrosshair() + + dashboard.withVariables(variables) + + dashboard.withPanels(panels) else if platform == 'Darwin' then dashboard.new( '%sMacOS' % config.dashboardNamePrefix, - time_from='now-1h', - tags=(config.dashboardTags), - timezone='utc', - refresh='30s', - uid=std.md5(uid), - graphTooltip='shared_crosshair' ) - .addTemplates(templates) - .addRows(rows) + + dashboard.time.withFrom('now-1h') + + dashboard.withTags(config.dashboardTags) + + dashboard.withTimezone('utc') + + dashboard.withRefresh('30s') + + dashboard.withUid(std.md5(uid)) + + dashboard.graphTooltip.withSharedCrosshair() + + dashboard.withVariables(variables) + + dashboard.withPanels(panels) else if platform == 'AIX' then dashboard.new( '%sAIX' % config.dashboardNamePrefix, - time_from='now-1h', - tags=(config.dashboardTags), - timezone='utc', - refresh='30s', - uid=std.md5(uid), - graphTooltip='shared_crosshair' ) - .addTemplates(templates) - .addRows(rows), + + dashboard.time.withFrom('now-1h') + + dashboard.withTags(config.dashboardTags) + + dashboard.withTimezone('utc') + + dashboard.withRefresh('30s') + + dashboard.withUid(std.md5(uid)) + + dashboard.graphTooltip.withSharedCrosshair() + + dashboard.withVariables(variables) + + dashboard.withPanels(panels), }, } From 2fccdf4e17cd768052948b93445b67f468d1292d Mon Sep 17 00:00:00 2001 From: Duologic Date: Mon, 12 Sep 2022 15:11:50 +0200 Subject: [PATCH 08/18] fix(docs): add node(Warning|Critical)WindowHours to node-mixin Signed-off-by: Duologic --- docs/node-mixin/alerts/alerts.libsonnet | 16 ++++++++-------- docs/node-mixin/config.libsonnet | 10 ++++++++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 80b73436..1679de72 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -10,7 +10,7 @@ ( node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceFillingUpWarningThreshold)d and - predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[%(fsSpaceFillingUpPredictionWindow)s], 24*60*60) < 0 + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[%(fsSpaceFillingUpPredictionWindow)s], %(nodeWarningWindowHours)s*60*60) < 0 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) @@ -20,7 +20,7 @@ severity: 'warning', }, annotations: { - summary: 'Filesystem is predicted to run out of space within the next 24 hours.', + summary: 'Filesystem is predicted to run out of space within the next %(nodeWarningWindowHours)s hours.' 
% $._config, description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.', }, }, @@ -30,7 +30,7 @@ ( node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceFillingUpCriticalThreshold)d and - predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 4*60*60) < 0 + predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], %(nodeCriticalWindowHours)s*60*60) < 0 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) @@ -40,7 +40,7 @@ severity: '%(nodeCriticalSeverity)s' % $._config, }, annotations: { - summary: 'Filesystem is predicted to run out of space within the next 4 hours.', + summary: 'Filesystem is predicted to run out of space within the next %(nodeCriticalWindowHours)s hours.' % $._config, description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.', }, }, @@ -86,7 +86,7 @@ ( node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 40 and - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 24*60*60) < 0 + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], %(nodeWarningWindowHours)s*60*60) < 0 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) @@ -96,7 +96,7 @@ severity: 'warning', }, annotations: { - summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.', + summary: 'Filesystem is predicted to run out of inodes within the next %(nodeWarningWindowHours)s hours.' % $._config, description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.', }, }, @@ -106,7 +106,7 @@ ( node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 20 and - predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 4*60*60) < 0 + predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], %(nodeCriticalWindowHours)s*60*60) < 0 and node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0 ) @@ -116,7 +116,7 @@ severity: '%(nodeCriticalSeverity)s' % $._config, }, annotations: { - summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.', + summary: 'Filesystem is predicted to run out of inodes within the next %(nodeCriticalWindowHours)s hours.' 
% $._config, description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.', }, }, diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet index df2af23f..1a4b3caa 100644 --- a/docs/node-mixin/config.libsonnet +++ b/docs/node-mixin/config.libsonnet @@ -50,6 +50,16 @@ // 'NodeSystemSaturation' alert. systemSaturationPerCoreThreshold: 2, + // Some of the alerts use predict_linear() to fire alerts ahead of time to + // prevent unrecoverable situations (eg. no more disk space). However, the + // node may have automatic processes (cronjobs) in place to prevent that + // within a certain time window, this may not align with the default time + // window of these alerts. This can cause these alerts to start flapping. + // By reducing the time window, the system gets more time to + // resolve this before problems occur. + nodeWarningWindowHours: '24', + nodeCriticalWindowHours: '4', + // Available disk space (%) thresholds on which to trigger the // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk // usage grows in a way that it is predicted to run out in 4h or 1d From 505363a67d3d1bdeb76f101bdc91232d288374a6 Mon Sep 17 00:00:00 2001 From: liyang Date: Wed, 11 Dec 2024 11:39:59 +0800 Subject: [PATCH 09/18] chore: add instance label in NodeHighNumberConntrackEntriesUsed alert description Signed-off-by: liyang --- docs/node-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet index 1679de72..dc93c9a8 100644 --- a/docs/node-mixin/alerts/alerts.libsonnet +++ b/docs/node-mixin/alerts/alerts.libsonnet @@ -191,7 +191,7 @@ ||| % $._config, annotations: { summary: 'Number of conntrack are getting close to the limit.', - description: '{{ $value | humanizePercentage }} of conntrack entries are used.', + description: '{{ $labels.instance }} {{ $value | humanizePercentage }} of conntrack entries are used.', }, labels: { severity: 'warning', From 2b5cef35617ed04a307f0a7a39fd99e3039a1b06 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Mon, 23 Sep 2024 03:44:34 +0530 Subject: [PATCH 10/18] fix: address `master` issues Sync dependencies and log using the machinery introduced in #3097. 
Signed-off-by: Pranshu Srivastava --- go.mod | 2 ++ go.sum | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/go.mod b/go.mod index 7d4f8471..cb870e01 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( github.com/coreos/go-systemd/v22 v22.5.0 github.com/dennwc/btrfs v0.0.0-20240418142341-0167142bde7a github.com/ema/qdisc v1.0.0 + github.com/go-kit/log v0.2.1 github.com/godbus/dbus/v5 v5.1.0 github.com/hashicorp/go-envparse v0.1.0 github.com/hodgesds/perf-utils v0.7.0 @@ -38,6 +39,7 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/dennwc/ioctl v1.0.0 // indirect + github.com/go-logfmt/logfmt v0.5.1 // indirect github.com/google/go-cmp v0.6.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect github.com/klauspost/compress v1.17.9 // indirect diff --git a/go.sum b/go.sum index 5787f76d..2dbc00c7 100644 --- a/go.sum +++ b/go.sum @@ -21,6 +21,10 @@ github.com/dennwc/ioctl v1.0.0 h1:DsWAAjIxRqNcLn9x6mwfuf2pet3iB7aK90K4tF16rLg= github.com/dennwc/ioctl v1.0.0/go.mod h1:ellh2YB5ldny99SBU/VX7Nq0xiZbHphf1DrtHxxjMk0= github.com/ema/qdisc v1.0.0 h1:EHLG08FVRbWLg8uRICa3xzC9Zm0m7HyMHfXobWFnXYg= github.com/ema/qdisc v1.0.0/go.mod h1:FhIc0fLYi7f+lK5maMsesDqwYojIOh3VfRs8EVd5YJQ= +github.com/go-kit/log v0.2.1 h1:MRVx0/zhvdseW+Gza6N9rVzU/IVzaeE1SFI4raAhmBU= +github.com/go-kit/log v0.2.1/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0= +github.com/go-logfmt/logfmt v0.5.1 h1:otpy5pqBCBZ1ng9RQ0dPu4PN7ba75Y/aA+UpowDyNVA= +github.com/go-logfmt/logfmt v0.5.1/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= From b8aac7c92e2d7f3dea56b2d4802d4ef3ee29c3cc Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Thu, 30 May 2024 13:00:31 +0530 Subject: [PATCH 11/18] collector/cpu: Support CPU online status Blocked by: https://github.com/prometheus/procfs/pull/644. Signed-off-by: Pranshu Srivastava --- collector/cpu_linux.go | 61 +++++++++++++++++++++++++++++++++++------- go.mod | 4 +-- go.sum | 8 ++---- 3 files changed, 54 insertions(+), 19 deletions(-) diff --git a/collector/cpu_linux.go b/collector/cpu_linux.go index 1ee7b94d..8ca70365 100644 --- a/collector/cpu_linux.go +++ b/collector/cpu_linux.go @@ -17,6 +17,7 @@ package collector import ( + "errors" "fmt" "log/slog" "os" @@ -26,15 +27,17 @@ import ( "strconv" "sync" + "golang.org/x/exp/maps" + "github.com/alecthomas/kingpin/v2" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/procfs" "github.com/prometheus/procfs/sysfs" - "golang.org/x/exp/maps" ) type cpuCollector struct { - fs procfs.FS + procfs procfs.FS + sysfs sysfs.FS cpu *prometheus.Desc cpuInfo *prometheus.Desc cpuFrequencyHz *prometheus.Desc @@ -45,6 +48,7 @@ type cpuCollector struct { cpuPackageThrottle *prometheus.Desc cpuIsolated *prometheus.Desc logger *slog.Logger + cpuOnline *prometheus.Desc cpuStats map[int64]procfs.CPUStat cpuStatsMutex sync.Mutex isolatedCpus []uint16 @@ -70,17 +74,17 @@ func init() { // NewCPUCollector returns a new Collector exposing kernel/system statistics. 
func NewCPUCollector(logger *slog.Logger) (Collector, error) { - fs, err := procfs.NewFS(*procPath) + pfs, err := procfs.NewFS(*procPath) if err != nil { return nil, fmt.Errorf("failed to open procfs: %w", err) } - sysfs, err := sysfs.NewFS(*sysPath) + sfs, err := sysfs.NewFS(*sysPath) if err != nil { return nil, fmt.Errorf("failed to open sysfs: %w", err) } - isolcpus, err := sysfs.IsolatedCPUs() + isolcpus, err := sfs.IsolatedCPUs() if err != nil { if !os.IsNotExist(err) { return nil, fmt.Errorf("Unable to get isolated cpus: %w", err) @@ -89,8 +93,9 @@ func NewCPUCollector(logger *slog.Logger) (Collector, error) { } c := &cpuCollector{ - fs: fs, - cpu: nodeCPUSecondsDesc, + procfs: pfs, + sysfs: sfs, + cpu: nodeCPUSecondsDesc, cpuInfo: prometheus.NewDesc( prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "info"), "CPU information from /proc/cpuinfo.", @@ -131,6 +136,11 @@ func NewCPUCollector(logger *slog.Logger) (Collector, error) { "Whether each core is isolated, information from /sys/devices/system/cpu/isolated.", []string{"cpu"}, nil, ), + cpuOnline: prometheus.NewDesc( + prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "online"), + "CPUs that are online and being scheduled.", + []string{"cpu"}, nil, + ), logger: logger, isolatedCpus: isolcpus, cpuStats: make(map[int64]procfs.CPUStat), @@ -177,12 +187,21 @@ func (c *cpuCollector) Update(ch chan<- prometheus.Metric) error { if c.isolatedCpus != nil { c.updateIsolated(ch) } - return c.updateThermalThrottle(ch) + err := c.updateThermalThrottle(ch) + if err != nil { + return err + } + err = c.updateOnline(ch) + if err != nil { + return err + } + + return nil } // updateInfo reads /proc/cpuinfo func (c *cpuCollector) updateInfo(ch chan<- prometheus.Metric) error { - info, err := c.fs.CPUInfo() + info, err := c.procfs.CPUInfo() if err != nil { return err } @@ -333,9 +352,31 @@ func (c *cpuCollector) updateIsolated(ch chan<- prometheus.Metric) { } } +// updateOnline reads /sys/devices/system/cpu/cpu*/online through sysfs and exports online status metrics. +func (c *cpuCollector) updateOnline(ch chan<- prometheus.Metric) error { + cpus, err := c.sysfs.CPUs() + if err != nil { + return err + } + // No-op if the system does not support CPU online stats. + cpu0 := cpus[0] + if _, err := cpu0.Online(); err != nil && errors.Is(err, os.ErrNotExist) { + return nil + } + for _, cpu := range cpus { + setOnline := float64(0) + if online, _ := cpu.Online(); online { + setOnline = 1 + } + ch <- prometheus.MustNewConstMetric(c.cpuOnline, prometheus.GaugeValue, setOnline, cpu.Number()) + } + + return nil +} + // updateStat reads /proc/stat through procfs and exports CPU-related metrics. 
func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error { - stats, err := c.fs.Stat() + stats, err := c.procfs.Stat() if err != nil { return err } diff --git a/go.mod b/go.mod index cb870e01..4d120b7f 100644 --- a/go.mod +++ b/go.mod @@ -8,7 +8,6 @@ require ( github.com/coreos/go-systemd/v22 v22.5.0 github.com/dennwc/btrfs v0.0.0-20240418142341-0167142bde7a github.com/ema/qdisc v1.0.0 - github.com/go-kit/log v0.2.1 github.com/godbus/dbus/v5 v5.1.0 github.com/hashicorp/go-envparse v0.1.0 github.com/hodgesds/perf-utils v0.7.0 @@ -27,7 +26,7 @@ require ( github.com/prometheus/client_model v0.6.1 github.com/prometheus/common v0.61.0 github.com/prometheus/exporter-toolkit v0.13.2 - github.com/prometheus/procfs v0.15.1 + github.com/prometheus/procfs v0.15.2-0.20240603130017-1754b780536b // == v0.15.1 + https://github.com/prometheus/procfs/commit/1754b780536bb81082baa913e04cc4fff4d2baea github.com/safchain/ethtool v0.5.9 golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 golang.org/x/sys v0.28.0 @@ -39,7 +38,6 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/dennwc/ioctl v1.0.0 // indirect - github.com/go-logfmt/logfmt v0.5.1 // indirect github.com/google/go-cmp v0.6.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect github.com/klauspost/compress v1.17.9 // indirect diff --git a/go.sum b/go.sum index 2dbc00c7..7bc8a084 100644 --- a/go.sum +++ b/go.sum @@ -21,10 +21,6 @@ github.com/dennwc/ioctl v1.0.0 h1:DsWAAjIxRqNcLn9x6mwfuf2pet3iB7aK90K4tF16rLg= github.com/dennwc/ioctl v1.0.0/go.mod h1:ellh2YB5ldny99SBU/VX7Nq0xiZbHphf1DrtHxxjMk0= github.com/ema/qdisc v1.0.0 h1:EHLG08FVRbWLg8uRICa3xzC9Zm0m7HyMHfXobWFnXYg= github.com/ema/qdisc v1.0.0/go.mod h1:FhIc0fLYi7f+lK5maMsesDqwYojIOh3VfRs8EVd5YJQ= -github.com/go-kit/log v0.2.1 h1:MRVx0/zhvdseW+Gza6N9rVzU/IVzaeE1SFI4raAhmBU= -github.com/go-kit/log v0.2.1/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0= -github.com/go-logfmt/logfmt v0.5.1 h1:otpy5pqBCBZ1ng9RQ0dPu4PN7ba75Y/aA+UpowDyNVA= -github.com/go-logfmt/logfmt v0.5.1/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= @@ -87,8 +83,8 @@ github.com/prometheus/common v0.61.0 h1:3gv/GThfX0cV2lpO7gkTUwZru38mxevy90Bj8YFS github.com/prometheus/common v0.61.0/go.mod h1:zr29OCN/2BsJRaFwG8QOBr41D6kkchKbpeNH7pAjb/s= github.com/prometheus/exporter-toolkit v0.13.2 h1:Z02fYtbqTMy2i/f+xZ+UK5jy/bl1Ex3ndzh06T/Q9DQ= github.com/prometheus/exporter-toolkit v0.13.2/go.mod h1:tCqnfx21q6qN1KA4U3Bfb8uWzXfijIrJz3/kTIqMV7g= -github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= -github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/prometheus/procfs v0.15.2-0.20240603130017-1754b780536b h1:4EJkx3vycI+n5JY5ht+bnSUGamkmmXkpcNeO/OBT/0A= +github.com/prometheus/procfs v0.15.2-0.20240603130017-1754b780536b/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/safchain/ethtool v0.5.9 h1://6RvaOKFf3nQ0rl5+8zBbE4/72455VC9Jq61pfq67E= From c3c8645923e821de3fbe0f040708310163388cc1 Mon Sep 17 00:00:00 2001 From: 
PrometheusBot Date: Sun, 29 Dec 2024 19:18:49 +0100 Subject: [PATCH 12/18] Update common Prometheus files (#3218) Signed-off-by: prombot --- .github/workflows/golangci-lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index e645ba30..01b943b9 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -26,7 +26,7 @@ jobs: - name: Checkout repository uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Install Go - uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed # v5.1.0 + uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0 with: go-version: 1.23.x - name: Install snmp_exporter/generator dependencies From 3ed9edfde7008b8e180354eb346c573bcc79cf39 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 1 Jan 2025 18:08:28 +0100 Subject: [PATCH 13/18] build(deps): bump github.com/mdlayher/wifi from 0.3.0 to 0.3.1 (#3219) Bumps [github.com/mdlayher/wifi](https://github.com/mdlayher/wifi) from 0.3.0 to 0.3.1. - [Release notes](https://github.com/mdlayher/wifi/releases) - [Commits](https://github.com/mdlayher/wifi/compare/v0.3.0...v0.3.1) --- updated-dependencies: - dependency-name: github.com/mdlayher/wifi dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 4d120b7f..2fc10c0f 100644 --- a/go.mod +++ b/go.mod @@ -18,7 +18,7 @@ require ( github.com/mattn/go-xmlrpc v0.0.3 github.com/mdlayher/ethtool v0.2.0 github.com/mdlayher/netlink v1.7.2 - github.com/mdlayher/wifi v0.3.0 + github.com/mdlayher/wifi v0.3.1 github.com/opencontainers/selinux v1.11.1 github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 github.com/prometheus-community/go-runit v0.1.0 diff --git a/go.sum b/go.sum index 7bc8a084..919dfa3e 100644 --- a/go.sum +++ b/go.sum @@ -61,8 +61,8 @@ github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA= github.com/mdlayher/vsock v1.2.1 h1:pC1mTJTvjo1r9n9fbm7S1j04rCgCzhCOS5DY0zqHlnQ= github.com/mdlayher/vsock v1.2.1/go.mod h1:NRfCibel++DgeMD8z/hP+PPTjlNJsdPOmxcnENvE+SE= -github.com/mdlayher/wifi v0.3.0 h1:ZfS81w/7xTWBJfhM77K0k6m3sJckwoNOoZUwOW34omo= -github.com/mdlayher/wifi v0.3.0/go.mod h1:/bdkqKYl+lD4recmQM6bTHxMrEUW70reibTyr93CAd0= +github.com/mdlayher/wifi v0.3.1 h1:bZDuMI1f7z5BtUUO3NgHRdR/R88YtywIe6dsEFI0Txs= +github.com/mdlayher/wifi v0.3.1/go.mod h1:ODQaObvsglghTuNhezD9grkTB4shVNc28aJfTXmvSi8= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= From 6f3c3456325f47a1407c9e66f99d379cfd5dd036 Mon Sep 17 00:00:00 2001 From: Xuhui Zhu Date: Thu, 22 Aug 2024 16:42:48 -0400 Subject: [PATCH 14/18] add gpu clk Signed-off-by: Xuhui Zhu --- collector/hwmon_linux.go | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/collector/hwmon_linux.go 
b/collector/hwmon_linux.go index 9c0e065c..f76e7351 100644 --- a/collector/hwmon_linux.go +++ b/collector/hwmon_linux.go @@ -44,7 +44,7 @@ var ( hwmonSensorTypes = []string{ "vrm", "beep_enable", "update_interval", "in", "cpu", "fan", "pwm", "temp", "curr", "power", "energy", "humidity", - "intrusion", + "intrusion", "freq", } ) @@ -357,6 +357,15 @@ func (c *hwMonCollector) updateHwmon(ch chan<- prometheus.Metric, dir string) er continue } + if sensorType == "freq" && element == "input" { + if label, ok := sensorData["label"]; ok { + sensorLabel := cleanMetricName(label) + desc := prometheus.NewDesc("node_hwmon_freq_hertz", "Hardware monitor for GPU frequency in MHz", hwmonLabelDesc, nil) + ch <- prometheus.MustNewConstMetric( + desc, prometheus.GaugeValue, parsedValue/1000000.0, append(labels[:len(labels)-1], sensorLabel)...) + } + continue + } // fallback, just dump the metric as is desc := prometheus.NewDesc(name, "Hardware monitor "+sensorType+" element "+element, hwmonLabelDesc, nil) From 89a21a9c4c70ff6fd411c10420cf0e5c7aadc375 Mon Sep 17 00:00:00 2001 From: Xuhui Zhu Date: Fri, 23 Aug 2024 16:34:23 -0400 Subject: [PATCH 15/18] restruct code Signed-off-by: Xuhui Zhu --- collector/hwmon_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/collector/hwmon_linux.go b/collector/hwmon_linux.go index f76e7351..ad8da41b 100644 --- a/collector/hwmon_linux.go +++ b/collector/hwmon_linux.go @@ -360,7 +360,7 @@ func (c *hwMonCollector) updateHwmon(ch chan<- prometheus.Metric, dir string) er if sensorType == "freq" && element == "input" { if label, ok := sensorData["label"]; ok { sensorLabel := cleanMetricName(label) - desc := prometheus.NewDesc("node_hwmon_freq_hertz", "Hardware monitor for GPU frequency in MHz", hwmonLabelDesc, nil) + desc := prometheus.NewDesc(name+"_freq_mhz", "Hardware monitor for GPU frequency in MHz", hwmonLabelDesc, nil) ch <- prometheus.MustNewConstMetric( desc, prometheus.GaugeValue, parsedValue/1000000.0, append(labels[:len(labels)-1], sensorLabel)...) 
} From 0c1af8dfe009e079e71cb1ce63e2756ba067d941 Mon Sep 17 00:00:00 2001 From: Xuhui Zhu Date: Tue, 27 Aug 2024 16:48:32 -0400 Subject: [PATCH 16/18] add test Signed-off-by: Xuhui Zhu --- collector/fixtures/e2e-output.txt | 10 ++++++++-- collector/fixtures/sys.ttar | 26 +++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index a80f6b63..eb6be483 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -893,6 +893,10 @@ node_hwmon_fan_target_rpm{chip="nct6779",sensor="fan2"} 27000 # HELP node_hwmon_fan_tolerance Hardware monitor fan element tolerance # TYPE node_hwmon_fan_tolerance gauge node_hwmon_fan_tolerance{chip="nct6779",sensor="fan2"} 0 +# HELP node_hwmon_freq_freq_mhz Hardware monitor for GPU frequency in MHz +# TYPE node_hwmon_freq_freq_mhz gauge +node_hwmon_freq_freq_mhz{chip="hwmon4",sensor="mclk"} 300 +node_hwmon_freq_freq_mhz{chip="hwmon4",sensor="sclk"} 214 # HELP node_hwmon_in_alarm Hardware sensor alarm status (in) # TYPE node_hwmon_in_alarm gauge node_hwmon_in_alarm{chip="nct6779",sensor="in0"} 0 @@ -1006,8 +1010,10 @@ node_hwmon_pwm_weight_temp_step_tol{chip="nct6779",sensor="pwm1"} 0 # TYPE node_hwmon_sensor_label gauge node_hwmon_sensor_label{chip="hwmon4",label="foosensor",sensor="temp1"} 1 node_hwmon_sensor_label{chip="hwmon4",label="foosensor",sensor="temp2"} 1 -node_hwmon_sensor_label{chip="platform_applesmc_768",label="Left side ",sensor="fan1"} 1 -node_hwmon_sensor_label{chip="platform_applesmc_768",label="Right side ",sensor="fan2"} 1 +node_hwmon_sensor_label{chip="hwmon4",label="mclk",sensor="freq2"} 1 +node_hwmon_sensor_label{chip="hwmon4",label="sclk",sensor="freq1"} 1 +node_hwmon_sensor_label{chip="platform_applesmc_768",label="Left side",sensor="fan1"} 1 +node_hwmon_sensor_label{chip="platform_applesmc_768",label="Right side",sensor="fan2"} 1 node_hwmon_sensor_label{chip="platform_coretemp_0",label="Core 0",sensor="temp2"} 1 node_hwmon_sensor_label{chip="platform_coretemp_0",label="Core 1",sensor="temp3"} 1 node_hwmon_sensor_label{chip="platform_coretemp_0",label="Core 2",sensor="temp4"} 1 diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index 6fcf094d..7b2f27b8 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -437,6 +437,26 @@ Lines: 1 100000 Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/hwmon/hwmon4/freq1_input +Lines: 1 +214000000 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/hwmon/hwmon4/freq1_label +Lines: 1 +sclk +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/hwmon/hwmon4/freq2_input +Lines: 1 +300000000 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/class/hwmon/hwmon4/freq2_label +Lines: 1 +mclk +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/class/hwmon/hwmon5 SymlinkTo: ../../devices/platform/bogus.0/hwmon/hwmon5/ # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1337,7 +1357,7 @@ Mode: 444 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/class/nvme/nvme0/model Lines: 1 -Samsung SSD 970 PRO 512GB +Samsung SSD 970 PRO 512GB Mode: 444 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - Path: sys/class/nvme/nvme0/serial @@ -2750,7 +2770,7 @@ Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/platform/applesmc.768/fan1_label Lines: 1 -Left side +Left side Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/platform/applesmc.768/fan1_manual @@ -2784,7 +2804,7 @@ Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/platform/applesmc.768/fan2_label Lines: 1 -Right side +Right side Mode: 644 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Path: sys/devices/platform/applesmc.768/fan2_manual From 8f9a914bee023a614f4eea1005baf2e112bbadb0 Mon Sep 17 00:00:00 2001 From: Xuhui Zhu Date: Tue, 27 Aug 2024 16:59:20 -0400 Subject: [PATCH 17/18] add arm test Signed-off-by: Xuhui Zhu --- collector/fixtures/e2e-64k-page-output.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/collector/fixtures/e2e-64k-page-output.txt b/collector/fixtures/e2e-64k-page-output.txt index 605149da..55583bad 100644 --- a/collector/fixtures/e2e-64k-page-output.txt +++ b/collector/fixtures/e2e-64k-page-output.txt @@ -871,6 +871,10 @@ node_hwmon_fan_target_rpm{chip="nct6779",sensor="fan2"} 27000 # HELP node_hwmon_fan_tolerance Hardware monitor fan element tolerance # TYPE node_hwmon_fan_tolerance gauge node_hwmon_fan_tolerance{chip="nct6779",sensor="fan2"} 0 +# HELP node_hwmon_freq_freq_mhz Hardware monitor for GPU frequency in MHz +# TYPE node_hwmon_freq_freq_mhz gauge +node_hwmon_freq_freq_mhz{chip="hwmon4",sensor="mclk"} 300 +node_hwmon_freq_freq_mhz{chip="hwmon4",sensor="sclk"} 214 # HELP node_hwmon_in_alarm Hardware sensor alarm status (in) # TYPE node_hwmon_in_alarm gauge node_hwmon_in_alarm{chip="nct6779",sensor="in0"} 0 @@ -984,8 +988,10 @@ node_hwmon_pwm_weight_temp_step_tol{chip="nct6779",sensor="pwm1"} 0 # TYPE node_hwmon_sensor_label gauge node_hwmon_sensor_label{chip="hwmon4",label="foosensor",sensor="temp1"} 1 node_hwmon_sensor_label{chip="hwmon4",label="foosensor",sensor="temp2"} 1 -node_hwmon_sensor_label{chip="platform_applesmc_768",label="Left side ",sensor="fan1"} 1 -node_hwmon_sensor_label{chip="platform_applesmc_768",label="Right side ",sensor="fan2"} 1 +node_hwmon_sensor_label{chip="hwmon4",label="mclk",sensor="freq2"} 1 +node_hwmon_sensor_label{chip="hwmon4",label="sclk",sensor="freq1"} 1 +node_hwmon_sensor_label{chip="platform_applesmc_768",label="Left side",sensor="fan1"} 1 +node_hwmon_sensor_label{chip="platform_applesmc_768",label="Right side",sensor="fan2"} 1 node_hwmon_sensor_label{chip="platform_coretemp_0",label="Core 0",sensor="temp2"} 1 node_hwmon_sensor_label{chip="platform_coretemp_0",label="Core 1",sensor="temp3"} 1 node_hwmon_sensor_label{chip="platform_coretemp_0",label="Core 2",sensor="temp4"} 1 From acdd9b813d71b856fe34df97caa1ed90c8587692 Mon Sep 17 00:00:00 2001 From: Daniel Swarbrick Date: Fri, 10 Jan 2025 10:58:08 +0100 Subject: [PATCH 18/18] arp: optimize netlink interface name resolution (#3133) github.com/jsimonetti/rtnetlink provides a high level rtnl wrapper around the lower level rtnetlink functions, which essentially does all that we need. The rtnl.Conn.Neighbors uses an internal cache for resolving interface indexes to names, so it makes at most one rtnetlink call per interface to resolve the name. Using this high level wrapper hugely simplifies our code and makes it easier to understand and maintain. 
Fixes: #3075 Signed-off-by: Daniel Swarbrick --- collector/arp_linux.go | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/collector/arp_linux.go b/collector/arp_linux.go index 7f417d82..e1ba52c8 100644 --- a/collector/arp_linux.go +++ b/collector/arp_linux.go @@ -17,13 +17,11 @@ package collector import ( - "errors" "fmt" "log/slog" - "net" "github.com/alecthomas/kingpin/v2" - "github.com/jsimonetti/rtnetlink/v2" + "github.com/jsimonetti/rtnetlink/v2/rtnl" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/procfs" "golang.org/x/sys/unix" @@ -76,44 +74,30 @@ func getTotalArpEntries(deviceEntries []procfs.ARPEntry) map[string]uint32 { } func getTotalArpEntriesRTNL() (map[string]uint32, error) { - conn, err := rtnetlink.Dial(nil) + conn, err := rtnl.Dial(nil) if err != nil { return nil, err } defer conn.Close() - neighbors, err := conn.Neigh.List() + // Neighbors will also contain IPv6 neighbors, but since this is purely an ARP collector, + // restrict to AF_INET. + neighbors, err := conn.Neighbours(nil, unix.AF_INET) if err != nil { return nil, err } - ifIndexEntries := make(map[uint32]uint32) + // Map of interface name to ARP neighbor count. + entries := make(map[string]uint32) for _, n := range neighbors { - // Neighbors will also contain IPv6 neighbors, but since this is purely an ARP collector, - // restrict to AF_INET. Also skip entries which have state NUD_NOARP to conform to output - // of /proc/net/arp. - if n.Family == unix.AF_INET && n.State&unix.NUD_NOARP == 0 { - ifIndexEntries[n.Index]++ + // Skip entries which have state NUD_NOARP to conform to output of /proc/net/arp. + if n.State&unix.NUD_NOARP == 0 { + entries[n.Interface.Name]++ } } - enumEntries := make(map[string]uint32) - - // Convert interface indexes to names. - for ifIndex, entryCount := range ifIndexEntries { - iface, err := net.InterfaceByIndex(int(ifIndex)) - if err != nil { - if errors.Unwrap(err).Error() == "no such network interface" { - continue - } - return nil, err - } - - enumEntries[iface.Name] = entryCount - } - - return enumEntries, nil + return entries, nil } func (c *arpCollector) Update(ch chan<- prometheus.Metric) error {