Merge branch 'master' of github.com:prometheus/node_exporter into add-arp-states

Signed-off-by: Emin Umut Gercek <eumutgercek@gmail.com>
Emin Umut Gercek 2025-01-13 20:53:17 +03:00
commit d679000a11
30 changed files with 1176 additions and 1069 deletions

View file

@@ -26,7 +26,7 @@ jobs:
- name: Checkout repository
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Install Go
uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed # v5.1.0
uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0
with:
go-version: 1.23.x
- name: Install snmp_exporter/generator dependencies
@@ -36,4 +36,4 @@ jobs:
uses: golangci/golangci-lint-action@971e284b6050e8a5849b72094c50ab08da042db8 # v6.1.1
with:
args: --verbose
version: v1.61.0
version: v1.62.0

View file

@@ -1,17 +1,9 @@
linters:
enable:
- depguard
- goimports
- misspell
- revive
disable:
# Disable soon-to-be-deprecated[1] linters that lead to false
# positives when build tags disable certain files[2]
# 1: https://github.com/golangci/golangci-lint/issues/1841
# 2: https://github.com/prometheus/node_exporter/issues/1545
- deadcode
- unused
- structcheck
- varcheck
issues:
exclude-rules:

View file

@@ -61,7 +61,7 @@ PROMU_URL := https://github.com/prometheus/promu/releases/download/v$(PROMU_
SKIP_GOLANGCI_LINT :=
GOLANGCI_LINT :=
GOLANGCI_LINT_OPTS ?=
GOLANGCI_LINT_VERSION ?= v1.61.0
GOLANGCI_LINT_VERSION ?= v1.62.0
# golangci-lint only supports linux, darwin and windows platforms on i386/amd64/arm64.
# windows isn't included here because of the path separator being different.
ifeq ($(GOHOSTOS),$(filter $(GOHOSTOS),linux darwin))

View file

@@ -99,8 +99,8 @@ cpu | flags | --collector.cpu.info.flags-include | N/A
diskstats | device | --collector.diskstats.device-include | --collector.diskstats.device-exclude
ethtool | device | --collector.ethtool.device-include | --collector.ethtool.device-exclude
ethtool | metrics | --collector.ethtool.metrics-include | N/A
filesystem | fs-types | N/A | --collector.filesystem.fs-types-exclude
filesystem | mount-points | N/A | --collector.filesystem.mount-points-exclude
filesystem | fs-types | --collector.filesystem.fs-types-include | --collector.filesystem.fs-types-exclude
filesystem | mount-points | --collector.filesystem.mount-points-include | --collector.filesystem.mount-points-exclude
hwmon | chip | --collector.hwmon.chip-include | --collector.hwmon.chip-exclude
hwmon | sensor | --collector.hwmon.sensor-include | --collector.hwmon.sensor-exclude
interrupts | name | --collector.interrupts.name-include | --collector.interrupts.name-exclude
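
With this change the filesystem collector gains include counterparts to both exclude dimensions; the include and exclude flags for a given dimension are mutually exclusive. As an illustrative invocation (the regexp is hypothetical), exporting only ext4 and xfs filesystems would look like: node_exporter --collector.filesystem.fs-types-include="ext4|xfs".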

View file

@@ -17,13 +17,11 @@
package collector
import (
"errors"
"fmt"
"log/slog"
"net"
"github.com/alecthomas/kingpin/v2"
"github.com/jsimonetti/rtnetlink/v2"
"github.com/jsimonetti/rtnetlink/v2/rtnl"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs"
"golang.org/x/sys/unix"
@@ -97,53 +95,41 @@ func getTotalArpEntries(deviceEntries []procfs.ARPEntry) map[string]uint32 {
}
func getArpEntriesRTNL() (map[string]uint32, map[string]map[string]int, error) {
conn, err := rtnetlink.Dial(nil)
conn, err := rtnl.Dial(nil)
if err != nil {
return nil, nil, err
}
defer conn.Close()
neighbors, err := conn.Neigh.List()
// Neighbors will also contain IPv6 neighbors, but since this is purely an ARP collector,
// restrict to AF_INET.
neighbors, err := conn.Neighbours(nil, unix.AF_INET)
if err != nil {
return nil, nil, err
}
ifIndexEntries := make(map[uint32]uint32)
ifIndexStates := make(map[uint32]map[string]int)
// Map of interface name to ARP neighbor count.
entries := make(map[string]uint32)
// Map of interface name to per-state neighbor counts: map[InterfaceName]map[StateName]int.
states := make(map[string]map[string]int)
for _, n := range neighbors {
// Neighbors will also contain IPv6 neighbors, but since this is purely an ARP collector,
// restrict to AF_INET. Also skip entries which have state NUD_NOARP to conform to output
// of /proc/net/arp.
if n.Family == unix.AF_INET && n.State&unix.NUD_NOARP == 0 {
ifIndexEntries[n.Index]++
_, ok := ifIndexStates[n.Index]
if !ok {
ifIndexStates[n.Index] = make(map[string]int)
}
ifIndexStates[n.Index][neighborStatesMap[n.State]]++
}
}
enumEntries := make(map[string]uint32)
enumStates := make(map[string]map[string]int)
// Convert interface indexes to names.
for ifIndex, entryCount := range ifIndexEntries {
iface, err := net.InterfaceByIndex(int(ifIndex))
if err != nil {
if errors.Unwrap(err).Error() == "no such network interface" {
continue
}
return nil, nil, err
// Skip entries which have state NUD_NOARP to conform to output of /proc/net/arp.
if n.State&unix.NUD_NOARP != unix.NUD_NOARP {
continue
}
enumEntries[iface.Name] = entryCount
enumStates[iface.Name] = ifIndexStates[ifIndex]
entries[n.Interface.Name]++
_, ok := states[n.Interface.Name]
if !ok {
states[n.Interface.Name] = make(map[string]int)
}
states[n.Interface.Name][neighborStatesMap[n.State]]++
}
return enumEntries, enumStates, nil
return entries, states, nil
}
func (c *arpCollector) Update(ch chan<- prometheus.Metric) error {
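
Note on the rtnl migration above: the low-level rtnetlink connection, the ifindex bookkeeping, and the manual net.InterfaceByIndex resolution are replaced by the high-level rtnl API, whose neighbour dump already carries a resolved *net.Interface. A minimal standalone sketch of the new pattern, assuming the field shapes implied by the diff (State as a uint16 bitmask, Interface non-nil); the states map is a trimmed stand-in for the collector's neighborStatesMap:

package main

import (
	"fmt"

	"github.com/jsimonetti/rtnetlink/v2/rtnl"
	"golang.org/x/sys/unix"
)

// Trimmed stand-in for the collector's neighborStatesMap.
var states = map[uint16]string{
	unix.NUD_REACHABLE: "reachable",
	unix.NUD_STALE:     "stale",
	unix.NUD_FAILED:    "failed",
}

func main() {
	conn, err := rtnl.Dial(nil)
	if err != nil {
		panic(err)
	}
	defer conn.Close()

	// AF_INET restricts the dump to IPv4, i.e. ARP, neighbours.
	neighbours, err := conn.Neighbours(nil, unix.AF_INET)
	if err != nil {
		panic(err)
	}

	entries := make(map[string]uint32) // interface name -> ARP entry count
	for _, n := range neighbours {
		// Skip NUD_NOARP entries to conform to /proc/net/arp.
		if n.State&unix.NUD_NOARP == unix.NUD_NOARP {
			continue
		}
		entries[n.Interface.Name]++
	}
	fmt.Println(entries)
}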

View file

@@ -17,6 +17,7 @@
package collector
import (
"errors"
"fmt"
"log/slog"
"os"
@@ -26,15 +27,17 @@ import (
"strconv"
"sync"
"golang.org/x/exp/maps"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs"
"github.com/prometheus/procfs/sysfs"
"golang.org/x/exp/maps"
)
type cpuCollector struct {
fs procfs.FS
procfs procfs.FS
sysfs sysfs.FS
cpu *prometheus.Desc
cpuInfo *prometheus.Desc
cpuFrequencyHz *prometheus.Desc
@@ -45,6 +48,7 @@ type cpuCollector struct {
cpuPackageThrottle *prometheus.Desc
cpuIsolated *prometheus.Desc
logger *slog.Logger
cpuOnline *prometheus.Desc
cpuStats map[int64]procfs.CPUStat
cpuStatsMutex sync.Mutex
isolatedCpus []uint16
@@ -70,17 +74,17 @@ func init() {
// NewCPUCollector returns a new Collector exposing kernel/system statistics.
func NewCPUCollector(logger *slog.Logger) (Collector, error) {
fs, err := procfs.NewFS(*procPath)
pfs, err := procfs.NewFS(*procPath)
if err != nil {
return nil, fmt.Errorf("failed to open procfs: %w", err)
}
sysfs, err := sysfs.NewFS(*sysPath)
sfs, err := sysfs.NewFS(*sysPath)
if err != nil {
return nil, fmt.Errorf("failed to open sysfs: %w", err)
}
isolcpus, err := sysfs.IsolatedCPUs()
isolcpus, err := sfs.IsolatedCPUs()
if err != nil {
if !os.IsNotExist(err) {
return nil, fmt.Errorf("Unable to get isolated cpus: %w", err)
@@ -89,8 +93,9 @@ func NewCPUCollector(logger *slog.Logger) (Collector, error) {
}
c := &cpuCollector{
fs: fs,
cpu: nodeCPUSecondsDesc,
procfs: pfs,
sysfs: sfs,
cpu: nodeCPUSecondsDesc,
cpuInfo: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "info"),
"CPU information from /proc/cpuinfo.",
@@ -131,6 +136,11 @@ func NewCPUCollector(logger *slog.Logger) (Collector, error) {
"Whether each core is isolated, information from /sys/devices/system/cpu/isolated.",
[]string{"cpu"}, nil,
),
cpuOnline: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "online"),
"CPUs that are online and being scheduled.",
[]string{"cpu"}, nil,
),
logger: logger,
isolatedCpus: isolcpus,
cpuStats: make(map[int64]procfs.CPUStat),
@@ -177,12 +187,21 @@ func (c *cpuCollector) Update(ch chan<- prometheus.Metric) error {
if c.isolatedCpus != nil {
c.updateIsolated(ch)
}
return c.updateThermalThrottle(ch)
err := c.updateThermalThrottle(ch)
if err != nil {
return err
}
err = c.updateOnline(ch)
if err != nil {
return err
}
return nil
}
// updateInfo reads /proc/cpuinfo
func (c *cpuCollector) updateInfo(ch chan<- prometheus.Metric) error {
info, err := c.fs.CPUInfo()
info, err := c.procfs.CPUInfo()
if err != nil {
return err
}
@@ -333,9 +352,31 @@ func (c *cpuCollector) updateIsolated(ch chan<- prometheus.Metric) {
}
}
// updateOnline reads /sys/devices/system/cpu/cpu*/online through sysfs and exports online status metrics.
func (c *cpuCollector) updateOnline(ch chan<- prometheus.Metric) error {
cpus, err := c.sysfs.CPUs()
if err != nil {
return err
}
// No-op if the system does not support CPU online stats.
cpu0 := cpus[0]
if _, err := cpu0.Online(); err != nil && errors.Is(err, os.ErrNotExist) {
return nil
}
for _, cpu := range cpus {
setOnline := float64(0)
if online, _ := cpu.Online(); online {
setOnline = 1
}
ch <- prometheus.MustNewConstMetric(c.cpuOnline, prometheus.GaugeValue, setOnline, cpu.Number())
}
return nil
}
// updateStat reads /proc/stat through procfs and exports CPU-related metrics.
func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
stats, err := c.fs.Stat()
stats, err := c.procfs.Stat()
if err != nil {
return err
}
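
The new node_cpu_online gauge is backed by /sys/devices/system/cpu/cpuN/online, where each file holds 0 or 1; cpu0 usually has no such file because it typically cannot be hot-unplugged, which is why updateOnline treats a missing file on the first CPU as "feature not supported" and returns early. A minimal sketch of reading the same state without the sysfs helper package:

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

func main() {
	// cpu0 commonly has no online file and simply won't match the glob.
	paths, err := filepath.Glob("/sys/devices/system/cpu/cpu[0-9]*/online")
	if err != nil {
		panic(err)
	}
	for _, p := range paths {
		b, err := os.ReadFile(p)
		if err != nil {
			continue // skip entries that disappear or are unreadable
		}
		online := strings.TrimSpace(string(b)) == "1"
		fmt.Printf("%s online=%v\n", filepath.Base(filepath.Dir(p)), online)
	}
}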

View file

@@ -18,10 +18,11 @@ package collector
import (
"fmt"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs/sysfs"
"log/slog"
"strings"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs/sysfs"
)
type cpuFreqCollector struct {

View file

@@ -32,12 +32,12 @@ func (c *filesystemCollector) GetStats() (stats []filesystemStats, err error) {
return nil, err
}
for _, stat := range fsStat {
if c.excludedMountPointsPattern.MatchString(stat.MountPoint) {
if c.mountPointFilter.ignored(stat.MountPoint) {
c.logger.Debug("Ignoring mount point", "mountpoint", stat.MountPoint)
continue
}
fstype := stat.TypeString()
if c.excludedFSTypesPattern.MatchString(fstype) {
if c.fsTypeFilter.ignored(fstype) {
c.logger.Debug("Ignoring fs type", "type", fstype)
continue
}

View file

@@ -48,14 +48,14 @@ func (c *filesystemCollector) GetStats() (stats []filesystemStats, err error) {
stats = []filesystemStats{}
for i := 0; i < int(count); i++ {
mountpoint := C.GoString(&mnt[i].f_mntonname[0])
if c.excludedMountPointsPattern.MatchString(mountpoint) {
if c.mountPointFilter.ignored(mountpoint) {
c.logger.Debug("Ignoring mount point", "mountpoint", mountpoint)
continue
}
device := C.GoString(&mnt[i].f_mntfromname[0])
fstype := C.GoString(&mnt[i].f_fstypename[0])
if c.excludedFSTypesPattern.MatchString(fstype) {
if c.fsTypeFilter.ignored(fstype) {
c.logger.Debug("Ignoring fs type", "type", fstype)
continue
}

View file

@@ -19,8 +19,8 @@ package collector
import (
"errors"
"fmt"
"log/slog"
"regexp"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus/client_golang/prometheus"
@@ -36,7 +36,7 @@ var (
mountPointsExcludeSet bool
mountPointsExclude = kingpin.Flag(
"collector.filesystem.mount-points-exclude",
"Regexp of mount points to exclude for filesystem collector.",
"Regexp of mount points to exclude for filesystem collector. (mutually exclusive to mount-points-include)",
).Default(defMountPointsExcluded).PreAction(func(c *kingpin.ParseContext) error {
mountPointsExcludeSet = true
return nil
@@ -45,11 +45,15 @@ var (
"collector.filesystem.ignored-mount-points",
"Regexp of mount points to ignore for filesystem collector.",
).Hidden().String()
mountPointsInclude = kingpin.Flag(
"collector.filesystem.mount-points-include",
"Regexp of mount points to include for filesystem collector. (mutually exclusive to mount-points-exclude)",
).String()
fsTypesExcludeSet bool
fsTypesExclude = kingpin.Flag(
"collector.filesystem.fs-types-exclude",
"Regexp of filesystem types to exclude for filesystem collector.",
"Regexp of filesystem types to exclude for filesystem collector. (mutually exclusive to fs-types-include)",
).Default(defFSTypesExcluded).PreAction(func(c *kingpin.ParseContext) error {
fsTypesExcludeSet = true
return nil
@@ -58,13 +62,17 @@ var (
"collector.filesystem.ignored-fs-types",
"Regexp of filesystem types to ignore for filesystem collector.",
).Hidden().String()
fsTypesInclude = kingpin.Flag(
"collector.filesystem.fs-types-include",
"Regexp of filesystem types to exclude for filesystem collector. (mutually exclusive to fs-types-exclude)",
).String()
filesystemLabelNames = []string{"device", "mountpoint", "fstype", "device_error"}
)
type filesystemCollector struct {
excludedMountPointsPattern *regexp.Regexp
excludedFSTypesPattern *regexp.Regexp
mountPointFilter deviceFilter
fsTypeFilter deviceFilter
sizeDesc, freeDesc, availDesc *prometheus.Desc
filesDesc, filesFreeDesc *prometheus.Desc
roDesc, deviceErrorDesc *prometheus.Desc
@@ -89,29 +97,7 @@ func init() {
// NewFilesystemCollector returns a new Collector exposing filesystems stats.
func NewFilesystemCollector(logger *slog.Logger) (Collector, error) {
if *oldMountPointsExcluded != "" {
if !mountPointsExcludeSet {
logger.Warn("--collector.filesystem.ignored-mount-points is DEPRECATED and will be removed in 2.0.0, use --collector.filesystem.mount-points-exclude")
*mountPointsExclude = *oldMountPointsExcluded
} else {
return nil, errors.New("--collector.filesystem.ignored-mount-points and --collector.filesystem.mount-points-exclude are mutually exclusive")
}
}
if *oldFSTypesExcluded != "" {
if !fsTypesExcludeSet {
logger.Warn("--collector.filesystem.ignored-fs-types is DEPRECATED and will be removed in 2.0.0, use --collector.filesystem.fs-types-exclude")
*fsTypesExclude = *oldFSTypesExcluded
} else {
return nil, errors.New("--collector.filesystem.ignored-fs-types and --collector.filesystem.fs-types-exclude are mutually exclusive")
}
}
subsystem := "filesystem"
logger.Info("Parsed flag --collector.filesystem.mount-points-exclude", "flag", *mountPointsExclude)
mountPointPattern := regexp.MustCompile(*mountPointsExclude)
logger.Info("Parsed flag --collector.filesystem.fs-types-exclude", "flag", *fsTypesExclude)
filesystemsTypesPattern := regexp.MustCompile(*fsTypesExclude)
const subsystem = "filesystem"
sizeDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "size_bytes"),
@@ -162,18 +148,28 @@ func NewFilesystemCollector(logger *slog.Logger) (Collector, error) {
nil,
)
mountPointFilter, err := newMountPointsFilter(logger)
if err != nil {
return nil, fmt.Errorf("unable to parse mount points filter flags: %w", err)
}
fsTypeFilter, err := newFSTypeFilter(logger)
if err != nil {
return nil, fmt.Errorf("unable to parse fs types filter flags: %w", err)
}
return &filesystemCollector{
excludedMountPointsPattern: mountPointPattern,
excludedFSTypesPattern: filesystemsTypesPattern,
sizeDesc: sizeDesc,
freeDesc: freeDesc,
availDesc: availDesc,
filesDesc: filesDesc,
filesFreeDesc: filesFreeDesc,
roDesc: roDesc,
deviceErrorDesc: deviceErrorDesc,
mountInfoDesc: mountInfoDesc,
logger: logger,
mountPointFilter: mountPointFilter,
fsTypeFilter: fsTypeFilter,
sizeDesc: sizeDesc,
freeDesc: freeDesc,
availDesc: availDesc,
filesDesc: filesDesc,
filesFreeDesc: filesFreeDesc,
roDesc: roDesc,
deviceErrorDesc: deviceErrorDesc,
mountInfoDesc: mountInfoDesc,
logger: logger,
}, nil
}
@@ -230,3 +226,61 @@ func (c *filesystemCollector) Update(ch chan<- prometheus.Metric) error {
}
return nil
}
func newMountPointsFilter(logger *slog.Logger) (deviceFilter, error) {
if *oldMountPointsExcluded != "" {
if !mountPointsExcludeSet {
logger.Warn("--collector.filesystem.ignored-mount-points is DEPRECATED and will be removed in 2.0.0, use --collector.filesystem.mount-points-exclude")
*mountPointsExclude = *oldMountPointsExcluded
} else {
return deviceFilter{}, errors.New("--collector.filesystem.ignored-mount-points and --collector.filesystem.mount-points-exclude are mutually exclusive")
}
}
if *mountPointsInclude != "" && !mountPointsExcludeSet {
logger.Debug("mount-points-exclude flag not set when mount-points-include flag is set, assuming include is desired")
*mountPointsExclude = ""
}
if *mountPointsExclude != "" && *mountPointsInclude != "" {
return deviceFilter{}, errors.New("--collector.filesystem.mount-points-exclude and --collector.filesystem.mount-points-include are mutually exclusive")
}
if *mountPointsExclude != "" {
logger.Info("Parsed flag --collector.filesystem.mount-points-exclude", "flag", *mountPointsExclude)
}
if *mountPointsInclude != "" {
logger.Info("Parsed flag --collector.filesystem.mount-points-include", "flag", *mountPointsInclude)
}
return newDeviceFilter(*mountPointsExclude, *mountPointsInclude), nil
}
func newFSTypeFilter(logger *slog.Logger) (deviceFilter, error) {
if *oldFSTypesExcluded != "" {
if !fsTypesExcludeSet {
logger.Warn("--collector.filesystem.ignored-fs-types is DEPRECATED and will be removed in 2.0.0, use --collector.filesystem.fs-types-exclude")
*fsTypesExclude = *oldFSTypesExcluded
} else {
return deviceFilter{}, errors.New("--collector.filesystem.ignored-fs-types and --collector.filesystem.fs-types-exclude are mutually exclusive")
}
}
if *fsTypesInclude != "" && !fsTypesExcludeSet {
logger.Debug("fs-types-exclude flag not set when fs-types-include flag is set, assuming include is desired")
*fsTypesExclude = ""
}
if *fsTypesExclude != "" && *fsTypesInclude != "" {
return deviceFilter{}, errors.New("--collector.filesystem.fs-types-exclude and --collector.filesystem.fs-types-include are mutually exclusive")
}
if *fsTypesExclude != "" {
logger.Info("Parsed flag --collector.filesystem.fs-types-exclude", "flag", *fsTypesExclude)
}
if *fsTypesInclude != "" {
logger.Info("Parsed flag --collector.filesystem.fs-types-include", "flag", *fsTypesInclude)
}
return newDeviceFilter(*fsTypesExclude, *fsTypesInclude), nil
}
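
The deviceFilter returned by newDeviceFilter is defined elsewhere in the collector package; the diff only shows how it is consumed. A sketch consistent with that usage (an assumed shape, not the verbatim implementation): a name is ignored when it matches the exclude pattern, or when an include pattern is configured and the name does not match it.

package collector

import "regexp"

// deviceFilter pairs an optional exclude regexp with an optional include regexp.
type deviceFilter struct {
	ignorePattern *regexp.Regexp
	acceptPattern *regexp.Regexp
}

func newDeviceFilter(ignoredPattern, acceptPattern string) (f deviceFilter) {
	if ignoredPattern != "" {
		f.ignorePattern = regexp.MustCompile(ignoredPattern)
	}
	if acceptPattern != "" {
		f.acceptPattern = regexp.MustCompile(acceptPattern)
	}
	return
}

// ignored reports whether name should be skipped by the collector.
func (f *deviceFilter) ignored(name string) bool {
	return (f.ignorePattern != nil && f.ignorePattern.MatchString(name)) ||
		(f.acceptPattern != nil && !f.acceptPattern.MatchString(name))
}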

View file

@@ -39,14 +39,14 @@ func (c *filesystemCollector) GetStats() ([]filesystemStats, error) {
stats := []filesystemStats{}
for _, fs := range buf {
mountpoint := unix.ByteSliceToString(fs.Mntonname[:])
if c.excludedMountPointsPattern.MatchString(mountpoint) {
if c.mountPointFilter.ignored(mountpoint) {
c.logger.Debug("Ignoring mount point", "mountpoint", mountpoint)
continue
}
device := unix.ByteSliceToString(fs.Mntfromname[:])
fstype := unix.ByteSliceToString(fs.Fstypename[:])
if c.excludedFSTypesPattern.MatchString(fstype) {
if c.fsTypeFilter.ignored(fstype) {
c.logger.Debug("Ignoring fs type", "type", fstype)
continue
}

View file

@@ -73,12 +73,12 @@ func (c *filesystemCollector) GetStats() ([]filesystemStats, error) {
go func() {
for _, labels := range mps {
if c.excludedMountPointsPattern.MatchString(labels.mountPoint) {
if c.mountPointFilter.ignored(labels.mountPoint) {
c.logger.Debug("Ignoring mount point", "mountpoint", labels.mountPoint)
continue
}
if c.excludedFSTypesPattern.MatchString(labels.fsType) {
c.logger.Debug("Ignoring fs", "type", labels.fsType)
if c.fsTypeFilter.ignored(labels.fsType) {
c.logger.Debug("Ignoring fs type", "type", labels.fsType)
continue
}

View file

@@ -97,14 +97,14 @@ func (c *filesystemCollector) GetStats() (stats []filesystemStats, err error) {
stats = []filesystemStats{}
for _, v := range mnt {
mountpoint := unix.ByteSliceToString(v.F_mntonname[:])
if c.excludedMountPointsPattern.MatchString(mountpoint) {
if c.mountPointFilter.ignored(mountpoint) {
c.logger.Debug("msg", "Ignoring mount point", "mountpoint", mountpoint)
continue
}
device := unix.ByteSliceToString(v.F_mntfromname[:])
fstype := unix.ByteSliceToString(v.F_fstypename[:])
if c.excludedFSTypesPattern.MatchString(fstype) {
if c.fsTypeFilter.ignored(fstype) {
c.logger.Debug("msg", "Ignoring fs type", "type", fstype)
continue
}

View file

@@ -41,14 +41,14 @@ func (c *filesystemCollector) GetStats() (stats []filesystemStats, err error) {
stats = []filesystemStats{}
for _, v := range mnt {
mountpoint := unix.ByteSliceToString(v.F_mntonname[:])
if c.excludedMountPointsPattern.MatchString(mountpoint) {
if c.mountPointFilter.ignored(mountpoint) {
c.logger.Debug("Ignoring mount point", "mountpoint", mountpoint)
continue
}
device := unix.ByteSliceToString(v.F_mntfromname[:])
fstype := unix.ByteSliceToString(v.F_fstypename[:])
if c.excludedFSTypesPattern.MatchString(fstype) {
if c.fsTypeFilter.ignored(fstype) {
c.logger.Debug("Ignoring fs type", "type", fstype)
continue
}

View file

@@ -871,6 +871,10 @@ node_hwmon_fan_target_rpm{chip="nct6779",sensor="fan2"} 27000
# HELP node_hwmon_fan_tolerance Hardware monitor fan element tolerance
# TYPE node_hwmon_fan_tolerance gauge
node_hwmon_fan_tolerance{chip="nct6779",sensor="fan2"} 0
# HELP node_hwmon_freq_freq_mhz Hardware monitor for GPU frequency in MHz
# TYPE node_hwmon_freq_freq_mhz gauge
node_hwmon_freq_freq_mhz{chip="hwmon4",sensor="mclk"} 300
node_hwmon_freq_freq_mhz{chip="hwmon4",sensor="sclk"} 214
# HELP node_hwmon_in_alarm Hardware sensor alarm status (in)
# TYPE node_hwmon_in_alarm gauge
node_hwmon_in_alarm{chip="nct6779",sensor="in0"} 0
@@ -984,8 +988,10 @@ node_hwmon_pwm_weight_temp_step_tol{chip="nct6779",sensor="pwm1"} 0
# TYPE node_hwmon_sensor_label gauge
node_hwmon_sensor_label{chip="hwmon4",label="foosensor",sensor="temp1"} 1
node_hwmon_sensor_label{chip="hwmon4",label="foosensor",sensor="temp2"} 1
node_hwmon_sensor_label{chip="platform_applesmc_768",label="Left side ",sensor="fan1"} 1
node_hwmon_sensor_label{chip="platform_applesmc_768",label="Right side ",sensor="fan2"} 1
node_hwmon_sensor_label{chip="hwmon4",label="mclk",sensor="freq2"} 1
node_hwmon_sensor_label{chip="hwmon4",label="sclk",sensor="freq1"} 1
node_hwmon_sensor_label{chip="platform_applesmc_768",label="Left side",sensor="fan1"} 1
node_hwmon_sensor_label{chip="platform_applesmc_768",label="Right side",sensor="fan2"} 1
node_hwmon_sensor_label{chip="platform_coretemp_0",label="Core 0",sensor="temp2"} 1
node_hwmon_sensor_label{chip="platform_coretemp_0",label="Core 1",sensor="temp3"} 1
node_hwmon_sensor_label{chip="platform_coretemp_0",label="Core 2",sensor="temp4"} 1

View file

@@ -893,6 +893,10 @@ node_hwmon_fan_target_rpm{chip="nct6779",sensor="fan2"} 27000
# HELP node_hwmon_fan_tolerance Hardware monitor fan element tolerance
# TYPE node_hwmon_fan_tolerance gauge
node_hwmon_fan_tolerance{chip="nct6779",sensor="fan2"} 0
# HELP node_hwmon_freq_freq_mhz Hardware monitor for GPU frequency in MHz
# TYPE node_hwmon_freq_freq_mhz gauge
node_hwmon_freq_freq_mhz{chip="hwmon4",sensor="mclk"} 300
node_hwmon_freq_freq_mhz{chip="hwmon4",sensor="sclk"} 214
# HELP node_hwmon_in_alarm Hardware sensor alarm status (in)
# TYPE node_hwmon_in_alarm gauge
node_hwmon_in_alarm{chip="nct6779",sensor="in0"} 0
@@ -1006,8 +1010,10 @@ node_hwmon_pwm_weight_temp_step_tol{chip="nct6779",sensor="pwm1"} 0
# TYPE node_hwmon_sensor_label gauge
node_hwmon_sensor_label{chip="hwmon4",label="foosensor",sensor="temp1"} 1
node_hwmon_sensor_label{chip="hwmon4",label="foosensor",sensor="temp2"} 1
node_hwmon_sensor_label{chip="platform_applesmc_768",label="Left side ",sensor="fan1"} 1
node_hwmon_sensor_label{chip="platform_applesmc_768",label="Right side ",sensor="fan2"} 1
node_hwmon_sensor_label{chip="hwmon4",label="mclk",sensor="freq2"} 1
node_hwmon_sensor_label{chip="hwmon4",label="sclk",sensor="freq1"} 1
node_hwmon_sensor_label{chip="platform_applesmc_768",label="Left side",sensor="fan1"} 1
node_hwmon_sensor_label{chip="platform_applesmc_768",label="Right side",sensor="fan2"} 1
node_hwmon_sensor_label{chip="platform_coretemp_0",label="Core 0",sensor="temp2"} 1
node_hwmon_sensor_label{chip="platform_coretemp_0",label="Core 1",sensor="temp3"} 1
node_hwmon_sensor_label{chip="platform_coretemp_0",label="Core 2",sensor="temp4"} 1

View file

@@ -437,6 +437,26 @@ Lines: 1
100000
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/hwmon/hwmon4/freq1_input
Lines: 1
214000000
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/hwmon/hwmon4/freq1_label
Lines: 1
sclk
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/hwmon/hwmon4/freq2_input
Lines: 1
300000000
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/hwmon/hwmon4/freq2_label
Lines: 1
mclk
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/hwmon/hwmon5
SymlinkTo: ../../devices/platform/bogus.0/hwmon/hwmon5/
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -1337,7 +1357,7 @@ Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme/nvme0/model
Lines: 1
Samsung SSD 970 PRO 512GB
Samsung SSD 970 PRO 512GB
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/nvme/nvme0/serial
@@ -2750,7 +2770,7 @@ Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/platform/applesmc.768/fan1_label
Lines: 1
Left side
Left side
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/platform/applesmc.768/fan1_manual
@@ -2784,7 +2804,7 @@ Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/platform/applesmc.768/fan2_label
Lines: 1
Right side
Right side
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/devices/platform/applesmc.768/fan2_manual

View file

@@ -44,7 +44,7 @@ var (
hwmonSensorTypes = []string{
"vrm", "beep_enable", "update_interval", "in", "cpu", "fan",
"pwm", "temp", "curr", "power", "energy", "humidity",
"intrusion",
"intrusion", "freq",
}
)
@@ -357,6 +357,15 @@ func (c *hwMonCollector) updateHwmon(ch chan<- prometheus.Metric, dir string) er
continue
}
if sensorType == "freq" && element == "input" {
if label, ok := sensorData["label"]; ok {
sensorLabel := cleanMetricName(label)
desc := prometheus.NewDesc(name+"_freq_mhz", "Hardware monitor for GPU frequency in MHz", hwmonLabelDesc, nil)
ch <- prometheus.MustNewConstMetric(
desc, prometheus.GaugeValue, parsedValue/1000000.0, append(labels[:len(labels)-1], sensorLabel)...)
}
continue
}
// fallback, just dump the metric as is
desc := prometheus.NewDesc(name, "Hardware monitor "+sensorType+" element "+element, hwmonLabelDesc, nil)
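
Worked example for the new freq branch: hwmon exposes freqN_input in hertz and the parsedValue/1000000.0 division reports megahertz, so the test fixture's freq1_input of 214000000 (labelled sclk) surfaces as node_hwmon_freq_freq_mhz 214. The same arithmetic in isolation:

package main

import "fmt"

func main() {
	const freqInputHz = 214000000.0 // contents of freq1_input in the fixture
	fmt.Printf("node_hwmon_freq_freq_mhz = %g\n", freqInputHz/1000000.0) // prints 214
}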

View file

@@ -18,9 +18,10 @@
package collector
import (
"log/slog"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus/client_golang/prometheus"
"log/slog"
)
type interruptsCollector struct {

View file

@@ -17,10 +17,11 @@
package collector
import (
"log/slog"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus-community/go-runit/runit"
"github.com/prometheus/client_golang/prometheus"
"log/slog"
)
var runitServiceDir = kingpin.Flag("collector.runit.servicedir", "Path to runit service directory.").Default("/etc/service").String()

View file

@@ -17,9 +17,10 @@
package collector
import (
"log/slog"
"github.com/opencontainers/selinux/go-selinux"
"github.com/prometheus/client_golang/prometheus"
"log/slog"
)
type selinuxCollector struct {

View file

@@ -18,9 +18,10 @@ package collector
import (
"fmt"
"log/slog"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs"
"log/slog"
)
type softirqsCollector struct {

View file

@@ -18,8 +18,9 @@
package collector
import (
"github.com/prometheus/client_golang/prometheus"
"log/slog"
"github.com/prometheus/client_golang/prometheus"
)
var unameDesc = prometheus.NewDesc(

View file

@@ -10,7 +10,7 @@
(
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceFillingUpWarningThreshold)d
and
predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[%(fsSpaceFillingUpPredictionWindow)s], 24*60*60) < 0
predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[%(fsSpaceFillingUpPredictionWindow)s], %(nodeWarningWindowHours)s*60*60) < 0
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0
)
@@ -20,7 +20,7 @@
severity: 'warning',
},
annotations: {
summary: 'Filesystem is predicted to run out of space within the next 24 hours.',
summary: 'Filesystem is predicted to run out of space within the next %(nodeWarningWindowHours)s hours.' % $._config,
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.',
},
},
@@ -30,7 +30,7 @@
(
node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < %(fsSpaceFillingUpCriticalThreshold)d
and
predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 4*60*60) < 0
predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], %(nodeCriticalWindowHours)s*60*60) < 0
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0
)
@@ -40,7 +40,7 @@
severity: '%(nodeCriticalSeverity)s' % $._config,
},
annotations: {
summary: 'Filesystem is predicted to run out of space within the next 4 hours.',
summary: 'Filesystem is predicted to run out of space within the next %(nodeCriticalWindowHours)s hours.' % $._config,
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.',
},
},
@@ -86,7 +86,7 @@
(
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 40
and
predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 24*60*60) < 0
predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], %(nodeWarningWindowHours)s*60*60) < 0
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0
)
@@ -96,7 +96,7 @@
severity: 'warning',
},
annotations: {
summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.',
summary: 'Filesystem is predicted to run out of inodes within the next %(nodeWarningWindowHours)s hours.' % $._config,
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.',
},
},
@@ -106,7 +106,7 @@
(
node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} * 100 < 20
and
predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], 4*60*60) < 0
predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s}[6h], %(nodeCriticalWindowHours)s*60*60) < 0
and
node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s,%(fsMountpointSelector)s} == 0
)
@@ -116,7 +116,7 @@
severity: '%(nodeCriticalSeverity)s' % $._config,
},
annotations: {
summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.',
summary: 'Filesystem is predicted to run out of inodes within the next %(nodeCriticalWindowHours)s hours.' % $._config,
description: 'Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.',
},
},
@@ -191,7 +191,7 @@
||| % $._config,
annotations: {
summary: 'Number of conntrack are getting close to the limit.',
description: '{{ $value | humanizePercentage }} of conntrack entries are used.',
description: '{{ $labels.instance }} {{ $value | humanizePercentage }} of conntrack entries are used.',
},
labels: {
severity: 'warning',
@@ -312,7 +312,7 @@
{
alert: 'NodeCPUHighUsage',
expr: |||
sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d
sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!~"idle|iowait"}[2m]))) * 100 > %(cpuHighUsageThreshold)d
||| % $._config,
'for': '15m',
labels: {
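
The widened mode filter matters because iowait is time a CPU spends idle while I/O is outstanding; counting it as usage let NodeCPUHighUsage fire on I/O-bound but otherwise lightly loaded machines, so excluding it alongside idle keeps the alert focused on actual compute load.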

View file

@@ -50,6 +50,16 @@
// 'NodeSystemSaturation' alert.
systemSaturationPerCoreThreshold: 2,
// Some of the alerts use predict_linear() to fire ahead of time and
// prevent unrecoverable situations (e.g. running out of disk space).
// However, the node may have automated processes (cronjobs) in place that
// resolve the condition within a certain time window; if that window does
// not align with the default prediction window of these alerts, they can
// start flapping. Reducing the prediction window gives such automation
// time to resolve the condition before the alerts fire.
nodeWarningWindowHours: '24',
nodeCriticalWindowHours: '4',
// Available disk space (%) thresholds on which to trigger the
// 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
// usage grows in a way that it is predicted to run out in 4h or 1d
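
As a concrete use of these knobs: if a periodic cleanup job reclaims disk space, predicting 24 hours ahead keeps flagging exhaustion the job would have prevented and the alert flaps; an override in the mixin config such as { _config+:: { nodeWarningWindowHours: '12' } } (illustrative value) shortens the predict_linear() horizon accordingly.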

View file

@@ -1,469 +1,480 @@
local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
local grafana = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local prometheus = grafana.prometheus;
local template = grafana.template;
local graphPanel = grafana.graphPanel;
local variable = dashboard.variable;
local row = grafana.panel.row;
local prometheus = grafana.query.prometheus;
local timeSeriesPanel = grafana.panel.timeSeries;
local tsOptions = timeSeriesPanel.options;
local tsStandardOptions = timeSeriesPanel.standardOptions;
local tsQueryOptions = timeSeriesPanel.queryOptions;
local tsCustom = timeSeriesPanel.fieldConfig.defaults.custom;
local tsLegend = tsOptions.legend;
local c = import '../config.libsonnet';
local datasourceTemplate = {
current: {
text: 'default',
value: 'default',
},
hide: 0,
label: 'Data Source',
name: 'datasource',
options: [],
query: 'prometheus',
refresh: 1,
regex: '',
type: 'datasource',
};
local datasource = variable.datasource.new(
'datasource', 'prometheus'
);
local tsCommonPanelOptions =
variable.query.withDatasourceFromVariable(datasource)
+ tsCustom.stacking.withMode('normal')
+ tsCustom.withFillOpacity(100)
+ tsCustom.withShowPoints('never')
+ tsLegend.withShowLegend(false)
+ tsOptions.tooltip.withMode('multi')
+ tsOptions.tooltip.withSort('desc');
local CPUUtilisation =
graphPanel.new(
timeSeriesPanel.new(
'CPU Utilisation',
datasource='$datasource',
span=6,
format='percentunit',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
)
+ tsCommonPanelOptions
+ tsStandardOptions.withUnit('percentunit');
local CPUSaturation =
// TODO: Is this a useful panel? At least there should be some explanation how load
// average relates to the "CPU saturation" in the title.
graphPanel.new(
timeSeriesPanel.new(
'CPU Saturation (Load1 per CPU)',
datasource='$datasource',
span=6,
format='percentunit',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
)
+ tsCommonPanelOptions
+ tsStandardOptions.withUnit('percentunit');
local memoryUtilisation =
graphPanel.new(
timeSeriesPanel.new(
'Memory Utilisation',
datasource='$datasource',
span=6,
format='percentunit',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
)
+ tsCommonPanelOptions
+ tsStandardOptions.withUnit('percentunit');
local memorySaturation =
graphPanel.new(
timeSeriesPanel.new(
'Memory Saturation (Major Page Faults)',
datasource='$datasource',
span=6,
format='rds',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
)
+ tsCommonPanelOptions
+ tsStandardOptions.withUnit('rds');
local networkOverrides = tsStandardOptions.withOverrides(
[
tsStandardOptions.override.byRegexp.new('/Transmit/')
+ tsStandardOptions.override.byRegexp.withPropertiesFromOptions(
tsCustom.withTransform('negative-Y')
),
]
);
local networkUtilisation =
graphPanel.new(
timeSeriesPanel.new(
'Network Utilisation (Bytes Receive/Transmit)',
datasource='$datasource',
span=6,
format='Bps',
stack=true,
fill=10,
legend_show=false,
)
.addSeriesOverride({ alias: '/Receive/', stack: 'A' })
.addSeriesOverride({ alias: '/Transmit/', stack: 'B', transform: 'negative-Y' })
{ tooltip+: { sort: 2 } };
+ tsCommonPanelOptions
+ tsStandardOptions.withUnit('Bps')
+ networkOverrides;
local networkSaturation =
graphPanel.new(
timeSeriesPanel.new(
'Network Saturation (Drops Receive/Transmit)',
datasource='$datasource',
span=6,
format='Bps',
stack=true,
fill=10,
legend_show=false,
)
.addSeriesOverride({ alias: '/ Receive/', stack: 'A' })
.addSeriesOverride({ alias: '/ Transmit/', stack: 'B', transform: 'negative-Y' })
{ tooltip+: { sort: 2 } };
+ tsCommonPanelOptions
+ tsStandardOptions.withUnit('Bps')
+ networkOverrides;
local diskIOUtilisation =
graphPanel.new(
timeSeriesPanel.new(
'Disk IO Utilisation',
datasource='$datasource',
span=6,
format='percentunit',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
)
+ tsCommonPanelOptions
+ tsStandardOptions.withUnit('percentunit');
local diskIOSaturation =
graphPanel.new(
timeSeriesPanel.new(
'Disk IO Saturation',
datasource='$datasource',
span=6,
format='percentunit',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
)
+ tsCommonPanelOptions
+ tsStandardOptions.withUnit('percentunit');
local diskSpaceUtilisation =
graphPanel.new(
timeSeriesPanel.new(
'Disk Space Utilisation',
datasource='$datasource',
span=12,
format='percentunit',
stack=true,
fill=10,
legend_show=false,
) { tooltip+: { sort: 2 } };
)
+ tsCommonPanelOptions
+ tsStandardOptions.withUnit('percentunit');
{
_clusterTemplate:: template.new(
name='cluster',
datasource='$datasource',
query='label_values(node_time_seconds, %s)' % $._config.clusterLabel,
current='',
hide=if $._config.showMultiCluster then '' else '2',
refresh=2,
includeAll=false,
sort=1
),
_clusterVariable::
variable.query.new('cluster')
+ variable.query.withDatasourceFromVariable(datasource)
+ variable.query.queryTypes.withLabelValues(
$._config.clusterLabel,
'node_time_seconds',
)
+ (if $._config.showMultiCluster then variable.query.generalOptions.showOnDashboard.withLabelAndValue() else variable.query.generalOptions.showOnDashboard.withNothing())
+ variable.query.refresh.onTime()
+ variable.query.selectionOptions.withIncludeAll(false)
+ variable.query.withSort(asc=true),
grafanaDashboards+:: {
'node-rsrc-use.json':
dashboard.new(
'%sUSE Method / Node' % $._config.dashboardNamePrefix,
time_from='now-1h',
tags=($._config.dashboardTags),
timezone='utc',
refresh='30s',
graphTooltip='shared_crosshair',
uid=std.md5('node-rsrc-use.json')
)
.addTemplate(datasourceTemplate)
.addTemplate($._clusterTemplate)
.addTemplate(
template.new(
+ dashboard.time.withFrom('now-1h')
+ dashboard.withTags($._config.dashboardTags)
+ dashboard.withTimezone('utc')
+ dashboard.withRefresh('30s')
+ dashboard.graphTooltip.withSharedCrosshair()
+ dashboard.withUid(std.md5('node-rsrc-use.json'))
+ dashboard.withVariables([
datasource,
$._clusterVariable,
variable.query.new('instance')
+ variable.query.withDatasourceFromVariable(datasource)
+ variable.query.queryTypes.withLabelValues(
'instance',
'$datasource',
'label_values(node_exporter_build_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}, instance)' % $._config,
refresh='time',
sort=1
)
)
.addRow(
row.new('CPU')
.addPanel(CPUUtilisation.addTarget(prometheus.target('instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Utilisation')))
.addPanel(CPUSaturation.addTarget(prometheus.target('instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Saturation')))
)
.addRow(
row.new('Memory')
.addPanel(memoryUtilisation.addTarget(prometheus.target('instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Utilisation')))
.addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Major page Faults')))
)
.addRow(
row.new('Network')
.addPanel(
networkUtilisation
.addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Receive'))
.addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Transmit'))
)
.addPanel(
networkSaturation
.addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Receive'))
.addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='Transmit'))
)
)
.addRow(
row.new('Disk IO')
.addPanel(diskIOUtilisation.addTarget(prometheus.target('instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{device}}')))
.addPanel(diskIOSaturation.addTarget(prometheus.target('instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{device}}')))
)
.addRow(
row.new('Disk Space')
.addPanel(
diskSpaceUtilisation.addTarget(prometheus.target(
|||
sort_desc(1 -
(
max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"})
/
max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"})
) != 0
)
||| % $._config, legendFormat='{{device}}'
))
'node_exporter_build_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}' % $._config,
)
+ variable.query.refresh.onTime()
+ variable.query.withSort(asc=true),
])
+ dashboard.withPanels(
grafana.util.grid.makeGrid([
row.new('CPU')
+ row.withPanels([
CPUUtilisation + tsQueryOptions.withTargets([prometheus.new('$datasource', 'instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Utilisation')]),
CPUSaturation + tsQueryOptions.withTargets([prometheus.new('$datasource', 'instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Saturation')]),
]),
row.new('Memory')
+ row.withPanels([
memoryUtilisation + tsQueryOptions.withTargets([prometheus.new('$datasource', 'instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Utilisation')]),
memorySaturation + tsQueryOptions.withTargets([prometheus.new('$datasource', 'instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Major page Faults')]),
]),
row.new('Network')
+ row.withPanels([
networkUtilisation + tsQueryOptions.withTargets([
prometheus.new('$datasource', 'instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Receive'),
prometheus.new('$datasource', 'instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Transmit'),
]),
networkSaturation + tsQueryOptions.withTargets([
prometheus.new('$datasource', 'instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Receive'),
prometheus.new('$datasource', 'instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('Transmit'),
]),
]),
row.new('Disk IO')
+ row.withPanels([
diskIOUtilisation + tsQueryOptions.withTargets([prometheus.new('$datasource', 'instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('{{device}}')]),
diskIOSaturation + tsQueryOptions.withTargets([prometheus.new('$datasource', 'instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0' % $._config) + prometheus.withLegendFormat('{{device}}')]),
]),
], panelWidth=12, panelHeight=7)
+ grafana.util.grid.makeGrid([
row.new('Disk Space')
+ row.withPanels([
diskSpaceUtilisation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
sort_desc(1 -
(
max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"})
/
max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, fstype!="", instance="$instance", %(clusterLabel)s="$cluster"})
) != 0
)
||| % $._config
) + prometheus.withLegendFormat('{{device}}'),
]),
]),
], panelWidth=24, panelHeight=7, startY=34),
),
'node-cluster-rsrc-use.json':
dashboard.new(
'%sUSE Method / Cluster' % $._config.dashboardNamePrefix,
time_from='now-1h',
tags=($._config.dashboardTags),
timezone='utc',
refresh='30s',
graphTooltip='shared_crosshair',
uid=std.md5('node-cluster-rsrc-use.json')
)
.addTemplate(datasourceTemplate)
.addTemplate($._clusterTemplate)
.addRow(
row.new('CPU')
.addPanel(
CPUUtilisation
.addTarget(prometheus.target(
|||
((
instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
*
instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
) != 0 )
/ scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
||| % $._config, legendFormat='{{ instance }}'
))
)
.addPanel(
CPUSaturation
.addTarget(prometheus.target(
|||
(
instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
/ scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
) != 0
||| % $._config, legendFormat='{{instance}}'
))
)
)
.addRow(
row.new('Memory')
.addPanel(
memoryUtilisation
.addTarget(prometheus.target(
|||
(
instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
/ scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
) != 0
||| % $._config, legendFormat='{{instance}}',
))
)
.addPanel(memorySaturation.addTarget(prometheus.target('instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{instance}}')))
)
.addRow(
row.new('Network')
.addPanel(
networkUtilisation
.addTarget(prometheus.target('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive'))
.addTarget(prometheus.target('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit'))
)
.addPanel(
networkSaturation
.addTarget(prometheus.target('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Receive'))
.addTarget(prometheus.target('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config, legendFormat='{{instance}} Transmit'))
)
)
.addRow(
row.new('Disk IO')
.addPanel(
diskIOUtilisation
.addTarget(prometheus.target(
|||
(
instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
/ scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
) != 0
||| % $._config, legendFormat='{{instance}} {{device}}'
))
)
.addPanel(
diskIOSaturation
.addTarget(prometheus.target(
|||
(
+ dashboard.time.withFrom('now-1h')
+ dashboard.withTags($._config.dashboardTags)
+ dashboard.withTimezone('utc')
+ dashboard.withRefresh('30s')
+ dashboard.graphTooltip.withSharedCrosshair()
+ dashboard.withUid(std.md5('node-cluster-rsrc-use.json'))
+ dashboard.withVariables([
datasource,
$._clusterVariable,
variable.query.withDatasourceFromVariable(datasource)
+ variable.query.refresh.onTime()
+ variable.query.withSort(asc=true),
])
+ dashboard.withPanels(
grafana.util.grid.makeGrid([
row.new('CPU')
+ row.withPanels([
CPUUtilisation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
((
instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
*
instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
) != 0 )
/ scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
||| % $._config
) + prometheus.withLegendFormat('{{ instance }}'),
]),
CPUSaturation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
(
instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
/ scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
) != 0
||| % $._config
) + prometheus.withLegendFormat('{{ instance }}'),
]),
]),
row.new('Memory')
+ row.withPanels([
memoryUtilisation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
(
instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
/ scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
) != 0
||| % $._config
) + prometheus.withLegendFormat('{{ instance }}'),
]),
memorySaturation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
'instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}' % $._config
) + prometheus.withLegendFormat('{{ instance }}'),
]),
]),
row.new('Network')
+ row.withPanels([
networkUtilisation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
'instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config
) + prometheus.withLegendFormat('{{ instance }} Receive'),
prometheus.new(
'$datasource',
'instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config
) + prometheus.withLegendFormat('{{ instance }} Transmit'),
]),
networkSaturation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
'instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config
) + prometheus.withLegendFormat('{{ instance }} Receive'),
prometheus.new(
'$datasource',
'instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0' % $._config
) + prometheus.withLegendFormat('{{ instance }} Transmit'),
]),
]),
row.new('Disk IO')
+ row.withPanels([
diskIOUtilisation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
/ scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
||| % $._config
) + prometheus.withLegendFormat('{{ instance }} {{device}}'),
]),
diskIOSaturation + tsQueryOptions.withTargets([prometheus.new(
'$datasource',
|||
instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
) != 0
||| % $._config, legendFormat='{{instance}} {{device}}'
))
)
)
.addRow(
row.new('Disk Space')
.addPanel(
diskSpaceUtilisation
.addTarget(prometheus.target(
|||
sum without (device) (
max without (fstype, mountpoint) ((
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"}
-
node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"}
) != 0)
)
/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"})))
||| % $._config, legendFormat='{{instance}}'
))
)
||| % $._config
) + prometheus.withLegendFormat('{{ instance }} {{device}}')]),
]),
], panelWidth=12, panelHeight=7)
+ grafana.util.grid.makeGrid([
row.new('Disk Space')
+ row.withPanels([
diskSpaceUtilisation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
sum without (device) (
max without (fstype, mountpoint) ((
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"}
-
node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"}
) != 0)
)
/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s, %(clusterLabel)s="$cluster"})))
||| % $._config
) + prometheus.withLegendFormat('{{ instance }}'),
]),
]),
], panelWidth=24, panelHeight=7, startY=34),
),
} +
if $._config.showMultiCluster then {
'node-multicluster-rsrc-use.json':
dashboard.new(
'%sUSE Method / Multi-cluster' % $._config.dashboardNamePrefix,
time_from='now-1h',
tags=($._config.dashboardTags),
timezone='utc',
refresh='30s',
graphTooltip='shared_crosshair',
uid=std.md5('node-multicluster-rsrc-use.json')
)
.addTemplate(datasourceTemplate)
.addRow(
row.new('CPU')
.addPanel(
CPUUtilisation
.addTarget(prometheus.target(
|||
sum(
((
instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s}
*
instance:node_num_cpu:sum{%(nodeExporterSelector)s}
) != 0)
/ scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s}))
) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config
))
)
.addPanel(
CPUSaturation
.addTarget(prometheus.target(
|||
sum((
instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
/ scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}))
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config
))
)
)
.addRow(
row.new('Memory')
.addPanel(
memoryUtilisation
.addTarget(prometheus.target(
|||
sum((
instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}
/ scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}))
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config
))
)
.addPanel(
memorySaturation
.addTarget(prometheus.target(
|||
sum((
instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config
))
)
)
.addRow(
row.new('Network')
.addPanel(
networkUtilisation
.addTarget(prometheus.target(
|||
sum((
instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}} Receive' % $._config
))
.addTarget(prometheus.target(
|||
sum((
instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}} Transmit' % $._config
))
)
.addPanel(
networkSaturation
.addTarget(prometheus.target(
|||
sum((
instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}} Receive' % $._config
))
.addTarget(prometheus.target(
|||
sum((
instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}} Transmit' % $._config
))
)
)
.addRow(
row.new('Disk IO')
.addPanel(
diskIOUtilisation
.addTarget(prometheus.target(
|||
sum((
instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}
/ scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}))
) != 0) by (%(clusterLabel)s, device)
||| % $._config, legendFormat='{{%(clusterLabel)s}} {{device}}' % $._config
))
)
.addPanel(
diskIOSaturation
.addTarget(prometheus.target(
|||
sum((
instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}
/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}))
) != 0) by (%(clusterLabel)s, device)
||| % $._config, legendFormat='{{%(clusterLabel)s}} {{device}}' % $._config
))
)
)
.addRow(
row.new('Disk Space')
.addPanel(
diskSpaceUtilisation
.addTarget(prometheus.target(
|||
sum (
sum without (device) (
max without (fstype, mountpoint, instance, pod) ((
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}
) != 0)
)
/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s})))
) by (%(clusterLabel)s)
||| % $._config, legendFormat='{{%(clusterLabel)s}}' % $._config
))
)
+ dashboard.time.withFrom('now-1h')
+ dashboard.withTags($._config.dashboardTags)
+ dashboard.withTimezone('utc')
+ dashboard.withRefresh('30s')
+ dashboard.graphTooltip.withSharedCrosshair()
+ dashboard.withUid(std.md5('node-multicluster-rsrc-use.json'))
+ dashboard.withVariables([
datasource,
variable.query.withDatasourceFromVariable(datasource)
+ variable.query.refresh.onTime()
+ variable.query.withSort(asc=true),
])
+ dashboard.withPanels(
grafana.util.grid.makeGrid([
row.new('CPU')
+ row.withPanels([
CPUUtilisation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
sum(
((
instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s}
*
instance:node_num_cpu:sum{%(nodeExporterSelector)s}
) != 0)
/ scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s}))
) by (%(clusterLabel)s)
||| % $._config
) + prometheus.withLegendFormat('{{%(clusterLabel)s}}'),
]),
CPUSaturation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
sum((
instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
/ scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}))
) != 0) by (%(clusterLabel)s)
||| % $._config
) + prometheus.withLegendFormat('{{%(clusterLabel)s}}'),
]),
]),
row.new('Memory')
+ row.withPanels([
memoryUtilisation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
sum((
instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}
/ scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}))
) != 0) by (%(clusterLabel)s)
||| % $._config
) + prometheus.withLegendFormat('{{%(clusterLabel)s}}'),
]),
memorySaturation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
sum((
instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config
) + prometheus.withLegendFormat('{{%(clusterLabel)s}}'),
]),
]),
row.new('Network')
+ row.withPanels([
networkUtilisation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
sum((
instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config
) + prometheus.withLegendFormat('{{%(clusterLabel)s}} Receive'),
prometheus.new(
'$datasource',
|||
sum((
instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config
) + prometheus.withLegendFormat('{{%(clusterLabel)s}} Transmit'),
]),
networkSaturation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
sum((
instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config
) + prometheus.withLegendFormat('{{%(clusterLabel)s}} Receive'),
prometheus.new(
'$datasource',
|||
sum((
instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
) != 0) by (%(clusterLabel)s)
||| % $._config
) + prometheus.withLegendFormat('{{%(clusterLabel)s}} Transmit'),
]),
]),
row.new('Disk IO')
+ row.withPanels([
diskIOUtilisation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
sum((
instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}
/ scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}))
) != 0) by (%(clusterLabel)s, device)
||| % $._config
) + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{device}}'),
]),
diskIOSaturation + tsQueryOptions.withTargets([prometheus.new(
'$datasource',
|||
sum((
instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}
/ scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}))
) != 0) by (%(clusterLabel)s, device)
||| % $._config
) + prometheus.withLegendFormat('{{%(clusterLabel)s}} {{device}}')]),
]),
], panelWidth=12, panelHeight=7)
+ grafana.util.grid.makeGrid([
row.new('Disk Space')
+ row.withPanels([
diskSpaceUtilisation + tsQueryOptions.withTargets([
prometheus.new(
'$datasource',
|||
sum (
sum without (device) (
max without (fstype, mountpoint, instance, pod) ((
node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s}
) != 0)
)
/ scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(fsMountpointSelector)s})))
) by (%(clusterLabel)s)
||| % $._config
) + prometheus.withLegendFormat('{{%(clusterLabel)s}}'),
]),
]),
], panelWidth=24, panelHeight=7, startY=34),
),
} else {},
}
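
The dashboard rewrite above moves from grafonnet-lib's builder chain (dashboard.new(...).addRow(...).addPanel(...).addTarget(prometheus.target(...))) to the composable style of the generated grafonnet library. A minimal sketch of the new shape follows; it reuses only calls that appear in the diff, but the import path and the mapping of the file's local aliases (e.g. tsQueryOptions) are assumptions, since those definitions sit outside this hunk:

// Sketch only, not part of the diff: compose a dashboard with the
// generated grafonnet library. The import path assumes the dependency
// declared in the jsonnetfile change below has been vendored with
// jsonnet-bundler.
local grafana = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.panel.row;
local prometheus = grafana.query.prometheus;
// Assumed alias: the mixin's tsQueryOptions most plausibly points here.
local tsQueryOptions = grafana.panel.timeSeries.queryOptions;
// Hypothetical stand-in for the file's preconfigured panels.
local CPUUtilisation = grafana.panel.timeSeries.new('CPU Utilisation');

dashboard.new('Example USE Method')
+ dashboard.withUid(std.md5('example-use.json'))
+ dashboard.withRefresh('30s')
+ dashboard.withPanels(
  grafana.util.grid.makeGrid([
    row.new('CPU')
    + row.withPanels([
      CPUUtilisation + tsQueryOptions.withTargets([
        prometheus.new('$datasource', 'instance:node_cpu_utilisation:rate5m != 0')
        + prometheus.withLegendFormat('{{instance}}'),
      ]),
    ]),
  ], panelWidth=12, panelHeight=7)
)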

@@ -4,20 +4,11 @@
{
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet-lib.git",
"subdir": "grafonnet"
"remote": "https://github.com/grafana/grafonnet.git",
"subdir": "gen/grafonnet-latest"
}
},
"version": "master"
},
{
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet-lib.git",
"subdir": "grafonnet-7.0"
}
},
"version": "master"
"version": "main"
}
],
"legacyImports": false

File diff suppressed because it is too large

go.mod

@@ -18,18 +18,18 @@ require (
github.com/mattn/go-xmlrpc v0.0.3
github.com/mdlayher/ethtool v0.2.0
github.com/mdlayher/netlink v1.7.2
github.com/mdlayher/wifi v0.3.0
github.com/mdlayher/wifi v0.3.1
github.com/opencontainers/selinux v1.11.1
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55
github.com/prometheus-community/go-runit v0.1.0
github.com/prometheus/client_golang v1.20.5
github.com/prometheus/client_model v0.6.1
github.com/prometheus/common v0.60.1
github.com/prometheus/exporter-toolkit v0.13.1
github.com/prometheus/procfs v0.15.1
github.com/safchain/ethtool v0.4.1
github.com/prometheus/common v0.61.0
github.com/prometheus/exporter-toolkit v0.13.2
github.com/prometheus/procfs v0.15.2-0.20240603130017-1754b780536b // == v0.15.1 + https://github.com/prometheus/procfs/commit/1754b780536bb81082baa913e04cc4fff4d2baea
github.com/safchain/ethtool v0.5.9
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0
golang.org/x/sys v0.26.0
golang.org/x/sys v0.28.0
howett.net/plist v1.0.1
)
@@ -51,11 +51,11 @@ require (
github.com/xhit/go-str2duration/v2 v2.1.0 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
golang.org/x/crypto v0.28.0 // indirect
golang.org/x/net v0.29.0 // indirect
golang.org/x/oauth2 v0.23.0 // indirect
golang.org/x/sync v0.8.0 // indirect
golang.org/x/text v0.19.0 // indirect
google.golang.org/protobuf v1.34.2 // indirect
golang.org/x/crypto v0.31.0 // indirect
golang.org/x/net v0.32.0 // indirect
golang.org/x/oauth2 v0.24.0 // indirect
golang.org/x/sync v0.10.0 // indirect
golang.org/x/text v0.21.0 // indirect
google.golang.org/protobuf v1.35.2 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
)

go.sum

@@ -61,8 +61,8 @@ github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U
github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA=
github.com/mdlayher/vsock v1.2.1 h1:pC1mTJTvjo1r9n9fbm7S1j04rCgCzhCOS5DY0zqHlnQ=
github.com/mdlayher/vsock v1.2.1/go.mod h1:NRfCibel++DgeMD8z/hP+PPTjlNJsdPOmxcnENvE+SE=
github.com/mdlayher/wifi v0.3.0 h1:ZfS81w/7xTWBJfhM77K0k6m3sJckwoNOoZUwOW34omo=
github.com/mdlayher/wifi v0.3.0/go.mod h1:/bdkqKYl+lD4recmQM6bTHxMrEUW70reibTyr93CAd0=
github.com/mdlayher/wifi v0.3.1 h1:bZDuMI1f7z5BtUUO3NgHRdR/R88YtywIe6dsEFI0Txs=
github.com/mdlayher/wifi v0.3.1/go.mod h1:ODQaObvsglghTuNhezD9grkTB4shVNc28aJfTXmvSi8=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
@@ -79,48 +79,48 @@ github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+
github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
github.com/prometheus/common v0.60.1 h1:FUas6GcOw66yB/73KC+BOZoFJmbo/1pojoILArPAaSc=
github.com/prometheus/common v0.60.1/go.mod h1:h0LYf1R1deLSKtD4Vdg8gy4RuOvENW2J/h19V5NADQw=
github.com/prometheus/exporter-toolkit v0.13.1 h1:Evsh0gWQo2bdOHlnz9+0Nm7/OFfIwhE2Ws4A2jIlR04=
github.com/prometheus/exporter-toolkit v0.13.1/go.mod h1:ujdv2YIOxtdFxxqtloLpbqmxd5J0Le6IITUvIRSWjj0=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/prometheus/common v0.61.0 h1:3gv/GThfX0cV2lpO7gkTUwZru38mxevy90Bj8YFSRQQ=
github.com/prometheus/common v0.61.0/go.mod h1:zr29OCN/2BsJRaFwG8QOBr41D6kkchKbpeNH7pAjb/s=
github.com/prometheus/exporter-toolkit v0.13.2 h1:Z02fYtbqTMy2i/f+xZ+UK5jy/bl1Ex3ndzh06T/Q9DQ=
github.com/prometheus/exporter-toolkit v0.13.2/go.mod h1:tCqnfx21q6qN1KA4U3Bfb8uWzXfijIrJz3/kTIqMV7g=
github.com/prometheus/procfs v0.15.2-0.20240603130017-1754b780536b h1:4EJkx3vycI+n5JY5ht+bnSUGamkmmXkpcNeO/OBT/0A=
github.com/prometheus/procfs v0.15.2-0.20240603130017-1754b780536b/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/safchain/ethtool v0.4.1 h1:S6mEleTADqgynileXoiapt/nKnatyR6bmIHoF+h2ADo=
github.com/safchain/ethtool v0.4.1/go.mod h1:XLLnZmy4OCRTkksP/UiMjij96YmIsBfmBQcs7H6tA48=
github.com/safchain/ethtool v0.5.9 h1://6RvaOKFf3nQ0rl5+8zBbE4/72455VC9Jq61pfq67E=
github.com/safchain/ethtool v0.5.9/go.mod h1:w8oSsZeowyRaM7xJJBAbubzzrOkwO8TBgPSEqPP/5mg=
github.com/siebenmann/go-kstat v0.0.0-20210513183136-173c9b0a9973 h1:GfSdC6wKfTGcgCS7BtzF5694Amne1pGCSTY252WhlEY=
github.com/siebenmann/go-kstat v0.0.0-20210513183136-173c9b0a9973/go.mod h1:G81aIFAMS9ECrwBYR9YxhlPjWgrItd+Kje78O6+uqm8=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc=
github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU=
go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
golang.org/x/crypto v0.28.0 h1:GBDwsMXVQi34v5CCYUm2jkJvu4cbtru2U4TN2PSyQnw=
golang.org/x/crypto v0.28.0/go.mod h1:rmgy+3RHxRZMyY0jjAJShp2zgEdOqj2AO7U0pYmeQ7U=
golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk=
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY=
golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo=
golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0=
golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs=
golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ=
golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/net v0.32.0 h1:ZqPmj8Kzc+Y6e0+skZsuACbx+wzMgo5MQsJh9Qd6aYI=
golang.org/x/net v0.32.0/go.mod h1:CwU0IoeOlnQQWJ6ioyFrfRuomB8GKF6KbYXZVyeXNfs=
golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE=
golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20211031064116-611d5d643895/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo=
golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=
golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg=
google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw=
golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/sys v0.28.0 h1:Fksou7UEQUWlKvIdsqzJmUmCX3cZuD2+P3XyyzwMhlA=
golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo=
golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ=
google.golang.org/protobuf v1.35.2 h1:8Ar7bF+apOIoThw1EdZl0p1oWvMqTHmpA2fRTyZO8io=
google.golang.org/protobuf v1.35.2/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=