Add new collector and metrics for watchdog (#2309) (#2880)

Signed-off-by: Gavin Lam <gavin.oss@tutamail.com>
This commit is contained in:
Gavin Lam 2024-03-09 04:00:06 -05:00 committed by GitHub
parent 5e412a689a
commit 95efb86f6b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 348 additions and 0 deletions

View file

@ -204,6 +204,7 @@ softirqs | Exposes detailed softirq statistics from `/proc/softirqs`. | Linux
sysctl | Expose sysctl values from `/proc/sys`. Use `--collector.sysctl.include(-info)` to configure. | Linux
systemd | Exposes service and system status from [systemd](http://www.freedesktop.org/wiki/Software/systemd/). | Linux
tcpstat | Exposes TCP connection status information from `/proc/net/tcp` and `/proc/net/tcp6`. (Warning: the current version has potential performance issues in high load situations.) | Linux
watchdog | Exposes statistics from `/sys/class/watchdog` | Linux
wifi | Exposes WiFi device and station statistics. | Linux
xfrm | Exposes statistics from `/proc/net/xfrm_stat` | Linux
zoneinfo | Exposes NUMA memory zone metrics. | Linux

View file

@ -2945,6 +2945,7 @@ node_scrape_collector_success{collector="thermal_zone"} 1
node_scrape_collector_success{collector="time"} 1 node_scrape_collector_success{collector="time"} 1
node_scrape_collector_success{collector="udp_queues"} 1 node_scrape_collector_success{collector="udp_queues"} 1
node_scrape_collector_success{collector="vmstat"} 1 node_scrape_collector_success{collector="vmstat"} 1
node_scrape_collector_success{collector="watchdog"} 1
node_scrape_collector_success{collector="wifi"} 1 node_scrape_collector_success{collector="wifi"} 1
node_scrape_collector_success{collector="xfrm"} 1 node_scrape_collector_success{collector="xfrm"} 1
node_scrape_collector_success{collector="xfs"} 1 node_scrape_collector_success{collector="xfs"} 1
@ -3218,6 +3219,31 @@ node_vmstat_pswpin 1476
# HELP node_vmstat_pswpout /proc/vmstat information field pswpout. # HELP node_vmstat_pswpout /proc/vmstat information field pswpout.
# TYPE node_vmstat_pswpout untyped # TYPE node_vmstat_pswpout untyped
node_vmstat_pswpout 35045 node_vmstat_pswpout 35045
# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
# TYPE node_watchdog_access_cs0 gauge
node_watchdog_access_cs0{name="watchdog0"} 0
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
# TYPE node_watchdog_bootstatus gauge
node_watchdog_bootstatus{name="watchdog0"} 1
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
# TYPE node_watchdog_fw_version gauge
node_watchdog_fw_version{name="watchdog0"} 2
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
# TYPE node_watchdog_info gauge
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
# TYPE node_watchdog_nowayout gauge
node_watchdog_nowayout{name="watchdog0"} 0
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
# TYPE node_watchdog_pretimeout_seconds gauge
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
# TYPE node_watchdog_timeleft_seconds gauge
node_watchdog_timeleft_seconds{name="watchdog0"} 300
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
# TYPE node_watchdog_timeout_seconds gauge
node_watchdog_timeout_seconds{name="watchdog0"} 60
# HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz. # HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz.
# TYPE node_wifi_interface_frequency_hertz gauge # TYPE node_wifi_interface_frequency_hertz gauge
node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09 node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09

View file

@ -2967,6 +2967,7 @@ node_scrape_collector_success{collector="thermal_zone"} 1
node_scrape_collector_success{collector="time"} 1 node_scrape_collector_success{collector="time"} 1
node_scrape_collector_success{collector="udp_queues"} 1 node_scrape_collector_success{collector="udp_queues"} 1
node_scrape_collector_success{collector="vmstat"} 1 node_scrape_collector_success{collector="vmstat"} 1
node_scrape_collector_success{collector="watchdog"} 1
node_scrape_collector_success{collector="wifi"} 1 node_scrape_collector_success{collector="wifi"} 1
node_scrape_collector_success{collector="xfrm"} 1 node_scrape_collector_success{collector="xfrm"} 1
node_scrape_collector_success{collector="xfs"} 1 node_scrape_collector_success{collector="xfs"} 1
@ -3240,6 +3241,31 @@ node_vmstat_pswpin 1476
# HELP node_vmstat_pswpout /proc/vmstat information field pswpout. # HELP node_vmstat_pswpout /proc/vmstat information field pswpout.
# TYPE node_vmstat_pswpout untyped # TYPE node_vmstat_pswpout untyped
node_vmstat_pswpout 35045 node_vmstat_pswpout 35045
# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
# TYPE node_watchdog_access_cs0 gauge
node_watchdog_access_cs0{name="watchdog0"} 0
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
# TYPE node_watchdog_bootstatus gauge
node_watchdog_bootstatus{name="watchdog0"} 1
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
# TYPE node_watchdog_fw_version gauge
node_watchdog_fw_version{name="watchdog0"} 2
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
# TYPE node_watchdog_info gauge
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
# TYPE node_watchdog_nowayout gauge
node_watchdog_nowayout{name="watchdog0"} 0
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
# TYPE node_watchdog_pretimeout_seconds gauge
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
# TYPE node_watchdog_timeleft_seconds gauge
node_watchdog_timeleft_seconds{name="watchdog0"} 300
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
# TYPE node_watchdog_timeout_seconds gauge
node_watchdog_timeout_seconds{name="watchdog0"} 60
# HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz. # HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz.
# TYPE node_wifi_interface_frequency_hertz gauge # TYPE node_wifi_interface_frequency_hertz gauge
node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09 node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09

View file

@ -1717,6 +1717,75 @@ SymlinkTo: ../../devices/virtual/thermal/cooling_device0
Path: sys/class/thermal/thermal_zone0 Path: sys/class/thermal/thermal_zone0
SymlinkTo: ../../devices/virtual/thermal/thermal_zone0 SymlinkTo: ../../devices/virtual/thermal/thermal_zone0
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/watchdog
Mode: 775
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/watchdog/watchdog0
Mode: 775
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/access_cs0
Lines: 1
0EOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/bootstatus
Lines: 1
1EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/fw_version
Lines: 1
2EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/identity
Lines: 1
Software WatchdogEOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/nowayout
Lines: 1
0EOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/options
Lines: 1
0x8380EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/pretimeout
Lines: 1
120EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/pretimeout_governor
Lines: 1
noopEOF
Mode: 644
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/state
Lines: 1
activeEOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/status
Lines: 1
0x8000EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/timeleft
Lines: 1
300EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Path: sys/class/watchdog/watchdog0/timeout
Lines: 1
60EOF
Mode: 444
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/class/watchdog/watchdog1
Mode: 775
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Directory: sys/devices Directory: sys/devices
Mode: 755 Mode: 755
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

133
collector/watchdog.go Normal file
View file

@ -0,0 +1,133 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build linux && !nowatchdog
// +build linux,!nowatchdog
package collector
import (
"fmt"
"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs/sysfs"
)
// watchdogCollector is the Collector that exports the per-device metrics
// obtained from the sysfs watchdog class.
type watchdogCollector struct {
// fs is the sysfs handle opened by NewWatchdogCollector; Update queries it
// for the watchdog class on every call.
fs sysfs.FS
// logger is the logger handed to NewWatchdogCollector.
logger log.Logger
}
// init registers the watchdog collector under the name "watchdog" with the
// defaultDisabled state, using NewWatchdogCollector as its factory.
func init() {
	const collectorName = "watchdog"
	registerCollector(collectorName, defaultDisabled, NewWatchdogCollector)
}
// NewWatchdogCollector returns a new Collector exposing watchdog stats.
//
// It opens the sysfs tree rooted at *sysPath and stores the resulting handle
// together with the supplied logger. If the sysfs root cannot be opened, a nil
// Collector and an error wrapping the cause are returned.
func NewWatchdogCollector(logger log.Logger) (Collector, error) {
	fs, err := sysfs.NewFS(*sysPath)
	if err != nil {
		// The handle being opened is sysfs (sysfs.NewFS), not procfs, so the
		// message names the filesystem that actually failed.
		return nil, fmt.Errorf("failed to open sysfs: %w", err)
	}

	return &watchdogCollector{
		fs:     fs,
		logger: logger,
	}, nil
}
// watchdogDesc builds the descriptor of one watchdog metric: the metric name
// is qualified with the package namespace and the "watchdog" subsystem, and no
// constant labels are attached.
func watchdogDesc(metric, help string, labels ...string) *prometheus.Desc {
	fqName := prometheus.BuildFQName(namespace, "watchdog", metric)
	return prometheus.NewDesc(fqName, help, labels, nil)
}

// Descriptors of the metrics emitted by watchdogCollector.Update. Every
// numeric metric carries a single "name" label; the info metric additionally
// carries the textual attributes of the device as labels.
var (
	watchdogBootstatusDesc = watchdogDesc(
		"bootstatus",
		"Value of /sys/class/watchdog/<watchdog>/bootstatus",
		"name",
	)
	watchdogFwVersionDesc = watchdogDesc(
		"fw_version",
		"Value of /sys/class/watchdog/<watchdog>/fw_version",
		"name",
	)
	watchdogNowayoutDesc = watchdogDesc(
		"nowayout",
		"Value of /sys/class/watchdog/<watchdog>/nowayout",
		"name",
	)
	watchdogTimeleftDesc = watchdogDesc(
		"timeleft_seconds",
		"Value of /sys/class/watchdog/<watchdog>/timeleft",
		"name",
	)
	watchdogTimeoutDesc = watchdogDesc(
		"timeout_seconds",
		"Value of /sys/class/watchdog/<watchdog>/timeout",
		"name",
	)
	watchdogPretimeoutDesc = watchdogDesc(
		"pretimeout_seconds",
		"Value of /sys/class/watchdog/<watchdog>/pretimeout",
		"name",
	)
	watchdogAccessCs0Desc = watchdogDesc(
		"access_cs0",
		"Value of /sys/class/watchdog/<watchdog>/access_cs0",
		"name",
	)
	watchdogInfoDesc = watchdogDesc(
		"info",
		"Info of /sys/class/watchdog/<watchdog>",
		"name", "options", "identity", "state", "status", "pretimeout_governor",
	)
)
// toLabelValue converts an optional string into a label value: the pointed-to
// string when ptr is set, the empty string when ptr is nil.
func toLabelValue(ptr *string) string {
	var value string
	if ptr != nil {
		value = *ptr
	}
	return value
}
// Update reads the watchdog class from sysfs and sends the metrics of every
// watchdog device to ch.
//
// A numeric attribute is exported as a gauge only when its pointer is set.
// The info metric is always sent with value 1; textual attributes that are
// missing appear as empty label values. The error of the sysfs lookup is
// returned unchanged, in which case nothing has been sent.
func (c *watchdogCollector) Update(ch chan<- prometheus.Metric) error {
	devices, err := c.fs.WatchdogClass()
	if err != nil {
		return err
	}

	// gauge sends one single-label ("name") gauge sample to ch.
	gauge := func(desc *prometheus.Desc, value float64, name string) {
		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, value, name)
	}

	for _, dev := range devices {
		if v := dev.Bootstatus; v != nil {
			gauge(watchdogBootstatusDesc, float64(*v), dev.Name)
		}
		if v := dev.FwVersion; v != nil {
			gauge(watchdogFwVersionDesc, float64(*v), dev.Name)
		}
		if v := dev.Nowayout; v != nil {
			gauge(watchdogNowayoutDesc, float64(*v), dev.Name)
		}
		if v := dev.Timeleft; v != nil {
			gauge(watchdogTimeleftDesc, float64(*v), dev.Name)
		}
		if v := dev.Timeout; v != nil {
			gauge(watchdogTimeoutDesc, float64(*v), dev.Name)
		}
		if v := dev.Pretimeout; v != nil {
			gauge(watchdogPretimeoutDesc, float64(*v), dev.Name)
		}
		if v := dev.AccessCs0; v != nil {
			gauge(watchdogAccessCs0Desc, float64(*v), dev.Name)
		}

		// Label values follow the order declared on watchdogInfoDesc.
		infoLabels := []string{
			dev.Name,
			toLabelValue(dev.Options),
			toLabelValue(dev.Identity),
			toLabelValue(dev.State),
			toLabelValue(dev.Status),
			toLabelValue(dev.PretimeoutGovernor),
		}
		ch <- prometheus.MustNewConstMetric(watchdogInfoDesc, prometheus.GaugeValue, 1.0, infoLabels...)
	}

	return nil
}

View file

@ -0,0 +1,92 @@
// Copyright 2023 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !nowatchdog
// +build !nowatchdog
package collector
import (
"fmt"
"os"
"strings"
"testing"
"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
)
// testWatchdogCollector wraps a Collector and provides the Collect and
// Describe methods so that it can be registered with a prometheus registry
// in tests.
type testWatchdogCollector struct {
// wc is the wrapped collector whose Update method produces the metrics.
wc Collector
}
// Collect forwards ch to the wrapped collector's Update method so that the
// registry receives the watchdog metrics.
func (c testWatchdogCollector) Collect(ch chan<- prometheus.Metric) {
	// Collect has no error return, so the result of Update is deliberately
	// discarded here.
	_ = c.wc.Update(ch)
}
// Describe hands the collector to prometheus.DescribeByCollect together with
// ch instead of listing its descriptors by hand.
func (c testWatchdogCollector) Describe(ch chan<- *prometheus.Desc) {
	var collector prometheus.Collector = c
	prometheus.DescribeByCollect(collector, ch)
}
// TestWatchdogStats runs the watchdog collector against the sysfs fixture
// under fixtures/sys and compares the gathered metrics with the expected
// text exposition below.
//
// Update is additionally invoked directly on a throw-away channel. That
// channel is drained so the producing goroutine always terminates, and its
// error is reported through a dedicated channel rather than a variable shared
// with the test goroutine.
func TestWatchdogStats(t *testing.T) {
	testcase := `# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
# TYPE node_watchdog_access_cs0 gauge
node_watchdog_access_cs0{name="watchdog0"} 0
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
# TYPE node_watchdog_bootstatus gauge
node_watchdog_bootstatus{name="watchdog0"} 1
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
# TYPE node_watchdog_fw_version gauge
node_watchdog_fw_version{name="watchdog0"} 2
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
# TYPE node_watchdog_info gauge
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
# TYPE node_watchdog_nowayout gauge
node_watchdog_nowayout{name="watchdog0"} 0
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
# TYPE node_watchdog_pretimeout_seconds gauge
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
# TYPE node_watchdog_timeleft_seconds gauge
node_watchdog_timeleft_seconds{name="watchdog0"} 300
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
# TYPE node_watchdog_timeout_seconds gauge
node_watchdog_timeout_seconds{name="watchdog0"} 60
`
	*sysPath = "fixtures/sys"

	logger := log.NewLogfmtLogger(os.Stderr)
	c, err := NewWatchdogCollector(logger)
	if err != nil {
		t.Fatal(err)
	}
	reg := prometheus.NewRegistry()
	reg.MustRegister(&testWatchdogCollector{wc: c})

	// Call Update directly. The channel is unbuffered, so it has to be drained
	// or the goroutine would block forever on its first send.
	sink := make(chan prometheus.Metric)
	updateErr := make(chan error, 1)
	go func() {
		defer close(sink)
		updateErr <- c.Update(sink)
	}()
	for range sink {
		// Discard: the comparison below uses the metrics gathered by reg.
	}
	if err := <-updateErr; err != nil {
		t.Fatal(fmt.Errorf("failed to update collector: %s", err))
	}

	err = testutil.GatherAndCompare(reg, strings.NewReader(testcase))
	if err != nil {
		t.Fatal(err)
	}
}

View file

@ -49,6 +49,7 @@ enabled_collectors=$(cat << COLLECTORS
thermal_zone thermal_zone
udp_queues udp_queues
vmstat vmstat
watchdog
wifi wifi
xfrm xfrm
xfs xfs