Expose cpu bugs and flags as info metrics. (#1788)

* Expose cpu bugs and flags as info metrics with a regexp filter.
* Automatically enable CPU info metrics when using flags or bugs feature.

Signed-off-by: domgoer <domdoumc@gmail.com>
This commit is contained in:
domchan 2020-07-18 00:32:23 +08:00 committed by GitHub
parent f4b89c79a2
commit 503e4fc848
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 98 additions and 2 deletions

View file

@ -18,6 +18,7 @@ package collector
import (
"fmt"
"path/filepath"
"regexp"
"strconv"
"sync"
@ -32,16 +33,23 @@ type cpuCollector struct {
fs procfs.FS
cpu *prometheus.Desc
cpuInfo *prometheus.Desc
cpuFlagsInfo *prometheus.Desc
cpuBugsInfo *prometheus.Desc
cpuGuest *prometheus.Desc
cpuCoreThrottle *prometheus.Desc
cpuPackageThrottle *prometheus.Desc
logger log.Logger
cpuStats []procfs.CPUStat
cpuStatsMutex sync.Mutex
cpuFlagsIncludeRegexp *regexp.Regexp
cpuBugsIncludeRegexp *regexp.Regexp
}
// Command-line flags controlling the CPU info metrics. Each value is a
// pointer that is dereferenced when the collector is constructed/used.
var (
// enableCPUInfo backs --collector.cpu.info. compileIncludeFlags forces it to
// true when either of the include filters below is non-empty.
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
// flagsInclude backs --collector.cpu.info.flags-include: a regular expression
// matched against each entry of the cpuinfo `flags` field. When empty, no
// filter is compiled and no flag info metrics are emitted.
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
// bugsInclude backs --collector.cpu.info.bugs-include: a regular expression
// matched against each entry of the cpuinfo `bugs` field. When empty, no
// filter is compiled and no bug info metrics are emitted.
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
)
func init() {
@ -54,7 +62,7 @@ func NewCPUCollector(logger log.Logger) (Collector, error) {
if err != nil {
return nil, fmt.Errorf("failed to open procfs: %w", err)
}
return &cpuCollector{
c := &cpuCollector{
fs: fs,
cpu: nodeCPUSecondsDesc,
cpuInfo: prometheus.NewDesc(
@ -62,6 +70,16 @@ func NewCPUCollector(logger log.Logger) (Collector, error) {
"CPU information from /proc/cpuinfo.",
[]string{"package", "core", "cpu", "vendor", "family", "model", "model_name", "microcode", "stepping", "cachesize"}, nil,
),
cpuFlagsInfo: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "flag_info"),
"The `flags` field of CPU information from /proc/cpuinfo.",
[]string{"flag"}, nil,
),
cpuBugsInfo: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "bug_info"),
"The `bugs` field of CPU information from /proc/cpuinfo.",
[]string{"bug"}, nil,
),
cpuGuest: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "guest_seconds_total"),
"Seconds the cpus spent in guests (VMs) for each mode.",
@ -78,7 +96,34 @@ func NewCPUCollector(logger log.Logger) (Collector, error) {
[]string{"package"}, nil,
),
logger: logger,
}, nil
}
err = c.compileIncludeFlags(flagsInclude, bugsInclude)
if err != nil {
return nil, fmt.Errorf("fail to compile --collector.cpu.info.flags-include and --collector.cpu.info.bugs-include, the values of them must be regular expressions: %w", err)
}
return c, nil
}
// compileIncludeFlags compiles the optional include patterns for the cpuinfo
// `flags` and `bugs` fields and stores them on the collector.
//
// If either pattern is non-empty while --collector.cpu.info is off, the
// option is switched on and an info-level message is logged. An empty pattern
// leaves the corresponding regexp untouched. The flags pattern is compiled
// before the bugs pattern, and the first compilation error is returned as-is.
func (c *cpuCollector) compileIncludeFlags(flagsIncludeFlag, bugsIncludeFlag *string) error {
	flagsPattern, bugsPattern := *flagsIncludeFlag, *bugsIncludeFlag

	// Filtering only makes sense when the CPU info metrics are enabled.
	filterRequested := flagsPattern != "" || bugsPattern != ""
	if filterRequested && !*enableCPUInfo {
		*enableCPUInfo = true
		level.Info(c.logger).Log("msg", "--collector.cpu.info has been set to `true` because you set the following flags, like --collector.cpu.info.flags-include and --collector.cpu.info.bugs-include")
	}

	targets := []struct {
		pattern string
		dst     **regexp.Regexp
	}{
		{flagsPattern, &c.cpuFlagsIncludeRegexp},
		{bugsPattern, &c.cpuBugsIncludeRegexp},
	}
	for _, t := range targets {
		if t.pattern == "" {
			continue
		}
		re, compileErr := regexp.Compile(t.pattern)
		// Store the result before checking the error, so the field is
		// overwritten even when compilation fails.
		*t.dst = re
		if compileErr != nil {
			return compileErr
		}
	}
	return nil
}
// Update implements Collector and exposes cpu related metrics from /proc/stat and /sys/.../cpu/.
@ -117,6 +162,31 @@ func (c *cpuCollector) updateInfo(ch chan<- prometheus.Metric) error {
cpu.Microcode,
cpu.Stepping,
cpu.CacheSize)
if err := updateFieldInfo(cpu.Flags, c.cpuFlagsIncludeRegexp, c.cpuFlagsInfo, ch); err != nil {
return err
}
if err := updateFieldInfo(cpu.Bugs, c.cpuBugsIncludeRegexp, c.cpuBugsInfo, ch); err != nil {
return err
}
}
return nil
}
// updateFieldInfo sends one gauge metric with value 1 on ch for every entry
// of valueList that matches filter, using the entry as the single label value
// of desc. A nil filter means nothing is emitted. The returned error is
// always nil.
func updateFieldInfo(valueList []string, filter *regexp.Regexp, desc *prometheus.Desc, ch chan<- prometheus.Metric) error {
	if filter != nil {
		for i := range valueList {
			if name := valueList[i]; filter.MatchString(name) {
				ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, 1, name)
			}
		}
	}
	return nil
}

View file

@ -184,12 +184,24 @@ node_cooling_device_cur_state{name="0",type="Processor"} 0
# HELP node_cooling_device_max_state Maximum throttle state of the cooling device
# TYPE node_cooling_device_max_state gauge
node_cooling_device_max_state{name="0",type="Processor"} 3
# HELP node_cpu_bug_info The `bugs` field of CPU information from /proc/cpuinfo.
# TYPE node_cpu_bug_info gauge
node_cpu_bug_info{bug="cpu_meltdown"} 1
node_cpu_bug_info{bug="mds"} 1
node_cpu_bug_info{bug="spectre_v1"} 1
node_cpu_bug_info{bug="spectre_v2"} 1
# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled.
# TYPE node_cpu_core_throttles_total counter
node_cpu_core_throttles_total{core="0",package="0"} 5
node_cpu_core_throttles_total{core="0",package="1"} 0
node_cpu_core_throttles_total{core="1",package="0"} 0
node_cpu_core_throttles_total{core="1",package="1"} 9
# HELP node_cpu_flag_info The `flags` field of CPU information from /proc/cpuinfo.
# TYPE node_cpu_flag_info gauge
node_cpu_flag_info{flag="aes"} 1
node_cpu_flag_info{flag="avx"} 1
node_cpu_flag_info{flag="avx2"} 1
node_cpu_flag_info{flag="constant_tsc"} 1
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
# TYPE node_cpu_guest_seconds_total counter
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01

View file

@ -232,12 +232,24 @@ node_cooling_device_cur_state{name="0",type="Processor"} 0
# HELP node_cooling_device_max_state Maximum throttle state of the cooling device
# TYPE node_cooling_device_max_state gauge
node_cooling_device_max_state{name="0",type="Processor"} 3
# HELP node_cpu_bug_info The `bugs` field of CPU information from /proc/cpuinfo.
# TYPE node_cpu_bug_info gauge
node_cpu_bug_info{bug="cpu_meltdown"} 1
node_cpu_bug_info{bug="mds"} 1
node_cpu_bug_info{bug="spectre_v1"} 1
node_cpu_bug_info{bug="spectre_v2"} 1
# HELP node_cpu_core_throttles_total Number of times this cpu core has been throttled.
# TYPE node_cpu_core_throttles_total counter
node_cpu_core_throttles_total{core="0",package="0"} 5
node_cpu_core_throttles_total{core="0",package="1"} 0
node_cpu_core_throttles_total{core="1",package="0"} 0
node_cpu_core_throttles_total{core="1",package="1"} 9
# HELP node_cpu_flag_info The `flags` field of CPU information from /proc/cpuinfo.
# TYPE node_cpu_flag_info gauge
node_cpu_flag_info{flag="aes"} 1
node_cpu_flag_info{flag="avx"} 1
node_cpu_flag_info{flag="avx2"} 1
node_cpu_flag_info{flag="constant_tsc"} 1
# HELP node_cpu_guest_seconds_total Seconds the cpus spent in guests (VMs) for each mode.
# TYPE node_cpu_guest_seconds_total counter
node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0.01

View file

@ -107,6 +107,8 @@ fi
--collector.qdisc.fixtures="collector/fixtures/qdisc/" \
--collector.netclass.ignored-devices="(bond0|dmz|int)" \
--collector.cpu.info \
--collector.cpu.info.flags-include="^(aes|avx.?|constant_tsc)$" \
--collector.cpu.info.bugs-include="^(cpu_meltdown|spectre_.*|mds)$" \
--web.listen-address "127.0.0.1:${port}" \
--log.level="debug" > "${tmpdir}/node_exporter.log" 2>&1 &