node_exporter/collector/cpu_linux.go
Karsten Weiss efc1fdb6d0 cpu: Add a 2nd label 'package' to metric node_cpu_core_throttles_total (#871)
* cpu: Add a 2nd label 'package' to metric node_cpu_core_throttles_total

This commit fixes the node_cpu_core_throttles_total metrics on
multi-socket systems as the core_ids are the same for each package.
I.e. we need to count them seperately.

Rename the node_package_throttles_total metric label `node` to `package`.

Reorganize the sys.ttar archive and use the same symlinks as the Linux
kernel. Also, the new fixtures now use a dual-socket dual-core cpu w/o
HT/SMT (node0: cpu0+1, node1: cpu2+3) as well as processor-less
(memory-only) NUMA node 'node2' (this is a very rare case).

Signed-off-by: Karsten Weiss <knweiss@gmail.com>

* cpu: Use the direct /sys path to the cpu files.

Use the direct path /sys/devices/system/cpu/cpu[0-9]* (without symlinks)
instead of /sys/bus/cpu/devices/cpu[0-9]*.

The latter path also does not exist e.g. on RHEL 6.9's kernel.

Signed-off-by: Karsten Weiss <knweiss@gmail.com>

* cpu: Reverse core+package throttle processing order

Signed-off-by: Karsten Weiss <knweiss@gmail.com>

* cpu: Add documentation URLs

Signed-off-by: Karsten Weiss <knweiss@gmail.com>
2018-04-09 18:01:52 +02:00

229 lines
8.4 KiB
Go

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !nocpu
package collector
import (
"fmt"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
"github.com/prometheus/procfs"
)
var (
digitRegexp = regexp.MustCompile("[0-9]+")
)
type cpuCollector struct {
cpu *prometheus.Desc
cpuGuest *prometheus.Desc
cpuFreq *prometheus.Desc
cpuFreqMin *prometheus.Desc
cpuFreqMax *prometheus.Desc
cpuCoreThrottle *prometheus.Desc
cpuPackageThrottle *prometheus.Desc
}
func init() {
registerCollector("cpu", defaultEnabled, NewCPUCollector)
}
// NewCPUCollector returns a new Collector exposing kernel/system statistics.
func NewCPUCollector() (Collector, error) {
return &cpuCollector{
cpu: nodeCpuSecondsDesc,
cpuGuest: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "guest_seconds_total"),
"Seconds the cpus spent in guests (VMs) for each mode.",
[]string{"cpu", "mode"}, nil,
),
cpuFreq: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "frequency_hertz"),
"Current cpu thread frequency in hertz.",
[]string{"cpu"}, nil,
),
cpuFreqMin: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "frequency_min_hertz"),
"Minimum cpu thread frequency in hertz.",
[]string{"cpu"}, nil,
),
cpuFreqMax: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "frequency_max_hertz"),
"Maximum cpu thread frequency in hertz.",
[]string{"cpu"}, nil,
),
cpuCoreThrottle: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "core_throttles_total"),
"Number of times this cpu core has been throttled.",
[]string{"package", "core"}, nil,
),
cpuPackageThrottle: prometheus.NewDesc(
prometheus.BuildFQName(namespace, cpuCollectorSubsystem, "package_throttles_total"),
"Number of times this cpu package has been throttled.",
[]string{"package"}, nil,
),
}, nil
}
// Update implements Collector and exposes cpu related metrics from /proc/stat and /sys/.../cpu/.
func (c *cpuCollector) Update(ch chan<- prometheus.Metric) error {
if err := c.updateStat(ch); err != nil {
return err
}
if err := c.updateCPUfreq(ch); err != nil {
return err
}
return nil
}
// updateCPUfreq reads /sys/devices/system/cpu/cpu* and expose cpu frequency statistics.
func (c *cpuCollector) updateCPUfreq(ch chan<- prometheus.Metric) error {
cpus, err := filepath.Glob(sysFilePath("devices/system/cpu/cpu[0-9]*"))
if err != nil {
return err
}
var value uint64
packageThrottles := make(map[uint64]uint64)
packageCoreThrottles := make(map[uint64]map[uint64]uint64)
// cpu loop
for _, cpu := range cpus {
_, cpuName := filepath.Split(cpu)
cpuNum := strings.TrimPrefix(cpuName, "cpu")
if _, err := os.Stat(filepath.Join(cpu, "cpufreq")); os.IsNotExist(err) {
log.Debugf("CPU %v is missing cpufreq", cpu)
} else {
// sysfs cpufreq values are kHz, thus multiply by 1000 to export base units (hz).
// See https://www.kernel.org/doc/Documentation/cpu-freq/user-guide.txt
if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq", "scaling_cur_freq")); err != nil {
return err
}
ch <- prometheus.MustNewConstMetric(c.cpuFreq, prometheus.GaugeValue, float64(value)*1000.0, cpuNum)
if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq", "scaling_min_freq")); err != nil {
return err
}
ch <- prometheus.MustNewConstMetric(c.cpuFreqMin, prometheus.GaugeValue, float64(value)*1000.0, cpuNum)
if value, err = readUintFromFile(filepath.Join(cpu, "cpufreq", "scaling_max_freq")); err != nil {
return err
}
ch <- prometheus.MustNewConstMetric(c.cpuFreqMax, prometheus.GaugeValue, float64(value)*1000.0, cpuNum)
}
// See
// https://www.kernel.org/doc/Documentation/x86/topology.txt
// https://www.kernel.org/doc/Documentation/cputopology.txt
// https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu
var err error
var physicalPackageID, coreID uint64
// topology/physical_package_id
if physicalPackageID, err = readUintFromFile(filepath.Join(cpu, "topology", "physical_package_id")); err != nil {
log.Debugf("CPU %v is missing physical_package_id", cpu)
continue
}
// topology/core_id
if coreID, err = readUintFromFile(filepath.Join(cpu, "topology", "core_id")); err != nil {
log.Debugf("CPU %v is missing core_id", cpu)
continue
}
// metric node_cpu_core_throttles_total
//
// We process this metric before the package throttles as there
// are cpu+kernel combinations that only present core throttles
// but no package throttles.
// Seen e.g. on an Intel Xeon E5472 system with RHEL 6.9 kernel.
if _, present := packageCoreThrottles[physicalPackageID]; !present {
packageCoreThrottles[physicalPackageID] = make(map[uint64]uint64)
}
if _, present := packageCoreThrottles[physicalPackageID][coreID]; !present {
// Read thermal_throttle/core_throttle_count only once
if coreThrottleCount, err := readUintFromFile(filepath.Join(cpu, "thermal_throttle", "core_throttle_count")); err == nil {
packageCoreThrottles[physicalPackageID][coreID] = coreThrottleCount
} else {
log.Debugf("CPU %v is missing core_throttle_count", cpu)
}
}
// metric node_cpu_package_throttles_total
if _, present := packageThrottles[physicalPackageID]; !present {
// Read thermal_throttle/package_throttle_count only once
if packageThrottleCount, err := readUintFromFile(filepath.Join(cpu, "thermal_throttle", "package_throttle_count")); err == nil {
packageThrottles[physicalPackageID] = packageThrottleCount
} else {
log.Debugf("CPU %v is missing package_throttle_count", cpu)
}
}
}
for physicalPackageID, packageThrottleCount := range packageThrottles {
ch <- prometheus.MustNewConstMetric(c.cpuPackageThrottle,
prometheus.CounterValue,
float64(packageThrottleCount),
strconv.FormatUint(physicalPackageID, 10))
}
for physicalPackageID, core_map := range packageCoreThrottles {
for coreID, coreThrottleCount := range core_map {
ch <- prometheus.MustNewConstMetric(c.cpuCoreThrottle,
prometheus.CounterValue,
float64(coreThrottleCount),
strconv.FormatUint(physicalPackageID, 10),
strconv.FormatUint(coreID, 10))
}
}
return nil
}
// updateStat reads /proc/stat through procfs and exports cpu related metrics.
func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
fs, err := procfs.NewFS(*procPath)
if err != nil {
return fmt.Errorf("failed to open procfs: %v", err)
}
stats, err := fs.NewStat()
if err != nil {
return err
}
for cpuID, cpuStat := range stats.CPU {
cpuNum := fmt.Sprintf("%d", cpuID)
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.User, cpuNum, "user")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.Nice, cpuNum, "nice")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.System, cpuNum, "system")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.Idle, cpuNum, "idle")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.Iowait, cpuNum, "iowait")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.IRQ, cpuNum, "irq")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.SoftIRQ, cpuNum, "softirq")
ch <- prometheus.MustNewConstMetric(c.cpu, prometheus.CounterValue, cpuStat.Steal, cpuNum, "steal")
// Guest CPU is also accounted for in cpuStat.User and cpuStat.Nice, expose these as separate metrics.
ch <- prometheus.MustNewConstMetric(c.cpuGuest, prometheus.CounterValue, cpuStat.Guest, cpuNum, "user")
ch <- prometheus.MustNewConstMetric(c.cpuGuest, prometheus.CounterValue, cpuStat.GuestNice, cpuNum, "nice")
}
return nil
}