Fix num cpu (#1561)

* add a map of profilers to CPUids

`runtime.NumCPU()` returns the number of CPUs that the process can run
on. This number does not necessarily correlate to CPU ids if the
affinity mask of the process is set.

This change maintains the current behavior as default, but also allows
the user to specify a range of CPUids to use instead.

The CPU id is stored as the value of a map keyed on the profiler
object's address.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Daniel Hodges <hodges.daniel.scott@gmail.com>
Signed-off-by: Daniel Hodges <hodges@uber.com>

Co-authored-by: jdamato-fsly <55214354+jdamato-fsly@users.noreply.github.com>
This commit is contained in:
Daniel Hodges 2020-02-20 05:36:33 -05:00 committed by GitHub
parent b40954dce5
commit ec62141388
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 213 additions and 34 deletions

View file

@ -14,6 +14,7 @@
### Changes
* [ENHANCEMENT] Add `--collector.perf.cpus` to allow setting the CPU list for perf stats.
* [CHANGE] Add `--collector.netdev.device-whitelist`. #1279
* [CHANGE] Refactor mdadm collector #1403
* [CHANGE] Add `mountaddr` label to NFS metrics. #1417

View file

@ -84,6 +84,18 @@ Depending on the configured value different metrics will be available, for most
cases `0` will provide the most complete set. For more information see [`man 2
perf_event_open`](http://man7.org/linux/man-pages/man2/perf_event_open.2.html).
By default, the perf collector will only collect metrics of the CPUs that
`node_exporter` is running on (i.e.
[`runtime.NumCPU`](https://golang.org/pkg/runtime/#NumCPU)). If this is
insufficient (e.g. if you run `node_exporter` with its CPU affinity set to
specific CPUs), you can specify a list of alternate CPUs by using the
`--collector.perf.cpus` flag. For example, to collect metrics on CPUs 2-6, you
would specify: `--collector.perf --collector.perf.cpus=2-6`. The CPU
configuration is zero indexed and can also take a stride value; for example,
`--collector.perf --collector.perf.cpus=10-20:5` would collect on CPUs
10, 15, and 20.
Name | Description | OS
---------|-------------|----
buddyinfo | Exposes statistics of memory fragments as reported by /proc/buddyinfo. | Linux

View file

@ -14,18 +14,25 @@
package collector
import (
"fmt"
"runtime"
"strconv"
"strings"
"github.com/go-kit/kit/log"
"github.com/hodgesds/perf-utils"
"github.com/prometheus/client_golang/prometheus"
kingpin "gopkg.in/alecthomas/kingpin.v2"
)
const (
perfSubsystem = "perf"
)
var (
perfCPUsFlag = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String()
)
func init() {
registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector)
}
@ -36,40 +43,123 @@ func init() {
// settings not all profiler values may be exposed on the target system at any
// given time.
type perfCollector struct {
perfHwProfilers map[int]perf.HardwareProfiler
perfSwProfilers map[int]perf.SoftwareProfiler
perfCacheProfilers map[int]perf.CacheProfiler
desc map[string]*prometheus.Desc
logger log.Logger
hwProfilerCPUMap map[*perf.HardwareProfiler]int
swProfilerCPUMap map[*perf.SoftwareProfiler]int
cacheProfilerCPUMap map[*perf.CacheProfiler]int
perfHwProfilers map[int]*perf.HardwareProfiler
perfSwProfilers map[int]*perf.SoftwareProfiler
perfCacheProfilers map[int]*perf.CacheProfiler
desc map[string]*prometheus.Desc
logger log.Logger
}
// perfCPUFlagToCPUs returns the set of CPU ids for the perf collectors to
// monitor, parsed from the --collector.perf.cpus flag value.
//
// The flag is a comma separated list of entries. Each entry is either a
// single CPU id ("4") or an inclusive range ("2-6") optionally followed by
// a stride (":N"); e.g. "10-20:5" yields 10, 15, 20. Any malformed entry
// (non-numeric value, bad range, or a stride < 1) returns an error.
func perfCPUFlagToCPUs(cpuFlag string) ([]int, error) {
	var err error
	cpus := []int{}
	for _, subset := range strings.Split(cpuFlag, ",") {
		// First parse a single CPU.
		if !strings.Contains(subset, "-") {
			cpu, err := strconv.Atoi(subset)
			if err != nil {
				return nil, err
			}
			cpus = append(cpus, cpu)
			continue
		}
		stride := 1
		// Handle strides, e.g. 10-20:5 should yield 10,15,20.
		strideSet := strings.Split(subset, ":")
		if len(strideSet) > 2 {
			// More than one ':' was previously ignored silently;
			// reject it so typos are surfaced to the user.
			return nil, fmt.Errorf("invalid flag value %q", cpuFlag)
		}
		if len(strideSet) == 2 {
			stride, err = strconv.Atoi(strideSet[1])
			if err != nil {
				return nil, err
			}
			// A stride of zero (or a negative stride) would make the
			// expansion loop below never terminate.
			if stride < 1 {
				return nil, fmt.Errorf("invalid stride in flag value %q", cpuFlag)
			}
		}
		rangeSet := strings.Split(strideSet[0], "-")
		if len(rangeSet) != 2 {
			return nil, fmt.Errorf("invalid flag value %q", cpuFlag)
		}
		start, err := strconv.Atoi(rangeSet[0])
		if err != nil {
			return nil, err
		}
		end, err := strconv.Atoi(rangeSet[1])
		if err != nil {
			return nil, err
		}
		for i := start; i <= end; i += stride {
			cpus = append(cpus, i)
		}
	}
	return cpus, nil
}
// NewPerfCollector returns a new perf based collector, it creates a profiler
// per CPU.
func NewPerfCollector(logger log.Logger) (Collector, error) {
c := &perfCollector{
perfHwProfilers: map[int]perf.HardwareProfiler{},
perfSwProfilers: map[int]perf.SoftwareProfiler{},
perfCacheProfilers: map[int]perf.CacheProfiler{},
logger: logger,
collector := &perfCollector{
perfHwProfilers: map[int]*perf.HardwareProfiler{},
perfSwProfilers: map[int]*perf.SoftwareProfiler{},
perfCacheProfilers: map[int]*perf.CacheProfiler{},
hwProfilerCPUMap: map[*perf.HardwareProfiler]int{},
swProfilerCPUMap: map[*perf.SoftwareProfiler]int{},
cacheProfilerCPUMap: map[*perf.CacheProfiler]int{},
logger: logger,
}
ncpus := runtime.NumCPU()
for i := 0; i < ncpus; i++ {
// Use -1 to profile all processes on the CPU, see:
// man perf_event_open
c.perfHwProfilers[i] = perf.NewHardwareProfiler(-1, i)
if err := c.perfHwProfilers[i].Start(); err != nil {
return c, err
if perfCPUsFlag != nil && *perfCPUsFlag != "" {
cpus, err := perfCPUFlagToCPUs(*perfCPUsFlag)
if err != nil {
return nil, err
}
c.perfSwProfilers[i] = perf.NewSoftwareProfiler(-1, i)
if err := c.perfSwProfilers[i].Start(); err != nil {
return c, err
for _, cpu := range cpus {
// Use -1 to profile all processes on the CPU, see:
// man perf_event_open
hwProf := perf.NewHardwareProfiler(-1, cpu)
if err := hwProf.Start(); err != nil {
return nil, err
}
collector.perfHwProfilers[cpu] = &hwProf
swProf := perf.NewSoftwareProfiler(-1, cpu)
if err := swProf.Start(); err != nil {
return nil, err
}
collector.perfSwProfilers[cpu] = &swProf
cacheProf := perf.NewCacheProfiler(-1, cpu)
if err := cacheProf.Start(); err != nil {
return nil, err
}
collector.perfCacheProfilers[cpu] = &cacheProf
}
c.perfCacheProfilers[i] = perf.NewCacheProfiler(-1, i)
if err := c.perfCacheProfilers[i].Start(); err != nil {
return c, err
} else {
for i := 0; i < runtime.NumCPU(); i++ {
hwProf := perf.NewHardwareProfiler(-1, i)
if err := hwProf.Start(); err != nil {
return nil, err
}
collector.perfHwProfilers[i] = &hwProf
swProf := perf.NewSoftwareProfiler(-1, i)
if err := swProf.Start(); err != nil {
return nil, err
}
collector.perfSwProfilers[i] = &swProf
cacheProf := perf.NewCacheProfiler(-1, i)
if err := cacheProf.Start(); err != nil {
return nil, err
}
collector.perfCacheProfilers[i] = &cacheProf
}
}
c.desc = map[string]*prometheus.Desc{
collector.desc = map[string]*prometheus.Desc{
"cpucycles_total": prometheus.NewDesc(
prometheus.BuildFQName(
namespace,
@ -312,7 +402,7 @@ func NewPerfCollector(logger log.Logger) (Collector, error) {
),
}
return c, nil
return collector, nil
}
// Update implements the Collector interface and will collect metrics per CPU.
@ -333,9 +423,10 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error {
}
func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error {
for cpu, profiler := range c.perfHwProfilers {
cpuStr := strconv.Itoa(cpu)
hwProfile, err := profiler.Profile()
for _, profiler := range c.perfHwProfilers {
cpuid := c.hwProfilerCPUMap[profiler]
cpuStr := fmt.Sprintf("%d", cpuid)
hwProfile, err := (*profiler).Profile()
if err != nil {
return err
}
@ -404,9 +495,10 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error {
}
func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error {
for cpu, profiler := range c.perfSwProfilers {
cpuStr := strconv.Itoa(cpu)
swProfile, err := profiler.Profile()
for _, profiler := range c.perfSwProfilers {
cpuid := c.swProfilerCPUMap[profiler]
cpuStr := fmt.Sprintf("%d", cpuid)
swProfile, err := (*profiler).Profile()
if err != nil {
return err
}
@ -459,9 +551,10 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error {
}
func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error {
for cpu, profiler := range c.perfCacheProfilers {
cpuStr := strconv.Itoa(cpu)
cacheProfile, err := profiler.Profile()
for _, profiler := range c.perfCacheProfilers {
cpuid := c.cacheProfilerCPUMap[profiler]
cpuStr := fmt.Sprintf("%d", cpuid)
cacheProfile, err := (*profiler).Profile()
if err != nil {
return err
}

View file

@ -54,3 +54,76 @@ func TestPerfCollector(t *testing.T) {
t.Fatal(err)
}
}
// TestPerfCPUFlagToCPUs verifies that --collector.perf.cpus flag values are
// expanded into the expected list of CPU ids, covering single CPUs, ranges,
// and strided ranges, and that expected errors are surfaced.
func TestPerfCPUFlagToCPUs(t *testing.T) {
	tests := []struct {
		name   string
		flag   string
		exCpus []int
		errStr string // expected error text; empty means success expected
	}{
		{
			name:   "valid single cpu",
			flag:   "1",
			exCpus: []int{1},
		},
		{
			name:   "valid range cpus",
			flag:   "1-5",
			exCpus: []int{1, 2, 3, 4, 5},
		},
		{
			name:   "valid double digit",
			flag:   "10",
			exCpus: []int{10},
		},
		{
			name:   "valid double digit range",
			flag:   "10-12",
			exCpus: []int{10, 11, 12},
		},
		{
			name:   "valid double digit stride",
			flag:   "10-20:5",
			exCpus: []int{10, 15, 20},
		},
	}
	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			cpus, err := perfCPUFlagToCPUs(test.flag)
			if test.errStr != "" {
				// This case expects an error; the original check was
				// inverted (err != nil) and would nil-deref below.
				if err == nil {
					t.Fatal("expected error to not be nil")
				}
				if test.errStr != err.Error() {
					t.Fatalf(
						"expected error %q, got %q",
						test.errStr,
						err.Error(),
					)
				}
				return
			}
			if err != nil {
				t.Fatal(err)
			}
			if len(cpus) != len(test.exCpus) {
				t.Fatalf(
					"expected cpus %v, got %v",
					test.exCpus,
					cpus,
				)
			}
			for i := range cpus {
				if test.exCpus[i] != cpus[i] {
					t.Fatalf(
						"expected cpus %v, got %v",
						test.exCpus,
						cpus,
					)
				}
			}
		})
	}
}