diff --git a/CHANGELOG.md b/CHANGELOG.md index f017b571..68022ac9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ ### Changes +* [ENHANCEMENT] Add `--collector.perf.cpus` to allow setting the CPU list for perf stats. * [CHANGE] Add `--collector.netdev.device-whitelist`. #1279 * [CHANGE] Refactor mdadm collector #1403 * [CHANGE] Add `mountaddr` label to NFS metrics. #1417 diff --git a/README.md b/README.md index 0a0f7c36..73a34494 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,18 @@ Depending on the configured value different metrics will be available, for most cases `0` will provide the most complete set. For more information see [`man 2 perf_event_open`](http://man7.org/linux/man-pages/man2/perf_event_open.2.html). +By default, the perf collector will only collect metrics of the CPUs that +`node_exporter` is running on (ie +[`runtime.NumCPU`](https://golang.org/pkg/runtime/#NumCPU)). If this is +insufficient (e.g. if you run `node_exporter` with its CPU affinity set to +specific CPUs), you can specify a list of alternate CPUs by using the +`--collector.perf.cpus` flag. For example, to collect metrics on CPUs 2-6, you +would specify: `--collector.perf --collector.perf.cpus=2-6`. The CPU +configuration is zero indexed and can also take a stride value, e.g. +`--collector.perf --collector.perf.cpus=1-10:5` would collect on CPUs +1 and 6. + + Name | Description | OS ---------|-------------|---- buddyinfo | Exposes statistics of memory fragments as reported by /proc/buddyinfo. 
| Linux diff --git a/collector/perf_linux.go b/collector/perf_linux.go index e8a52b43..b67f970d 100644 --- a/collector/perf_linux.go +++ b/collector/perf_linux.go @@ -14,18 +14,25 @@ package collector import ( + "fmt" "runtime" "strconv" + "strings" "github.com/go-kit/kit/log" "github.com/hodgesds/perf-utils" "github.com/prometheus/client_golang/prometheus" + kingpin "gopkg.in/alecthomas/kingpin.v2" ) const ( perfSubsystem = "perf" ) +var ( + perfCPUsFlag = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String() +) + func init() { registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector) } @@ -36,40 +43,123 @@ func init() { // settings not all profiler values may be exposed on the target system at any // given time. type perfCollector struct { - perfHwProfilers map[int]perf.HardwareProfiler - perfSwProfilers map[int]perf.SoftwareProfiler - perfCacheProfilers map[int]perf.CacheProfiler - desc map[string]*prometheus.Desc - logger log.Logger + hwProfilerCPUMap map[*perf.HardwareProfiler]int + swProfilerCPUMap map[*perf.SoftwareProfiler]int + cacheProfilerCPUMap map[*perf.CacheProfiler]int + perfHwProfilers map[int]*perf.HardwareProfiler + perfSwProfilers map[int]*perf.SoftwareProfiler + perfCacheProfilers map[int]*perf.CacheProfiler + desc map[string]*prometheus.Desc + logger log.Logger +} + +// perfCPUFlagToCPUs returns a set of CPUs for the perf collectors to monitor. +func perfCPUFlagToCPUs(cpuFlag string) ([]int, error) { + var err error + cpus := []int{} + for _, subset := range strings.Split(cpuFlag, ",") { + // First parse a single CPU. 
+ if !strings.Contains(subset, "-") { + cpu, err := strconv.Atoi(subset) + if err != nil { + return nil, err + } + cpus = append(cpus, cpu) + continue + } + + stride := 1 + // Handle strides, ie 1-10:5 should yield 1,5,10 + strideSet := strings.Split(subset, ":") + if len(strideSet) == 2 { + stride, err = strconv.Atoi(strideSet[1]) + if err != nil { + return nil, err + } + } + + rangeSet := strings.Split(strideSet[0], "-") + if len(rangeSet) != 2 { + return nil, fmt.Errorf("invalid flag value %q", cpuFlag) + } + start, err := strconv.Atoi(rangeSet[0]) + if err != nil { + return nil, err + } + end, err := strconv.Atoi(rangeSet[1]) + if err != nil { + return nil, err + } + for i := start; i <= end; i += stride { + cpus = append(cpus, i) + } + } + + return cpus, nil } // NewPerfCollector returns a new perf based collector, it creates a profiler // per CPU. func NewPerfCollector(logger log.Logger) (Collector, error) { - c := &perfCollector{ - perfHwProfilers: map[int]perf.HardwareProfiler{}, - perfSwProfilers: map[int]perf.SoftwareProfiler{}, - perfCacheProfilers: map[int]perf.CacheProfiler{}, - logger: logger, + collector := &perfCollector{ + perfHwProfilers: map[int]*perf.HardwareProfiler{}, + perfSwProfilers: map[int]*perf.SoftwareProfiler{}, + perfCacheProfilers: map[int]*perf.CacheProfiler{}, + hwProfilerCPUMap: map[*perf.HardwareProfiler]int{}, + swProfilerCPUMap: map[*perf.SoftwareProfiler]int{}, + cacheProfilerCPUMap: map[*perf.CacheProfiler]int{}, + logger: logger, } - ncpus := runtime.NumCPU() - for i := 0; i < ncpus; i++ { - // Use -1 to profile all processes on the CPU, see: - // man perf_event_open - c.perfHwProfilers[i] = perf.NewHardwareProfiler(-1, i) - if err := c.perfHwProfilers[i].Start(); err != nil { - return c, err + + if perfCPUsFlag != nil && *perfCPUsFlag != "" { + cpus, err := perfCPUFlagToCPUs(*perfCPUsFlag) + if err != nil { + return nil, err } - c.perfSwProfilers[i] = perf.NewSoftwareProfiler(-1, i) - if err := c.perfSwProfilers[i].Start(); 
err != nil { - return c, err + for _, cpu := range cpus { + // Use -1 to profile all processes on the CPU, see: + // man perf_event_open + hwProf := perf.NewHardwareProfiler(-1, cpu) + if err := hwProf.Start(); err != nil { + return nil, err + } + collector.perfHwProfilers[cpu] = &hwProf + collector.hwProfilerCPUMap[&hwProf] = cpu + + swProf := perf.NewSoftwareProfiler(-1, cpu) + if err := swProf.Start(); err != nil { + return nil, err + } + collector.perfSwProfilers[cpu] = &swProf + collector.swProfilerCPUMap[&swProf] = cpu + + cacheProf := perf.NewCacheProfiler(-1, cpu) + if err := cacheProf.Start(); err != nil { + return nil, err + } + collector.perfCacheProfilers[cpu] = &cacheProf + collector.cacheProfilerCPUMap[&cacheProf] = cpu } - c.perfCacheProfilers[i] = perf.NewCacheProfiler(-1, i) - if err := c.perfCacheProfilers[i].Start(); err != nil { - return c, err + } else { + for i := 0; i < runtime.NumCPU(); i++ { + hwProf := perf.NewHardwareProfiler(-1, i) + if err := hwProf.Start(); err != nil { + return nil, err + } + collector.perfHwProfilers[i] = &hwProf + collector.hwProfilerCPUMap[&hwProf] = i + + swProf := perf.NewSoftwareProfiler(-1, i) + if err := swProf.Start(); err != nil { + return nil, err + } + collector.perfSwProfilers[i] = &swProf + collector.swProfilerCPUMap[&swProf] = i + + cacheProf := perf.NewCacheProfiler(-1, i) + if err := cacheProf.Start(); err != nil { + return nil, err + } + collector.perfCacheProfilers[i] = &cacheProf + collector.cacheProfilerCPUMap[&cacheProf] = i } } - c.desc = map[string]*prometheus.Desc{ + + collector.desc = map[string]*prometheus.Desc{ "cpucycles_total": prometheus.NewDesc( prometheus.BuildFQName( namespace, @@ -312,7 +402,7 @@ func NewPerfCollector(logger log.Logger) (Collector, error) { ), } - return c, nil + return collector, nil } // Update implements the Collector interface and will collect metrics per CPU. 
@@ -333,9 +423,10 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error { } func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { - for cpu, profiler := range c.perfHwProfilers { - cpuStr := strconv.Itoa(cpu) - hwProfile, err := profiler.Profile() + for _, profiler := range c.perfHwProfilers { + cpuid := c.hwProfilerCPUMap[profiler] + cpuStr := fmt.Sprintf("%d", cpuid) + hwProfile, err := (*profiler).Profile() if err != nil { return err } @@ -404,9 +495,10 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error { } func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { - for cpu, profiler := range c.perfSwProfilers { - cpuStr := strconv.Itoa(cpu) - swProfile, err := profiler.Profile() + for _, profiler := range c.perfSwProfilers { + cpuid := c.swProfilerCPUMap[profiler] + cpuStr := fmt.Sprintf("%d", cpuid) + swProfile, err := (*profiler).Profile() if err != nil { return err } @@ -459,9 +551,10 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error { } func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error { - for cpu, profiler := range c.perfCacheProfilers { - cpuStr := strconv.Itoa(cpu) - cacheProfile, err := profiler.Profile() + for _, profiler := range c.perfCacheProfilers { + cpuid := c.cacheProfilerCPUMap[profiler] + cpuStr := fmt.Sprintf("%d", cpuid) + cacheProfile, err := (*profiler).Profile() if err != nil { return err } diff --git a/collector/perf_linux_test.go b/collector/perf_linux_test.go index 68c580b2..eecfab9c 100644 --- a/collector/perf_linux_test.go +++ b/collector/perf_linux_test.go @@ -54,3 +54,76 @@ func TestPerfCollector(t *testing.T) { t.Fatal(err) } } + +func TestPerfCPUFlagToCPUs(t *testing.T) { + tests := []struct { + name string + flag string + exCpus []int + errStr string + }{ + { + name: "valid single cpu", + flag: "1", + exCpus: []int{1}, + }, + { + name: "valid range cpus", + flag: "1-5", + 
exCpus: []int{1, 2, 3, 4, 5}, + }, + { + name: "valid double digit", + flag: "10", + exCpus: []int{10}, + }, + { + name: "valid double digit range", + flag: "10-12", + exCpus: []int{10, 11, 12}, + }, + { + name: "valid double digit stride", + flag: "10-20:5", + exCpus: []int{10, 15, 20}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + cpus, err := perfCPUFlagToCPUs(test.flag) + if test.errStr != "" { + if err == nil { + t.Fatal("expected error to not be nil") + } + if test.errStr != err.Error() { + t.Fatalf( + "expected error %q, got %q", + test.errStr, + err.Error(), + ) + } + return + } + if err != nil { + t.Fatal(err) + } + if len(cpus) != len(test.exCpus) { + t.Fatalf( + "expected cpus %v, got %v", + test.exCpus, + cpus, + ) + } + for i := range cpus { + if test.exCpus[i] != cpus[i] { + t.Fatalf( + "expected cpus %v, got %v", + test.exCpus, + cpus, + ) + } + } + }) + } +}