Handle small backwards jumps in CPU idle

The Linux CPU idle stat can also jump backwards slightly in some cases.
Allow the jump back up to 3 seconds before we attempt to reset the CPU
counter cache.

Fixes: https://github.com/prometheus/node_exporter/issues/1903

Signed-off-by: Ben Kochie <superq@gmail.com>
This commit is contained in:
Ben Kochie 2021-07-04 10:47:04 +02:00
parent 13be860e25
commit 73c9a10d37
No known key found for this signature in database
GPG key ID: C646B23C9E3245F1
2 changed files with 122 additions and 7 deletions

View file

@ -46,10 +46,14 @@ type cpuCollector struct {
cpuBugsIncludeRegexp *regexp.Regexp
}
// Idle jump back limit in seconds.
const jumpBackSeconds = 3.0
var (
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
jumpBackDebugMessage = fmt.Sprintf("CPU Idle counter jumped backwards more than %f seconds, possible hotplug event, resetting CPU stats", jumpBackSeconds)
)
func init() {
@ -302,6 +306,7 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
// updateCPUStats updates the internal cache of CPU stats.
func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
// Acquire a lock to update the stats.
c.cpuStatsMutex.Lock()
defer c.cpuStatsMutex.Unlock()
@ -312,12 +317,17 @@ func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
}
for i, n := range newStats {
// If idle jumps backwards, assume we had a hotplug event and reset the stats for this CPU.
if n.Idle < c.cpuStats[i].Idle {
level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards, possible hotplug event, resetting CPU stats", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
// If idle jumps backwards by more than X seconds, assume we had a hotplug event and reset the stats for this CPU.
if (c.cpuStats[i].Idle - n.Idle) >= jumpBackSeconds {
level.Debug(c.logger).Log("msg", jumpBackDebugMessage, "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
c.cpuStats[i] = procfs.CPUStat{}
}
c.cpuStats[i].Idle = n.Idle
if n.Idle >= c.cpuStats[i].Idle {
c.cpuStats[i].Idle = n.Idle
} else {
level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
}
if n.User >= c.cpuStats[i].User {
c.cpuStats[i].User = n.User

105
collector/cpu_linux_test.go Normal file
View file

@ -0,0 +1,105 @@
// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !nocpu
package collector
import (
"reflect"
"testing"
"github.com/go-kit/log"
"github.com/prometheus/procfs"
)
func makeTestCPUCollector(s []procfs.CPUStat) *cpuCollector {
dup := make([]procfs.CPUStat, len(s))
copy(dup, s)
return &cpuCollector{
logger: log.NewNopLogger(),
cpuStats: dup,
}
}
func TestCPU(t *testing.T) {
firstCPUStat := []procfs.CPUStat{{
User: 100.0,
Nice: 100.0,
System: 100.0,
Idle: 100.0,
Iowait: 100.0,
IRQ: 100.0,
SoftIRQ: 100.0,
Steal: 100.0,
Guest: 100.0,
GuestNice: 100.0,
}}
c := makeTestCPUCollector(firstCPUStat)
want := []procfs.CPUStat{{
User: 101.0,
Nice: 101.0,
System: 101.0,
Idle: 101.0,
Iowait: 101.0,
IRQ: 101.0,
SoftIRQ: 101.0,
Steal: 101.0,
Guest: 101.0,
GuestNice: 101.0,
}}
c.updateCPUStats(want)
got := c.cpuStats
if !reflect.DeepEqual(want, got) {
t.Fatalf("should have %v CPU Stat: got %v", want, got)
}
c = makeTestCPUCollector(firstCPUStat)
jumpBack := []procfs.CPUStat{{
User: 99.9,
Nice: 99.9,
System: 99.9,
Idle: 99.9,
Iowait: 99.9,
IRQ: 99.9,
SoftIRQ: 99.9,
Steal: 99.9,
Guest: 99.9,
GuestNice: 99.9,
}}
c.updateCPUStats(jumpBack)
got = c.cpuStats
if reflect.DeepEqual(jumpBack, got) {
t.Fatalf("should have %v CPU Stat: got %v", firstCPUStat, got)
}
c = makeTestCPUCollector(firstCPUStat)
resetIdle := []procfs.CPUStat{{
User: 102.0,
Nice: 102.0,
System: 102.0,
Idle: 1.0,
Iowait: 102.0,
IRQ: 102.0,
SoftIRQ: 102.0,
Steal: 102.0,
Guest: 102.0,
GuestNice: 102.0,
}}
c.updateCPUStats(resetIdle)
got = c.cpuStats
if !reflect.DeepEqual(resetIdle, got) {
t.Fatalf("should have %v CPU Stat: got %v", resetIdle, got)
}
}