mirror of
https://github.com/prometheus/node_exporter.git
synced 2024-12-27 14:39:53 -08:00
Handle small backwards jumps in CPU idle
The Linux CPU idle stat can also jump backwards slightly in some cases. Tolerate a backwards jump of up to 3 seconds before we attempt to reset the CPU counter cache. Fixes: https://github.com/prometheus/node_exporter/issues/1903 Signed-off-by: Ben Kochie <superq@gmail.com>
This commit is contained in:
parent
13be860e25
commit
73c9a10d37
|
@ -46,10 +46,14 @@ type cpuCollector struct {
|
|||
cpuBugsIncludeRegexp *regexp.Regexp
|
||||
}
|
||||
|
||||
// Idle jump back limit in seconds.
|
||||
const jumpBackSeconds = 3.0
|
||||
|
||||
var (
|
||||
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
|
||||
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
|
||||
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
|
||||
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
|
||||
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
|
||||
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
|
||||
jumpBackDebugMessage = fmt.Sprintf("CPU Idle counter jumped backwards more than %f seconds, possible hotplug event, resetting CPU stats", jumpBackSeconds)
|
||||
)
|
||||
|
||||
func init() {
|
||||
|
@ -302,6 +306,7 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
|
|||
|
||||
// updateCPUStats updates the internal cache of CPU stats.
|
||||
func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
|
||||
|
||||
// Acquire a lock to update the stats.
|
||||
c.cpuStatsMutex.Lock()
|
||||
defer c.cpuStatsMutex.Unlock()
|
||||
|
@ -312,12 +317,17 @@ func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
|
|||
}
|
||||
|
||||
for i, n := range newStats {
|
||||
// If idle jumps backwards, assume we had a hotplug event and reset the stats for this CPU.
|
||||
if n.Idle < c.cpuStats[i].Idle {
|
||||
level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards, possible hotplug event, resetting CPU stats", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
|
||||
// If idle jumps backwards by more than X seconds, assume we had a hotplug event and reset the stats for this CPU.
|
||||
if (c.cpuStats[i].Idle - n.Idle) >= jumpBackSeconds {
|
||||
level.Debug(c.logger).Log("msg", jumpBackDebugMessage, "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
|
||||
c.cpuStats[i] = procfs.CPUStat{}
|
||||
}
|
||||
c.cpuStats[i].Idle = n.Idle
|
||||
|
||||
if n.Idle >= c.cpuStats[i].Idle {
|
||||
c.cpuStats[i].Idle = n.Idle
|
||||
} else {
|
||||
level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
|
||||
}
|
||||
|
||||
if n.User >= c.cpuStats[i].User {
|
||||
c.cpuStats[i].User = n.User
|
||||
|
|
105
collector/cpu_linux_test.go
Normal file
105
collector/cpu_linux_test.go
Normal file
|
@ -0,0 +1,105 @@
|
|||
// Copyright 2021 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// +build !nocpu
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/go-kit/log"
|
||||
"github.com/prometheus/procfs"
|
||||
)
|
||||
|
||||
func makeTestCPUCollector(s []procfs.CPUStat) *cpuCollector {
|
||||
dup := make([]procfs.CPUStat, len(s))
|
||||
copy(dup, s)
|
||||
return &cpuCollector{
|
||||
logger: log.NewNopLogger(),
|
||||
cpuStats: dup,
|
||||
}
|
||||
}
|
||||
|
||||
func TestCPU(t *testing.T) {
|
||||
firstCPUStat := []procfs.CPUStat{{
|
||||
User: 100.0,
|
||||
Nice: 100.0,
|
||||
System: 100.0,
|
||||
Idle: 100.0,
|
||||
Iowait: 100.0,
|
||||
IRQ: 100.0,
|
||||
SoftIRQ: 100.0,
|
||||
Steal: 100.0,
|
||||
Guest: 100.0,
|
||||
GuestNice: 100.0,
|
||||
}}
|
||||
|
||||
c := makeTestCPUCollector(firstCPUStat)
|
||||
want := []procfs.CPUStat{{
|
||||
User: 101.0,
|
||||
Nice: 101.0,
|
||||
System: 101.0,
|
||||
Idle: 101.0,
|
||||
Iowait: 101.0,
|
||||
IRQ: 101.0,
|
||||
SoftIRQ: 101.0,
|
||||
Steal: 101.0,
|
||||
Guest: 101.0,
|
||||
GuestNice: 101.0,
|
||||
}}
|
||||
c.updateCPUStats(want)
|
||||
got := c.cpuStats
|
||||
if !reflect.DeepEqual(want, got) {
|
||||
t.Fatalf("should have %v CPU Stat: got %v", want, got)
|
||||
}
|
||||
|
||||
c = makeTestCPUCollector(firstCPUStat)
|
||||
jumpBack := []procfs.CPUStat{{
|
||||
User: 99.9,
|
||||
Nice: 99.9,
|
||||
System: 99.9,
|
||||
Idle: 99.9,
|
||||
Iowait: 99.9,
|
||||
IRQ: 99.9,
|
||||
SoftIRQ: 99.9,
|
||||
Steal: 99.9,
|
||||
Guest: 99.9,
|
||||
GuestNice: 99.9,
|
||||
}}
|
||||
c.updateCPUStats(jumpBack)
|
||||
got = c.cpuStats
|
||||
if reflect.DeepEqual(jumpBack, got) {
|
||||
t.Fatalf("should have %v CPU Stat: got %v", firstCPUStat, got)
|
||||
}
|
||||
|
||||
c = makeTestCPUCollector(firstCPUStat)
|
||||
resetIdle := []procfs.CPUStat{{
|
||||
User: 102.0,
|
||||
Nice: 102.0,
|
||||
System: 102.0,
|
||||
Idle: 1.0,
|
||||
Iowait: 102.0,
|
||||
IRQ: 102.0,
|
||||
SoftIRQ: 102.0,
|
||||
Steal: 102.0,
|
||||
Guest: 102.0,
|
||||
GuestNice: 102.0,
|
||||
}}
|
||||
c.updateCPUStats(resetIdle)
|
||||
got = c.cpuStats
|
||||
if !reflect.DeepEqual(resetIdle, got) {
|
||||
t.Fatalf("should have %v CPU Stat: got %v", resetIdle, got)
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue