mirror of
				https://github.com/prometheus/node_exporter.git
				synced 2025-08-20 18:33:52 -07:00 
			
		
		
		
	Handle small backwards jumps in CPU idle
The Linux CPU idle stat can also jump backwards slightly in some cases. Allow the jump back up to 3 seconds before we attempt to reset the CPU counter cache. Fixes: https://github.com/prometheus/node_exporter/issues/1903 Signed-off-by: Ben Kochie <superq@gmail.com>
This commit is contained in:
		
							parent
							
								
									13be860e25
								
							
						
					
					
						commit
						73c9a10d37
					
				| 
						 | 
				
			
			@ -46,10 +46,14 @@ type cpuCollector struct {
 | 
			
		|||
	cpuBugsIncludeRegexp  *regexp.Regexp
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Idle jump back limit in seconds.
 | 
			
		||||
const jumpBackSeconds = 3.0
 | 
			
		||||
 | 
			
		||||
var (
 | 
			
		||||
	enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
 | 
			
		||||
	flagsInclude  = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
 | 
			
		||||
	bugsInclude   = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
 | 
			
		||||
	enableCPUInfo        = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
 | 
			
		||||
	flagsInclude         = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
 | 
			
		||||
	bugsInclude          = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
 | 
			
		||||
	jumpBackDebugMessage = fmt.Sprintf("CPU Idle counter jumped backwards more than %f seconds, possible hotplug event, resetting CPU stats", jumpBackSeconds)
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
func init() {
 | 
			
		||||
| 
						 | 
				
			
			@ -302,6 +306,7 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
 | 
			
		|||
 | 
			
		||||
// updateCPUStats updates the internal cache of CPU stats.
 | 
			
		||||
func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
 | 
			
		||||
 | 
			
		||||
	// Acquire a lock to update the stats.
 | 
			
		||||
	c.cpuStatsMutex.Lock()
 | 
			
		||||
	defer c.cpuStatsMutex.Unlock()
 | 
			
		||||
| 
						 | 
				
			
			@ -312,12 +317,17 @@ func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
 | 
			
		|||
	}
 | 
			
		||||
 | 
			
		||||
	for i, n := range newStats {
 | 
			
		||||
		// If idle jumps backwards, assume we had a hotplug event and reset the stats for this CPU.
 | 
			
		||||
		if n.Idle < c.cpuStats[i].Idle {
 | 
			
		||||
			level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards, possible hotplug event, resetting CPU stats", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
 | 
			
		||||
		// If idle jumps backwards by more than X seconds, assume we had a hotplug event and reset the stats for this CPU.
 | 
			
		||||
		if (c.cpuStats[i].Idle - n.Idle) >= jumpBackSeconds {
 | 
			
		||||
			level.Debug(c.logger).Log("msg", jumpBackDebugMessage, "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
 | 
			
		||||
			c.cpuStats[i] = procfs.CPUStat{}
 | 
			
		||||
		}
 | 
			
		||||
		c.cpuStats[i].Idle = n.Idle
 | 
			
		||||
 | 
			
		||||
		if n.Idle >= c.cpuStats[i].Idle {
 | 
			
		||||
			c.cpuStats[i].Idle = n.Idle
 | 
			
		||||
		} else {
 | 
			
		||||
			level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		if n.User >= c.cpuStats[i].User {
 | 
			
		||||
			c.cpuStats[i].User = n.User
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										105
									
								
								collector/cpu_linux_test.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										105
									
								
								collector/cpu_linux_test.go
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,105 @@
 | 
			
		|||
// Copyright 2021 The Prometheus Authors
 | 
			
		||||
// Licensed under the Apache License, Version 2.0 (the "License");
 | 
			
		||||
// you may not use this file except in compliance with the License.
 | 
			
		||||
// You may obtain a copy of the License at
 | 
			
		||||
//
 | 
			
		||||
// http://www.apache.org/licenses/LICENSE-2.0
 | 
			
		||||
//
 | 
			
		||||
// Unless required by applicable law or agreed to in writing, software
 | 
			
		||||
// distributed under the License is distributed on an "AS IS" BASIS,
 | 
			
		||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
			
		||||
// See the License for the specific language governing permissions and
 | 
			
		||||
// limitations under the License.
 | 
			
		||||
 | 
			
		||||
// +build !nocpu
 | 
			
		||||
 | 
			
		||||
package collector
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"reflect"
 | 
			
		||||
	"testing"
 | 
			
		||||
 | 
			
		||||
	"github.com/go-kit/log"
 | 
			
		||||
	"github.com/prometheus/procfs"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
func makeTestCPUCollector(s []procfs.CPUStat) *cpuCollector {
 | 
			
		||||
	dup := make([]procfs.CPUStat, len(s))
 | 
			
		||||
	copy(dup, s)
 | 
			
		||||
	return &cpuCollector{
 | 
			
		||||
		logger:   log.NewNopLogger(),
 | 
			
		||||
		cpuStats: dup,
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func TestCPU(t *testing.T) {
 | 
			
		||||
	firstCPUStat := []procfs.CPUStat{{
 | 
			
		||||
		User:      100.0,
 | 
			
		||||
		Nice:      100.0,
 | 
			
		||||
		System:    100.0,
 | 
			
		||||
		Idle:      100.0,
 | 
			
		||||
		Iowait:    100.0,
 | 
			
		||||
		IRQ:       100.0,
 | 
			
		||||
		SoftIRQ:   100.0,
 | 
			
		||||
		Steal:     100.0,
 | 
			
		||||
		Guest:     100.0,
 | 
			
		||||
		GuestNice: 100.0,
 | 
			
		||||
	}}
 | 
			
		||||
 | 
			
		||||
	c := makeTestCPUCollector(firstCPUStat)
 | 
			
		||||
	want := []procfs.CPUStat{{
 | 
			
		||||
		User:      101.0,
 | 
			
		||||
		Nice:      101.0,
 | 
			
		||||
		System:    101.0,
 | 
			
		||||
		Idle:      101.0,
 | 
			
		||||
		Iowait:    101.0,
 | 
			
		||||
		IRQ:       101.0,
 | 
			
		||||
		SoftIRQ:   101.0,
 | 
			
		||||
		Steal:     101.0,
 | 
			
		||||
		Guest:     101.0,
 | 
			
		||||
		GuestNice: 101.0,
 | 
			
		||||
	}}
 | 
			
		||||
	c.updateCPUStats(want)
 | 
			
		||||
	got := c.cpuStats
 | 
			
		||||
	if !reflect.DeepEqual(want, got) {
 | 
			
		||||
		t.Fatalf("should have %v CPU Stat: got %v", want, got)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	c = makeTestCPUCollector(firstCPUStat)
 | 
			
		||||
	jumpBack := []procfs.CPUStat{{
 | 
			
		||||
		User:      99.9,
 | 
			
		||||
		Nice:      99.9,
 | 
			
		||||
		System:    99.9,
 | 
			
		||||
		Idle:      99.9,
 | 
			
		||||
		Iowait:    99.9,
 | 
			
		||||
		IRQ:       99.9,
 | 
			
		||||
		SoftIRQ:   99.9,
 | 
			
		||||
		Steal:     99.9,
 | 
			
		||||
		Guest:     99.9,
 | 
			
		||||
		GuestNice: 99.9,
 | 
			
		||||
	}}
 | 
			
		||||
	c.updateCPUStats(jumpBack)
 | 
			
		||||
	got = c.cpuStats
 | 
			
		||||
	if reflect.DeepEqual(jumpBack, got) {
 | 
			
		||||
		t.Fatalf("should have %v CPU Stat: got %v", firstCPUStat, got)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	c = makeTestCPUCollector(firstCPUStat)
 | 
			
		||||
	resetIdle := []procfs.CPUStat{{
 | 
			
		||||
		User:      102.0,
 | 
			
		||||
		Nice:      102.0,
 | 
			
		||||
		System:    102.0,
 | 
			
		||||
		Idle:      1.0,
 | 
			
		||||
		Iowait:    102.0,
 | 
			
		||||
		IRQ:       102.0,
 | 
			
		||||
		SoftIRQ:   102.0,
 | 
			
		||||
		Steal:     102.0,
 | 
			
		||||
		Guest:     102.0,
 | 
			
		||||
		GuestNice: 102.0,
 | 
			
		||||
	}}
 | 
			
		||||
	c.updateCPUStats(resetIdle)
 | 
			
		||||
	got = c.cpuStats
 | 
			
		||||
	if !reflect.DeepEqual(resetIdle, got) {
 | 
			
		||||
		t.Fatalf("should have %v CPU Stat: got %v", resetIdle, got)
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
		Loading…
	
		Reference in a new issue