Skip to content

Commit 2510378

Browse files
authored
Merge pull request #2067 from prometheus/superq/idle_jump
Handle small backwards jumps in CPU idle
2 parents 35a2de2 + 73c9a10 commit 2510378

File tree

2 files changed

+122
-7
lines changed

2 files changed

+122
-7
lines changed

collector/cpu_linux.go

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,14 @@ type cpuCollector struct {
4646
cpuBugsIncludeRegexp *regexp.Regexp
4747
}
4848

49+
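// jumpBackSeconds is the backwards jump of the CPU idle counter, in seconds, at or above which the cached stats for that CPU are reset.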
50+
const jumpBackSeconds = 3.0
51+
4952
var (
50-
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
51-
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
52-
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
53+
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
54+
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
55+
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
56+
jumpBackDebugMessage = fmt.Sprintf("CPU Idle counter jumped backwards more than %f seconds, possible hotplug event, resetting CPU stats", jumpBackSeconds)
5357
)
5458

5559
func init() {
@@ -302,6 +306,7 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {
302306

303307
// updateCPUStats updates the internal cache of CPU stats.
304308
func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
309+
305310
// Acquire a lock to update the stats.
306311
c.cpuStatsMutex.Lock()
307312
defer c.cpuStatsMutex.Unlock()
@@ -312,12 +317,17 @@ func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
312317
}
313318

314319
for i, n := range newStats {
315-
// If idle jumps backwards, assume we had a hotplug event and reset the stats for this CPU.
316-
if n.Idle < c.cpuStats[i].Idle {
317-
level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards, possible hotplug event, resetting CPU stats", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
320+
// If idle jumps backwards by jumpBackSeconds or more, assume we had a hotplug event and reset the stats for this CPU.
321+
if (c.cpuStats[i].Idle - n.Idle) >= jumpBackSeconds {
322+
level.Debug(c.logger).Log("msg", jumpBackDebugMessage, "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
318323
c.cpuStats[i] = procfs.CPUStat{}
319324
}
320-
c.cpuStats[i].Idle = n.Idle
325+
326+
if n.Idle >= c.cpuStats[i].Idle {
327+
c.cpuStats[i].Idle = n.Idle
328+
} else {
329+
level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
330+
}
321331

322332
if n.User >= c.cpuStats[i].User {
323333
c.cpuStats[i].User = n.User

collector/cpu_linux_test.go

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
// Copyright 2021 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
// +build !nocpu
15+
16+
package collector
17+
18+
import (
19+
"reflect"
20+
"testing"
21+
22+
"github.com/go-kit/log"
23+
"github.com/prometheus/procfs"
24+
)
25+
26+
// makeTestCPUCollector returns a cpuCollector for tests with a no-op logger
// and cpuStats set to a copy of s. The copy keeps the collector's cached
// stats in a separate slice, so the caller can reuse s for further collectors.
func makeTestCPUCollector(s []procfs.CPUStat) *cpuCollector {
27+
dup := make([]procfs.CPUStat, len(s))
28+
copy(dup, s)
29+
return &cpuCollector{
30+
logger: log.NewNopLogger(),
31+
cpuStats: dup,
32+
}
33+
}
34+
35+
// TestCPU exercises updateCPUStats against a collector seeded with every
// counter at 100.0, using three inputs: all counters increasing, all counters
// slightly decreasing, and the idle counter dropping sharply.
func TestCPU(t *testing.T) {
36+
firstCPUStat := []procfs.CPUStat{{
37+
User: 100.0,
38+
Nice: 100.0,
39+
System: 100.0,
40+
Idle: 100.0,
41+
Iowait: 100.0,
42+
IRQ: 100.0,
43+
SoftIRQ: 100.0,
44+
Steal: 100.0,
45+
Guest: 100.0,
46+
GuestNice: 100.0,
47+
}}
48+
49+
// Case 1: every counter advances from 100 to 101; the cached stats are
// expected to equal the new values.
c := makeTestCPUCollector(firstCPUStat)
50+
want := []procfs.CPUStat{{
51+
User: 101.0,
52+
Nice: 101.0,
53+
System: 101.0,
54+
Idle: 101.0,
55+
Iowait: 101.0,
56+
IRQ: 101.0,
57+
SoftIRQ: 101.0,
58+
Steal: 101.0,
59+
Guest: 101.0,
60+
GuestNice: 101.0,
61+
}}
62+
c.updateCPUStats(want)
63+
got := c.cpuStats
64+
if !reflect.DeepEqual(want, got) {
65+
t.Fatalf("should have %v CPU Stat: got %v", want, got)
66+
}
67+
68+
// Case 2: every counter goes backwards by 0.1, which is below
// jumpBackSeconds; the cached stats must not take on the lower values.
// NOTE(review): this only asserts that got differs from jumpBack; it does
// not assert equality with firstCPUStat, although the failure message
// names firstCPUStat as the expected value.
c = makeTestCPUCollector(firstCPUStat)
69+
jumpBack := []procfs.CPUStat{{
70+
User: 99.9,
71+
Nice: 99.9,
72+
System: 99.9,
73+
Idle: 99.9,
74+
Iowait: 99.9,
75+
IRQ: 99.9,
76+
SoftIRQ: 99.9,
77+
Steal: 99.9,
78+
Guest: 99.9,
79+
GuestNice: 99.9,
80+
}}
81+
c.updateCPUStats(jumpBack)
82+
got = c.cpuStats
83+
if reflect.DeepEqual(jumpBack, got) {
84+
t.Fatalf("should have %v CPU Stat: got %v", firstCPUStat, got)
85+
}
86+
87+
// Case 3: Idle drops from 100 to 1 while the other counters rise to 102.
// The drop is at least jumpBackSeconds, so the test expects the cached
// stats to end up equal to the new values, including the lower Idle.
c = makeTestCPUCollector(firstCPUStat)
88+
resetIdle := []procfs.CPUStat{{
89+
User: 102.0,
90+
Nice: 102.0,
91+
System: 102.0,
92+
Idle: 1.0,
93+
Iowait: 102.0,
94+
IRQ: 102.0,
95+
SoftIRQ: 102.0,
96+
Steal: 102.0,
97+
Guest: 102.0,
98+
GuestNice: 102.0,
99+
}}
100+
c.updateCPUStats(resetIdle)
101+
got = c.cpuStats
102+
if !reflect.DeepEqual(resetIdle, got) {
103+
t.Fatalf("should have %v CPU Stat: got %v", resetIdle, got)
104+
}
105+
}

0 commit comments

Comments
 (0)