Skip to content

Commit 79ef3f2

Browse files
committed
Add new collector and metrics for watchdog (#2309)
Signed-off-by: Gavin Lam <[email protected]>
1 parent 4ec07ee commit 79ef3f2

File tree

9 files changed

+354
-6
lines changed

9 files changed

+354
-6
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,7 @@ softirqs | Exposes detailed softirq statistics from `/proc/softirqs`. | Linux
204204
sysctl | Expose sysctl values from `/proc/sys`. Use `--collector.sysctl.include(-info)` to configure. | Linux
205205
systemd | Exposes service and system status from [systemd](http://www.freedesktop.org/wiki/Software/systemd/). | Linux
206206
tcpstat | Exposes TCP connection status information from `/proc/net/tcp` and `/proc/net/tcp6`. (Warning: the current version has potential performance issues in high load situations.) | Linux
207+
watchdog | Exposes statistics from `/sys/class/watchdog` | Linux
207208
wifi | Exposes WiFi device and station statistics. | Linux
208209
xfrm | Exposes statistics from `/proc/net/xfrm_stat` | Linux
209210
zoneinfo | Exposes NUMA memory zone metrics. | Linux

collector/fixtures/e2e-64k-page-output.txt

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2946,6 +2946,7 @@ node_scrape_collector_success{collector="thermal_zone"} 1
29462946
node_scrape_collector_success{collector="time"} 1
29472947
node_scrape_collector_success{collector="udp_queues"} 1
29482948
node_scrape_collector_success{collector="vmstat"} 1
2949+
node_scrape_collector_success{collector="watchdog"} 1
29492950
node_scrape_collector_success{collector="wifi"} 1
29502951
node_scrape_collector_success{collector="xfrm"} 1
29512952
node_scrape_collector_success{collector="xfs"} 1
@@ -3222,6 +3223,31 @@ node_vmstat_pswpin 1476
32223223
# HELP node_vmstat_pswpout /proc/vmstat information field pswpout.
32233224
# TYPE node_vmstat_pswpout untyped
32243225
node_vmstat_pswpout 35045
3226+
# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
3227+
# TYPE node_watchdog_access_cs0 gauge
3228+
node_watchdog_access_cs0{name="watchdog0"} 0
3229+
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
3230+
# TYPE node_watchdog_bootstatus gauge
3231+
node_watchdog_bootstatus{name="watchdog0"} 1
3232+
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
3233+
# TYPE node_watchdog_fw_version gauge
3234+
node_watchdog_fw_version{name="watchdog0"} 2
3235+
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
3236+
# TYPE node_watchdog_info gauge
3237+
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
3238+
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
3239+
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
3240+
# TYPE node_watchdog_nowayout gauge
3241+
node_watchdog_nowayout{name="watchdog0"} 0
3242+
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
3243+
# TYPE node_watchdog_pretimeout_seconds gauge
3244+
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
3245+
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
3246+
# TYPE node_watchdog_timeleft_seconds gauge
3247+
node_watchdog_timeleft_seconds{name="watchdog0"} 300
3248+
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
3249+
# TYPE node_watchdog_timeout_seconds gauge
3250+
node_watchdog_timeout_seconds{name="watchdog0"} 60
32253251
# HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz.
32263252
# TYPE node_wifi_interface_frequency_hertz gauge
32273253
node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09

collector/fixtures/e2e-output.txt

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2968,6 +2968,7 @@ node_scrape_collector_success{collector="thermal_zone"} 1
29682968
node_scrape_collector_success{collector="time"} 1
29692969
node_scrape_collector_success{collector="udp_queues"} 1
29702970
node_scrape_collector_success{collector="vmstat"} 1
2971+
node_scrape_collector_success{collector="watchdog"} 1
29712972
node_scrape_collector_success{collector="wifi"} 1
29722973
node_scrape_collector_success{collector="xfrm"} 1
29732974
node_scrape_collector_success{collector="xfs"} 1
@@ -3244,6 +3245,31 @@ node_vmstat_pswpin 1476
32443245
# HELP node_vmstat_pswpout /proc/vmstat information field pswpout.
32453246
# TYPE node_vmstat_pswpout untyped
32463247
node_vmstat_pswpout 35045
3248+
# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
3249+
# TYPE node_watchdog_access_cs0 gauge
3250+
node_watchdog_access_cs0{name="watchdog0"} 0
3251+
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
3252+
# TYPE node_watchdog_bootstatus gauge
3253+
node_watchdog_bootstatus{name="watchdog0"} 1
3254+
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
3255+
# TYPE node_watchdog_fw_version gauge
3256+
node_watchdog_fw_version{name="watchdog0"} 2
3257+
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
3258+
# TYPE node_watchdog_info gauge
3259+
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
3260+
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
3261+
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
3262+
# TYPE node_watchdog_nowayout gauge
3263+
node_watchdog_nowayout{name="watchdog0"} 0
3264+
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
3265+
# TYPE node_watchdog_pretimeout_seconds gauge
3266+
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
3267+
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
3268+
# TYPE node_watchdog_timeleft_seconds gauge
3269+
node_watchdog_timeleft_seconds{name="watchdog0"} 300
3270+
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
3271+
# TYPE node_watchdog_timeout_seconds gauge
3272+
node_watchdog_timeout_seconds{name="watchdog0"} 60
32473273
# HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz.
32483274
# TYPE node_wifi_interface_frequency_hertz gauge
32493275
node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09

collector/fixtures/sys.ttar

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1717,6 +1717,75 @@ SymlinkTo: ../../devices/virtual/thermal/cooling_device0
17171717
Path: sys/class/thermal/thermal_zone0
17181718
SymlinkTo: ../../devices/virtual/thermal/thermal_zone0
17191719
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1720+
Directory: sys/class/watchdog
1721+
Mode: 775
1722+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1723+
Directory: sys/class/watchdog/watchdog0
1724+
Mode: 775
1725+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1726+
Path: sys/class/watchdog/watchdog0/access_cs0
1727+
Lines: 1
1728+
0EOF
1729+
Mode: 644
1730+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1731+
Path: sys/class/watchdog/watchdog0/bootstatus
1732+
Lines: 1
1733+
1EOF
1734+
Mode: 444
1735+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1736+
Path: sys/class/watchdog/watchdog0/fw_version
1737+
Lines: 1
1738+
2EOF
1739+
Mode: 444
1740+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1741+
Path: sys/class/watchdog/watchdog0/identity
1742+
Lines: 1
1743+
Software WatchdogEOF
1744+
Mode: 444
1745+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1746+
Path: sys/class/watchdog/watchdog0/nowayout
1747+
Lines: 1
1748+
0EOF
1749+
Mode: 644
1750+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1751+
Path: sys/class/watchdog/watchdog0/options
1752+
Lines: 1
1753+
0x8380EOF
1754+
Mode: 444
1755+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1756+
Path: sys/class/watchdog/watchdog0/pretimeout
1757+
Lines: 1
1758+
120EOF
1759+
Mode: 444
1760+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1761+
Path: sys/class/watchdog/watchdog0/pretimeout_governor
1762+
Lines: 1
1763+
noopEOF
1764+
Mode: 644
1765+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1766+
Path: sys/class/watchdog/watchdog0/state
1767+
Lines: 1
1768+
activeEOF
1769+
Mode: 444
1770+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1771+
Path: sys/class/watchdog/watchdog0/status
1772+
Lines: 1
1773+
0x8000EOF
1774+
Mode: 444
1775+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1776+
Path: sys/class/watchdog/watchdog0/timeleft
1777+
Lines: 1
1778+
300EOF
1779+
Mode: 444
1780+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1781+
Path: sys/class/watchdog/watchdog0/timeout
1782+
Lines: 1
1783+
60EOF
1784+
Mode: 444
1785+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1786+
Directory: sys/class/watchdog/watchdog1
1787+
Mode: 775
1788+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
17201789
Directory: sys/devices
17211790
Mode: 755
17221791
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

collector/watchdog.go

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
// Copyright 2023 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nowatchdog
15+
// +build !nowatchdog
16+
17+
package collector
18+
19+
import (
20+
"fmt"
21+
22+
"github.com/go-kit/log"
23+
"github.com/prometheus/client_golang/prometheus"
24+
"github.com/prometheus/procfs/sysfs"
25+
)
26+
27+
type watchdogCollector struct {
28+
fs sysfs.FS
29+
logger log.Logger
30+
}
31+
32+
func init() {
33+
registerCollector("watchdog", defaultDisabled, NewWatchdogCollector)
34+
}
35+
36+
// NewWatchdogCollector returns a new Collector exposing watchdog stats.
37+
func NewWatchdogCollector(logger log.Logger) (Collector, error) {
38+
fs, err := sysfs.NewFS(*sysPath)
39+
if err != nil {
40+
return nil, fmt.Errorf("failed to open procfs: %w", err)
41+
}
42+
43+
return &watchdogCollector{
44+
fs: fs,
45+
logger: logger,
46+
}, nil
47+
}
48+
49+
var (
50+
watchdogBootstatusDesc = prometheus.NewDesc(
51+
prometheus.BuildFQName(namespace, "watchdog", "bootstatus"),
52+
"Value of /sys/class/watchdog/<watchdog>/bootstatus",
53+
[]string{"name"}, nil,
54+
)
55+
watchdogFwVersionDesc = prometheus.NewDesc(
56+
prometheus.BuildFQName(namespace, "watchdog", "fw_version"),
57+
"Value of /sys/class/watchdog/<watchdog>/fw_version",
58+
[]string{"name"}, nil,
59+
)
60+
watchdogNowayoutDesc = prometheus.NewDesc(
61+
prometheus.BuildFQName(namespace, "watchdog", "nowayout"),
62+
"Value of /sys/class/watchdog/<watchdog>/nowayout",
63+
[]string{"name"}, nil,
64+
)
65+
watchdogTimeleftDesc = prometheus.NewDesc(
66+
prometheus.BuildFQName(namespace, "watchdog", "timeleft_seconds"),
67+
"Value of /sys/class/watchdog/<watchdog>/timeleft",
68+
[]string{"name"}, nil,
69+
)
70+
watchdogTimeoutDesc = prometheus.NewDesc(
71+
prometheus.BuildFQName(namespace, "watchdog", "timeout_seconds"),
72+
"Value of /sys/class/watchdog/<watchdog>/timeout",
73+
[]string{"name"}, nil,
74+
)
75+
watchdogPretimeoutDesc = prometheus.NewDesc(
76+
prometheus.BuildFQName(namespace, "watchdog", "pretimeout_seconds"),
77+
"Value of /sys/class/watchdog/<watchdog>/pretimeout",
78+
[]string{"name"}, nil,
79+
)
80+
watchdogAccessCs0Desc = prometheus.NewDesc(
81+
prometheus.BuildFQName(namespace, "watchdog", "access_cs0"),
82+
"Value of /sys/class/watchdog/<watchdog>/access_cs0",
83+
[]string{"name"}, nil,
84+
)
85+
watchdogInfoDesc = prometheus.NewDesc(
86+
prometheus.BuildFQName(namespace, "watchdog", "info"),
87+
"Info of /sys/class/watchdog/<watchdog>",
88+
[]string{"name", "options", "identity", "state", "status", "pretimeout_governor"}, nil,
89+
)
90+
)
91+
92+
// toLabelValue converts an optional string into a label value: the pointed-to
// string when ptr is set, and the empty string when ptr is nil.
func toLabelValue(ptr *string) string {
	var value string
	if ptr != nil {
		value = *ptr
	}
	return value
}
98+
99+
func (c *watchdogCollector) Update(ch chan<- prometheus.Metric) error {
100+
watchdogClass, err := c.fs.WatchdogClass()
101+
if err != nil {
102+
return err
103+
}
104+
105+
for _, wd := range watchdogClass {
106+
if wd.Bootstatus != nil {
107+
ch <- prometheus.MustNewConstMetric(watchdogBootstatusDesc, prometheus.GaugeValue, float64(*wd.Bootstatus), wd.Name)
108+
}
109+
if wd.FwVersion != nil {
110+
ch <- prometheus.MustNewConstMetric(watchdogFwVersionDesc, prometheus.GaugeValue, float64(*wd.FwVersion), wd.Name)
111+
}
112+
if wd.Nowayout != nil {
113+
ch <- prometheus.MustNewConstMetric(watchdogNowayoutDesc, prometheus.GaugeValue, float64(*wd.Nowayout), wd.Name)
114+
}
115+
if wd.Timeleft != nil {
116+
ch <- prometheus.MustNewConstMetric(watchdogTimeleftDesc, prometheus.GaugeValue, float64(*wd.Timeleft), wd.Name)
117+
}
118+
if wd.Timeout != nil {
119+
ch <- prometheus.MustNewConstMetric(watchdogTimeoutDesc, prometheus.GaugeValue, float64(*wd.Timeout), wd.Name)
120+
}
121+
if wd.Pretimeout != nil {
122+
ch <- prometheus.MustNewConstMetric(watchdogPretimeoutDesc, prometheus.GaugeValue, float64(*wd.Pretimeout), wd.Name)
123+
}
124+
if wd.AccessCs0 != nil {
125+
ch <- prometheus.MustNewConstMetric(watchdogAccessCs0Desc, prometheus.GaugeValue, float64(*wd.AccessCs0), wd.Name)
126+
}
127+
128+
ch <- prometheus.MustNewConstMetric(watchdogInfoDesc, prometheus.GaugeValue, 1.0,
129+
wd.Name, toLabelValue(wd.Options), toLabelValue(wd.Identity), toLabelValue(wd.State), toLabelValue(wd.Status), toLabelValue(wd.PretimeoutGovernor))
130+
}
131+
132+
return nil
133+
}

collector/watchdog_test.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
// Copyright 2023 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nowatchdog
15+
// +build !nowatchdog
16+
17+
package collector
18+
19+
import (
20+
"fmt"
21+
"os"
22+
"strings"
23+
"testing"
24+
25+
"github.com/go-kit/log"
26+
"github.com/prometheus/client_golang/prometheus"
27+
"github.com/prometheus/client_golang/prometheus/testutil"
28+
)
29+
30+
type testWatchdogCollector struct {
31+
wc Collector
32+
}
33+
34+
func (c testWatchdogCollector) Collect(ch chan<- prometheus.Metric) {
35+
c.wc.Update(ch)
36+
}
37+
38+
func (c testWatchdogCollector) Describe(ch chan<- *prometheus.Desc) {
39+
prometheus.DescribeByCollect(c, ch)
40+
}
41+
42+
func TestWatchdogStats(t *testing.T) {
43+
testcase := `# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
44+
# TYPE node_watchdog_access_cs0 gauge
45+
node_watchdog_access_cs0{name="watchdog0"} 0
46+
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
47+
# TYPE node_watchdog_bootstatus gauge
48+
node_watchdog_bootstatus{name="watchdog0"} 1
49+
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
50+
# TYPE node_watchdog_fw_version gauge
51+
node_watchdog_fw_version{name="watchdog0"} 2
52+
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
53+
# TYPE node_watchdog_info gauge
54+
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
55+
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
56+
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
57+
# TYPE node_watchdog_nowayout gauge
58+
node_watchdog_nowayout{name="watchdog0"} 0
59+
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
60+
# TYPE node_watchdog_pretimeout_seconds gauge
61+
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
62+
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
63+
# TYPE node_watchdog_timeleft_seconds gauge
64+
node_watchdog_timeleft_seconds{name="watchdog0"} 300
65+
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
66+
# TYPE node_watchdog_timeout_seconds gauge
67+
node_watchdog_timeout_seconds{name="watchdog0"} 60
68+
`
69+
*sysPath = "fixtures/sys"
70+
71+
logger := log.NewLogfmtLogger(os.Stderr)
72+
c, err := NewWatchdogCollector(logger)
73+
if err != nil {
74+
t.Fatal(err)
75+
}
76+
reg := prometheus.NewRegistry()
77+
reg.MustRegister(&testWatchdogCollector{wc: c})
78+
79+
sink := make(chan prometheus.Metric)
80+
go func() {
81+
err = c.Update(sink)
82+
if err != nil {
83+
panic(fmt.Errorf("failed to update collector: %s", err))
84+
}
85+
close(sink)
86+
}()
87+
88+
err = testutil.GatherAndCompare(reg, strings.NewReader(testcase))
89+
if err != nil {
90+
t.Fatal(err)
91+
}
92+
}

end-to-end-test.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ enabled_collectors=$(cat << COLLECTORS
5050
thermal_zone
5151
udp_queues
5252
vmstat
53+
watchdog
5354
wifi
5455
xfrm
5556
xfs

0 commit comments

Comments
 (0)