Skip to content

Commit 6c99640

Browse files
committed
Add new collector and metrics for watchdog (#2309)
Signed-off-by: Gavin Lam <[email protected]>
1 parent 5e412a6 commit 6c99640

File tree

7 files changed

+348
-0
lines changed

7 files changed

+348
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,7 @@ softirqs | Exposes detailed softirq statistics from `/proc/softirqs`. | Linux
204204
sysctl | Expose sysctl values from `/proc/sys`. Use `--collector.sysctl.include(-info)` to configure. | Linux
205205
systemd | Exposes service and system status from [systemd](http://www.freedesktop.org/wiki/Software/systemd/). | Linux
206206
tcpstat | Exposes TCP connection status information from `/proc/net/tcp` and `/proc/net/tcp6`. (Warning: the current version has potential performance issues in high load situations.) | Linux
207+
watchdog | Exposes statistics from `/sys/class/watchdog` | Linux
207208
wifi | Exposes WiFi device and station statistics. | Linux
208209
xfrm | Exposes statistics from `/proc/net/xfrm_stat` | Linux
209210
zoneinfo | Exposes NUMA memory zone metrics. | Linux

collector/fixtures/e2e-64k-page-output.txt

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2945,6 +2945,7 @@ node_scrape_collector_success{collector="thermal_zone"} 1
29452945
node_scrape_collector_success{collector="time"} 1
29462946
node_scrape_collector_success{collector="udp_queues"} 1
29472947
node_scrape_collector_success{collector="vmstat"} 1
2948+
node_scrape_collector_success{collector="watchdog"} 1
29482949
node_scrape_collector_success{collector="wifi"} 1
29492950
node_scrape_collector_success{collector="xfrm"} 1
29502951
node_scrape_collector_success{collector="xfs"} 1
@@ -3218,6 +3219,31 @@ node_vmstat_pswpin 1476
32183219
# HELP node_vmstat_pswpout /proc/vmstat information field pswpout.
32193220
# TYPE node_vmstat_pswpout untyped
32203221
node_vmstat_pswpout 35045
3222+
# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
3223+
# TYPE node_watchdog_access_cs0 gauge
3224+
node_watchdog_access_cs0{name="watchdog0"} 0
3225+
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
3226+
# TYPE node_watchdog_bootstatus gauge
3227+
node_watchdog_bootstatus{name="watchdog0"} 1
3228+
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
3229+
# TYPE node_watchdog_fw_version gauge
3230+
node_watchdog_fw_version{name="watchdog0"} 2
3231+
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
3232+
# TYPE node_watchdog_info gauge
3233+
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
3234+
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
3235+
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
3236+
# TYPE node_watchdog_nowayout gauge
3237+
node_watchdog_nowayout{name="watchdog0"} 0
3238+
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
3239+
# TYPE node_watchdog_pretimeout_seconds gauge
3240+
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
3241+
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
3242+
# TYPE node_watchdog_timeleft_seconds gauge
3243+
node_watchdog_timeleft_seconds{name="watchdog0"} 300
3244+
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
3245+
# TYPE node_watchdog_timeout_seconds gauge
3246+
node_watchdog_timeout_seconds{name="watchdog0"} 60
32213247
# HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz.
32223248
# TYPE node_wifi_interface_frequency_hertz gauge
32233249
node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09

collector/fixtures/e2e-output.txt

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2967,6 +2967,7 @@ node_scrape_collector_success{collector="thermal_zone"} 1
29672967
node_scrape_collector_success{collector="time"} 1
29682968
node_scrape_collector_success{collector="udp_queues"} 1
29692969
node_scrape_collector_success{collector="vmstat"} 1
2970+
node_scrape_collector_success{collector="watchdog"} 1
29702971
node_scrape_collector_success{collector="wifi"} 1
29712972
node_scrape_collector_success{collector="xfrm"} 1
29722973
node_scrape_collector_success{collector="xfs"} 1
@@ -3240,6 +3241,31 @@ node_vmstat_pswpin 1476
32403241
# HELP node_vmstat_pswpout /proc/vmstat information field pswpout.
32413242
# TYPE node_vmstat_pswpout untyped
32423243
node_vmstat_pswpout 35045
3244+
# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
3245+
# TYPE node_watchdog_access_cs0 gauge
3246+
node_watchdog_access_cs0{name="watchdog0"} 0
3247+
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
3248+
# TYPE node_watchdog_bootstatus gauge
3249+
node_watchdog_bootstatus{name="watchdog0"} 1
3250+
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
3251+
# TYPE node_watchdog_fw_version gauge
3252+
node_watchdog_fw_version{name="watchdog0"} 2
3253+
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
3254+
# TYPE node_watchdog_info gauge
3255+
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
3256+
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
3257+
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
3258+
# TYPE node_watchdog_nowayout gauge
3259+
node_watchdog_nowayout{name="watchdog0"} 0
3260+
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
3261+
# TYPE node_watchdog_pretimeout_seconds gauge
3262+
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
3263+
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
3264+
# TYPE node_watchdog_timeleft_seconds gauge
3265+
node_watchdog_timeleft_seconds{name="watchdog0"} 300
3266+
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
3267+
# TYPE node_watchdog_timeout_seconds gauge
3268+
node_watchdog_timeout_seconds{name="watchdog0"} 60
32433269
# HELP node_wifi_interface_frequency_hertz The current frequency a WiFi interface is operating at, in hertz.
32443270
# TYPE node_wifi_interface_frequency_hertz gauge
32453271
node_wifi_interface_frequency_hertz{device="wlan0"} 2.412e+09

collector/fixtures/sys.ttar

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1717,6 +1717,75 @@ SymlinkTo: ../../devices/virtual/thermal/cooling_device0
17171717
Path: sys/class/thermal/thermal_zone0
17181718
SymlinkTo: ../../devices/virtual/thermal/thermal_zone0
17191719
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1720+
Directory: sys/class/watchdog
1721+
Mode: 775
1722+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1723+
Directory: sys/class/watchdog/watchdog0
1724+
Mode: 775
1725+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1726+
Path: sys/class/watchdog/watchdog0/access_cs0
1727+
Lines: 1
1728+
0EOF
1729+
Mode: 644
1730+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1731+
Path: sys/class/watchdog/watchdog0/bootstatus
1732+
Lines: 1
1733+
1EOF
1734+
Mode: 444
1735+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1736+
Path: sys/class/watchdog/watchdog0/fw_version
1737+
Lines: 1
1738+
2EOF
1739+
Mode: 444
1740+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1741+
Path: sys/class/watchdog/watchdog0/identity
1742+
Lines: 1
1743+
Software WatchdogEOF
1744+
Mode: 444
1745+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1746+
Path: sys/class/watchdog/watchdog0/nowayout
1747+
Lines: 1
1748+
0EOF
1749+
Mode: 644
1750+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1751+
Path: sys/class/watchdog/watchdog0/options
1752+
Lines: 1
1753+
0x8380EOF
1754+
Mode: 444
1755+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1756+
Path: sys/class/watchdog/watchdog0/pretimeout
1757+
Lines: 1
1758+
120EOF
1759+
Mode: 444
1760+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1761+
Path: sys/class/watchdog/watchdog0/pretimeout_governor
1762+
Lines: 1
1763+
noopEOF
1764+
Mode: 644
1765+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1766+
Path: sys/class/watchdog/watchdog0/state
1767+
Lines: 1
1768+
activeEOF
1769+
Mode: 444
1770+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1771+
Path: sys/class/watchdog/watchdog0/status
1772+
Lines: 1
1773+
0x8000EOF
1774+
Mode: 444
1775+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1776+
Path: sys/class/watchdog/watchdog0/timeleft
1777+
Lines: 1
1778+
300EOF
1779+
Mode: 444
1780+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1781+
Path: sys/class/watchdog/watchdog0/timeout
1782+
Lines: 1
1783+
60EOF
1784+
Mode: 444
1785+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1786+
Directory: sys/class/watchdog/watchdog1
1787+
Mode: 775
1788+
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
17201789
Directory: sys/devices
17211790
Mode: 755
17221791
# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

collector/watchdog.go

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
// Copyright 2023 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nowatchdog
15+
// +build !nowatchdog
16+
17+
package collector
18+
19+
import (
20+
"fmt"
21+
22+
"github.com/go-kit/log"
23+
"github.com/prometheus/client_golang/prometheus"
24+
"github.com/prometheus/procfs/sysfs"
25+
)
26+
27+
type watchdogCollector struct {
28+
fs sysfs.FS
29+
logger log.Logger
30+
}
31+
32+
func init() {
33+
registerCollector("watchdog", defaultDisabled, NewWatchdogCollector)
34+
}
35+
36+
// NewWatchdogCollector returns a new Collector exposing watchdog stats.
37+
func NewWatchdogCollector(logger log.Logger) (Collector, error) {
38+
fs, err := sysfs.NewFS(*sysPath)
39+
if err != nil {
40+
return nil, fmt.Errorf("failed to open procfs: %w", err)
41+
}
42+
43+
return &watchdogCollector{
44+
fs: fs,
45+
logger: logger,
46+
}, nil
47+
}
48+
49+
var (
50+
watchdogBootstatusDesc = prometheus.NewDesc(
51+
prometheus.BuildFQName(namespace, "watchdog", "bootstatus"),
52+
"Value of /sys/class/watchdog/<watchdog>/bootstatus",
53+
[]string{"name"}, nil,
54+
)
55+
watchdogFwVersionDesc = prometheus.NewDesc(
56+
prometheus.BuildFQName(namespace, "watchdog", "fw_version"),
57+
"Value of /sys/class/watchdog/<watchdog>/fw_version",
58+
[]string{"name"}, nil,
59+
)
60+
watchdogNowayoutDesc = prometheus.NewDesc(
61+
prometheus.BuildFQName(namespace, "watchdog", "nowayout"),
62+
"Value of /sys/class/watchdog/<watchdog>/nowayout",
63+
[]string{"name"}, nil,
64+
)
65+
watchdogTimeleftDesc = prometheus.NewDesc(
66+
prometheus.BuildFQName(namespace, "watchdog", "timeleft_seconds"),
67+
"Value of /sys/class/watchdog/<watchdog>/timeleft",
68+
[]string{"name"}, nil,
69+
)
70+
watchdogTimeoutDesc = prometheus.NewDesc(
71+
prometheus.BuildFQName(namespace, "watchdog", "timeout_seconds"),
72+
"Value of /sys/class/watchdog/<watchdog>/timeout",
73+
[]string{"name"}, nil,
74+
)
75+
watchdogPretimeoutDesc = prometheus.NewDesc(
76+
prometheus.BuildFQName(namespace, "watchdog", "pretimeout_seconds"),
77+
"Value of /sys/class/watchdog/<watchdog>/pretimeout",
78+
[]string{"name"}, nil,
79+
)
80+
watchdogAccessCs0Desc = prometheus.NewDesc(
81+
prometheus.BuildFQName(namespace, "watchdog", "access_cs0"),
82+
"Value of /sys/class/watchdog/<watchdog>/access_cs0",
83+
[]string{"name"}, nil,
84+
)
85+
watchdogInfoDesc = prometheus.NewDesc(
86+
prometheus.BuildFQName(namespace, "watchdog", "info"),
87+
"Info of /sys/class/watchdog/<watchdog>",
88+
[]string{"name", "options", "identity", "state", "status", "pretimeout_governor"}, nil,
89+
)
90+
)
91+
92+
// toLabelValue turns an optional string into a label value: it returns the
// pointed-to string, or the empty string when ptr is nil.
func toLabelValue(ptr *string) string {
	if ptr != nil {
		return *ptr
	}
	return ""
}
98+
99+
func (c *watchdogCollector) Update(ch chan<- prometheus.Metric) error {
100+
watchdogClass, err := c.fs.WatchdogClass()
101+
if err != nil {
102+
return err
103+
}
104+
105+
for _, wd := range watchdogClass {
106+
if wd.Bootstatus != nil {
107+
ch <- prometheus.MustNewConstMetric(watchdogBootstatusDesc, prometheus.GaugeValue, float64(*wd.Bootstatus), wd.Name)
108+
}
109+
if wd.FwVersion != nil {
110+
ch <- prometheus.MustNewConstMetric(watchdogFwVersionDesc, prometheus.GaugeValue, float64(*wd.FwVersion), wd.Name)
111+
}
112+
if wd.Nowayout != nil {
113+
ch <- prometheus.MustNewConstMetric(watchdogNowayoutDesc, prometheus.GaugeValue, float64(*wd.Nowayout), wd.Name)
114+
}
115+
if wd.Timeleft != nil {
116+
ch <- prometheus.MustNewConstMetric(watchdogTimeleftDesc, prometheus.GaugeValue, float64(*wd.Timeleft), wd.Name)
117+
}
118+
if wd.Timeout != nil {
119+
ch <- prometheus.MustNewConstMetric(watchdogTimeoutDesc, prometheus.GaugeValue, float64(*wd.Timeout), wd.Name)
120+
}
121+
if wd.Pretimeout != nil {
122+
ch <- prometheus.MustNewConstMetric(watchdogPretimeoutDesc, prometheus.GaugeValue, float64(*wd.Pretimeout), wd.Name)
123+
}
124+
if wd.AccessCs0 != nil {
125+
ch <- prometheus.MustNewConstMetric(watchdogAccessCs0Desc, prometheus.GaugeValue, float64(*wd.AccessCs0), wd.Name)
126+
}
127+
128+
ch <- prometheus.MustNewConstMetric(watchdogInfoDesc, prometheus.GaugeValue, 1.0,
129+
wd.Name, toLabelValue(wd.Options), toLabelValue(wd.Identity), toLabelValue(wd.State), toLabelValue(wd.Status), toLabelValue(wd.PretimeoutGovernor))
130+
}
131+
132+
return nil
133+
}

collector/watchdog_test.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
// Copyright 2023 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
//go:build !nowatchdog
15+
// +build !nowatchdog
16+
17+
package collector
18+
19+
import (
20+
"fmt"
21+
"os"
22+
"strings"
23+
"testing"
24+
25+
"github.com/go-kit/log"
26+
"github.com/prometheus/client_golang/prometheus"
27+
"github.com/prometheus/client_golang/prometheus/testutil"
28+
)
29+
30+
type testWatchdogCollector struct {
31+
wc Collector
32+
}
33+
34+
func (c testWatchdogCollector) Collect(ch chan<- prometheus.Metric) {
35+
c.wc.Update(ch)
36+
}
37+
38+
func (c testWatchdogCollector) Describe(ch chan<- *prometheus.Desc) {
39+
prometheus.DescribeByCollect(c, ch)
40+
}
41+
42+
func TestWatchdogStats(t *testing.T) {
43+
testcase := `# HELP node_watchdog_access_cs0 Value of /sys/class/watchdog/<watchdog>/access_cs0
44+
# TYPE node_watchdog_access_cs0 gauge
45+
node_watchdog_access_cs0{name="watchdog0"} 0
46+
# HELP node_watchdog_bootstatus Value of /sys/class/watchdog/<watchdog>/bootstatus
47+
# TYPE node_watchdog_bootstatus gauge
48+
node_watchdog_bootstatus{name="watchdog0"} 1
49+
# HELP node_watchdog_fw_version Value of /sys/class/watchdog/<watchdog>/fw_version
50+
# TYPE node_watchdog_fw_version gauge
51+
node_watchdog_fw_version{name="watchdog0"} 2
52+
# HELP node_watchdog_info Info of /sys/class/watchdog/<watchdog>
53+
# TYPE node_watchdog_info gauge
54+
node_watchdog_info{identity="",name="watchdog1",options="",pretimeout_governor="",state="",status=""} 1
55+
node_watchdog_info{identity="Software Watchdog",name="watchdog0",options="0x8380",pretimeout_governor="noop",state="active",status="0x8000"} 1
56+
# HELP node_watchdog_nowayout Value of /sys/class/watchdog/<watchdog>/nowayout
57+
# TYPE node_watchdog_nowayout gauge
58+
node_watchdog_nowayout{name="watchdog0"} 0
59+
# HELP node_watchdog_pretimeout_seconds Value of /sys/class/watchdog/<watchdog>/pretimeout
60+
# TYPE node_watchdog_pretimeout_seconds gauge
61+
node_watchdog_pretimeout_seconds{name="watchdog0"} 120
62+
# HELP node_watchdog_timeleft_seconds Value of /sys/class/watchdog/<watchdog>/timeleft
63+
# TYPE node_watchdog_timeleft_seconds gauge
64+
node_watchdog_timeleft_seconds{name="watchdog0"} 300
65+
# HELP node_watchdog_timeout_seconds Value of /sys/class/watchdog/<watchdog>/timeout
66+
# TYPE node_watchdog_timeout_seconds gauge
67+
node_watchdog_timeout_seconds{name="watchdog0"} 60
68+
`
69+
*sysPath = "fixtures/sys"
70+
71+
logger := log.NewLogfmtLogger(os.Stderr)
72+
c, err := NewWatchdogCollector(logger)
73+
if err != nil {
74+
t.Fatal(err)
75+
}
76+
reg := prometheus.NewRegistry()
77+
reg.MustRegister(&testWatchdogCollector{wc: c})
78+
79+
sink := make(chan prometheus.Metric)
80+
go func() {
81+
err = c.Update(sink)
82+
if err != nil {
83+
panic(fmt.Errorf("failed to update collector: %s", err))
84+
}
85+
close(sink)
86+
}()
87+
88+
err = testutil.GatherAndCompare(reg, strings.NewReader(testcase))
89+
if err != nil {
90+
t.Fatal(err)
91+
}
92+
}

end-to-end-test.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ enabled_collectors=$(cat << COLLECTORS
4949
thermal_zone
5050
udp_queues
5151
vmstat
52+
watchdog
5253
wifi
5354
xfrm
5455
xfs

0 commit comments

Comments
 (0)