Skip to content

Commit 5f110df

Browse files
authored
Add initial support for monitoring GPUs on Linux (#1998)
Expose GPU metrics using `sysfs/drm`. `amdgpu` is the only driver which exposes this information through DRM. Signed-off-by: Siavash Safi <[email protected]>
1 parent f61be48 commit 5f110df

File tree

1 file changed

+143
-0
lines changed

1 file changed

+143
-0
lines changed

collector/drm_linux.go

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
// Copyright 2021 The Prometheus Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
// +build !nogpu
15+
16+
package collector
17+
18+
import (
19+
"fmt"
20+
21+
"github.com/go-kit/log"
22+
"github.com/prometheus/client_golang/prometheus"
23+
"github.com/prometheus/procfs/sysfs"
24+
)
25+
26+
// drmCollectorSubsystem is the subsystem component used when building the
// fully-qualified names of the metrics exposed by the drm collector.
const drmCollectorSubsystem = "drm"
29+
30+
type drmCollector struct {
31+
fs sysfs.FS
32+
logger log.Logger
33+
CardInfo *prometheus.Desc
34+
GPUBusyPercent *prometheus.Desc
35+
MemoryGTTSize *prometheus.Desc
36+
MemoryGTTUsed *prometheus.Desc
37+
MemoryVisibleVRAMSize *prometheus.Desc
38+
MemoryVisibleVRAMUsed *prometheus.Desc
39+
MemoryVRAMSize *prometheus.Desc
40+
MemoryVRAMUsed *prometheus.Desc
41+
}
42+
43+
func init() {
44+
registerCollector("drm", defaultDisabled, NewDrmCollector)
45+
}
46+
47+
// NewDrmCollector returns a new Collector exposing /sys/class/drm/card?/device stats.
48+
func NewDrmCollector(logger log.Logger) (Collector, error) {
49+
fs, err := sysfs.NewFS(*sysPath)
50+
if err != nil {
51+
return nil, fmt.Errorf("failed to open sysfs: %w", err)
52+
}
53+
54+
return &drmCollector{
55+
fs: fs,
56+
logger: logger,
57+
CardInfo: prometheus.NewDesc(
58+
prometheus.BuildFQName(namespace, drmCollectorSubsystem, "card_info"),
59+
"Card information",
60+
[]string{"card", "memory_vendor", "power_performance_level", "unique_id", "vendor"}, nil,
61+
),
62+
GPUBusyPercent: prometheus.NewDesc(
63+
prometheus.BuildFQName(namespace, drmCollectorSubsystem, "gpu_busy_percent"),
64+
"How busy the GPU is as a percentage.",
65+
[]string{"card"}, nil,
66+
),
67+
MemoryGTTSize: prometheus.NewDesc(
68+
prometheus.BuildFQName(namespace, drmCollectorSubsystem, "memory_gtt_size_bytes"),
69+
"The size of the graphics translation table (GTT) block in bytes.",
70+
[]string{"card"}, nil,
71+
),
72+
MemoryGTTUsed: prometheus.NewDesc(
73+
prometheus.BuildFQName(namespace, drmCollectorSubsystem, "memory_gtt_used_bytes"),
74+
"The used amount of the graphics translation table (GTT) block in bytes.",
75+
[]string{"card"}, nil,
76+
),
77+
MemoryVisibleVRAMSize: prometheus.NewDesc(
78+
prometheus.BuildFQName(namespace, drmCollectorSubsystem, "memory_vis_vram_size_bytes"),
79+
"The size of visible VRAM in bytes.",
80+
[]string{"card"}, nil,
81+
),
82+
MemoryVisibleVRAMUsed: prometheus.NewDesc(
83+
prometheus.BuildFQName(namespace, drmCollectorSubsystem, "memory_vis_vram_used_bytes"),
84+
"The used amount of visible VRAM in bytes.",
85+
[]string{"card"}, nil,
86+
),
87+
MemoryVRAMSize: prometheus.NewDesc(
88+
prometheus.BuildFQName(namespace, drmCollectorSubsystem, "memory_vram_size_bytes"),
89+
"The size of VRAM in bytes.",
90+
[]string{"card"}, nil,
91+
),
92+
MemoryVRAMUsed: prometheus.NewDesc(
93+
prometheus.BuildFQName(namespace, drmCollectorSubsystem, "memory_vram_used_bytes"),
94+
"The used amount of VRAM in bytes.",
95+
[]string{"card"}, nil,
96+
),
97+
}, nil
98+
}
99+
100+
func (c *drmCollector) Update(ch chan<- prometheus.Metric) error {
101+
if err := c.updateAMDCards(ch); err != nil {
102+
return err
103+
}
104+
105+
return nil
106+
}
107+
108+
func (c *drmCollector) updateAMDCards(ch chan<- prometheus.Metric) error {
109+
vendor := "amd"
110+
stats, err := c.fs.ClassDRMCardAMDGPUStats()
111+
if err != nil {
112+
return err
113+
}
114+
115+
for _, s := range stats {
116+
ch <- prometheus.MustNewConstMetric(
117+
c.CardInfo, prometheus.GaugeValue, 1,
118+
s.Name, s.MemoryVRAMVendor, s.PowerDPMForcePerformanceLevel, s.UniqueID, vendor)
119+
120+
ch <- prometheus.MustNewConstMetric(
121+
c.GPUBusyPercent, prometheus.GaugeValue, float64(s.GPUBusyPercent), s.Name)
122+
123+
ch <- prometheus.MustNewConstMetric(
124+
c.MemoryGTTSize, prometheus.GaugeValue, float64(s.MemoryGTTSize), s.Name)
125+
126+
ch <- prometheus.MustNewConstMetric(
127+
c.MemoryGTTUsed, prometheus.GaugeValue, float64(s.MemoryGTTUsed), s.Name)
128+
129+
ch <- prometheus.MustNewConstMetric(
130+
c.MemoryVRAMSize, prometheus.GaugeValue, float64(s.MemoryVRAMSize), s.Name)
131+
132+
ch <- prometheus.MustNewConstMetric(
133+
c.MemoryVRAMUsed, prometheus.GaugeValue, float64(s.MemoryVRAMUsed), s.Name)
134+
135+
ch <- prometheus.MustNewConstMetric(
136+
c.MemoryVisibleVRAMSize, prometheus.GaugeValue, float64(s.MemoryVisibleVRAMSize), s.Name)
137+
138+
ch <- prometheus.MustNewConstMetric(
139+
c.MemoryVisibleVRAMUsed, prometheus.GaugeValue, float64(s.MemoryVisibleVRAMUsed), s.Name)
140+
}
141+
142+
return nil
143+
}

0 commit comments

Comments
 (0)