Skip to content

Commit d7f29b7

Browse files
authored
Add new Tensor Core and GPU capacity metrics (#1814)
1 parent 4c3550e commit d7f29b7

File tree

6 files changed

+75
-23
lines changed

6 files changed

+75
-23
lines changed

internal/containerinsightscommon/const.go

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -83,16 +83,17 @@ const (
8383
DiskIOWrite = "Write"
8484
DiskIOTotal = "Total"
8585

86-
GpuUtilization = "gpu_utilization"
87-
GpuMemUtilization = "gpu_memory_utilization"
88-
GpuMemUsed = "gpu_memory_used"
89-
GpuMemTotal = "gpu_memory_total"
90-
GpuTemperature = "gpu_temperature"
91-
GpuPowerDraw = "gpu_power_draw"
92-
GpuRequest = "gpu_request"
93-
GpuLimit = "gpu_limit"
94-
GpuTotal = "gpu_total"
95-
GpuUniqueId = "UUID"
86+
GpuUtilization = "gpu_utilization"
87+
GpuMemUtilization = "gpu_memory_utilization"
88+
GpuMemUsed = "gpu_memory_used"
89+
GpuMemTotal = "gpu_memory_total"
90+
GpuTemperature = "gpu_temperature"
91+
GpuPowerDraw = "gpu_power_draw"
92+
GpuRequest = "gpu_request"
93+
GpuLimit = "gpu_limit"
94+
GpuTotal = "gpu_total"
95+
GpuUniqueID = "UUID"
96+
GpuTensorCoreUtilization = "gpu_tensor_core_utilization"
9697

9798
NeuronCoreUtilization = "neuroncore_utilization"
9899
NeuronCoreMemoryUtilizationTotal = "neuroncore_memory_usage_total"

plugins/processors/gpuattributes/internal/metricFilters/gpumetricfilters.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,7 @@ var ContainerGpuLabelFilter = map[string]map[string]interface{}{
3030
containerinsightscommon.FullPodNameKey: nil,
3131
containerinsightscommon.PodNameKey: nil,
3232
containerinsightscommon.TypeService: nil,
33-
containerinsightscommon.GpuUniqueId: nil,
33+
containerinsightscommon.GpuUniqueID: nil,
3434
containerinsightscommon.ContainerNamekey: nil,
3535
containerinsightscommon.InstanceTypeKey: nil,
3636
containerinsightscommon.VersionKey: nil,
@@ -57,7 +57,7 @@ var PodGpuLabelFilter = map[string]map[string]interface{}{
5757
containerinsightscommon.FullPodNameKey: nil,
5858
containerinsightscommon.PodNameKey: nil,
5959
containerinsightscommon.TypeService: nil,
60-
containerinsightscommon.GpuUniqueId: nil,
60+
containerinsightscommon.GpuUniqueID: nil,
6161
containerinsightscommon.InstanceTypeKey: nil,
6262
containerinsightscommon.VersionKey: nil,
6363
containerinsightscommon.SourcesKey: nil,

translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml

Lines changed: 47 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -185,6 +185,8 @@ exporters:
185185
- node_gpu_limit
186186
- node_gpu_usage_total
187187
- node_gpu_reserved_capacity
188+
- node_gpu_unreserved_capacity
189+
- node_gpu_available_capacity
188190
- dimensions:
189191
- - ClusterName
190192
- InstanceId
@@ -346,6 +348,7 @@ exporters:
346348
- container_gpu_memory_used
347349
- container_gpu_power_draw
348350
- container_gpu_temperature
351+
- container_gpu_tensor_core_utilization
349352
- dimensions:
350353
- - ClusterName
351354
- - ClusterName
@@ -372,6 +375,7 @@ exporters:
372375
- pod_gpu_memory_used
373376
- pod_gpu_power_draw
374377
- pod_gpu_temperature
378+
- pod_gpu_tensor_core_utilization
375379
- dimensions:
376380
- - ClusterName
377381
- - ClusterName
@@ -389,6 +393,7 @@ exporters:
389393
- node_gpu_memory_used
390394
- node_gpu_power_draw
391395
- node_gpu_temperature
396+
- node_gpu_tensor_core_utilization
392397
- dimensions:
393398
- - ClusterName
394399
- - ClusterName
@@ -1226,6 +1231,48 @@ processors:
12261231
new_label: Type
12271232
new_value: NodeGPU
12281233
submatch_case: ""
1234+
- action: insert
1235+
aggregation_type: ""
1236+
include: DCGM_FI_PROF_PIPE_TENSOR_ACTIVE
1237+
match_type: ""
1238+
new_name: container_gpu_tensor_core_utilization
1239+
operations:
1240+
- action: add_label
1241+
aggregation_type: ""
1242+
experimental_scale: 0
1243+
label: ""
1244+
label_value: ""
1245+
new_label: Type
1246+
new_value: ContainerGPU
1247+
submatch_case: ""
1248+
- action: insert
1249+
aggregation_type: ""
1250+
include: DCGM_FI_PROF_PIPE_TENSOR_ACTIVE
1251+
match_type: ""
1252+
new_name: pod_gpu_tensor_core_utilization
1253+
operations:
1254+
- action: add_label
1255+
aggregation_type: ""
1256+
experimental_scale: 0
1257+
label: ""
1258+
label_value: ""
1259+
new_label: Type
1260+
new_value: PodGPU
1261+
submatch_case: ""
1262+
- action: insert
1263+
aggregation_type: ""
1264+
include: DCGM_FI_PROF_PIPE_TENSOR_ACTIVE
1265+
match_type: ""
1266+
new_name: node_gpu_tensor_core_utilization
1267+
operations:
1268+
- action: add_label
1269+
aggregation_type: ""
1270+
experimental_scale: 0
1271+
label: ""
1272+
label_value: ""
1273+
new_label: Type
1274+
new_value: NodeGPU
1275+
submatch_case: ""
12291276
- action: update
12301277
aggregation_type: ""
12311278
include: execution_status_total

translator/translate/otel/exporter/awsemf/kubernetes.go

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -154,7 +154,7 @@ func getNodeMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDecla
154154
"node_status_capacity_pods", "node_status_allocatable_pods",
155155
}
156156
if awscontainerinsight.AcceleratedComputeMetricsEnabled(conf) {
157-
nodeMetrics = append(nodeMetrics, "node_gpu_limit", "node_gpu_usage_total", "node_gpu_reserved_capacity")
157+
nodeMetrics = append(nodeMetrics, "node_gpu_limit", "node_gpu_usage_total", "node_gpu_reserved_capacity", "node_gpu_unreserved_capacity", "node_gpu_available_capacity")
158158
}
159159
if enhancedContainerInsightsEnabled {
160160
return []*awsemfexporter.MetricDeclaration{
@@ -491,6 +491,7 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar
491491
"container_gpu_memory_used",
492492
"container_gpu_power_draw",
493493
"container_gpu_temperature",
494+
"container_gpu_tensor_core_utilization",
494495
},
495496
},
496497
{
@@ -502,6 +503,7 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar
502503
"pod_gpu_memory_used",
503504
"pod_gpu_power_draw",
504505
"pod_gpu_temperature",
506+
"pod_gpu_tensor_core_utilization",
505507
},
506508
},
507509
{
@@ -513,6 +515,7 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar
513515
"node_gpu_memory_used",
514516
"node_gpu_power_draw",
515517
"node_gpu_temperature",
518+
"node_gpu_tensor_core_utilization",
516519
},
517520
},
518521
}...)

translator/translate/otel/exporter/awsemf/translator_test.go

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -320,7 +320,7 @@ func TestTranslator(t *testing.T) {
320320
"node_cpu_usage_total", "node_cpu_limit", "node_memory_working_set", "node_memory_limit",
321321
"node_status_condition_ready", "node_status_condition_disk_pressure", "node_status_condition_memory_pressure",
322322
"node_status_condition_pid_pressure", "node_status_condition_network_unavailable", "node_status_condition_unknown",
323-
"node_status_capacity_pods", "node_status_allocatable_pods", "node_gpu_limit", "node_gpu_usage_total", "node_gpu_reserved_capacity"},
323+
"node_status_capacity_pods", "node_status_allocatable_pods", "node_gpu_limit", "node_gpu_usage_total", "node_gpu_reserved_capacity", "node_gpu_unreserved_capacity", "node_gpu_available_capacity"},
324324
},
325325
{
326326
Dimensions: [][]string{
@@ -405,19 +405,19 @@ func TestTranslator(t *testing.T) {
405405
{
406406
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "GpuDevice"}},
407407
MetricNameSelectors: []string{
408-
"container_gpu_utilization", "container_gpu_memory_utilization", "container_gpu_memory_total", "container_gpu_memory_used", "container_gpu_power_draw", "container_gpu_temperature",
408+
"container_gpu_utilization", "container_gpu_memory_utilization", "container_gpu_memory_total", "container_gpu_memory_used", "container_gpu_power_draw", "container_gpu_temperature", "container_gpu_tensor_core_utilization",
409409
},
410410
},
411411
{
412412
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "GpuDevice"}},
413413
MetricNameSelectors: []string{
414-
"pod_gpu_utilization", "pod_gpu_memory_utilization", "pod_gpu_memory_total", "pod_gpu_memory_used", "pod_gpu_power_draw", "pod_gpu_temperature",
414+
"pod_gpu_utilization", "pod_gpu_memory_utilization", "pod_gpu_memory_total", "pod_gpu_memory_used", "pod_gpu_power_draw", "pod_gpu_temperature", "pod_gpu_tensor_core_utilization",
415415
},
416416
},
417417
{
418418
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId"}, {"ClusterName", "NodeName", "InstanceId", "InstanceType", "GpuDevice"}},
419419
MetricNameSelectors: []string{
420-
"node_gpu_utilization", "node_gpu_memory_utilization", "node_gpu_memory_total", "node_gpu_memory_used", "node_gpu_power_draw", "node_gpu_temperature",
420+
"node_gpu_utilization", "node_gpu_memory_utilization", "node_gpu_memory_total", "node_gpu_memory_used", "node_gpu_power_draw", "node_gpu_temperature", "node_gpu_tensor_core_utilization",
421421
},
422422
},
423423
{

translator/translate/otel/processor/metricstransformprocessor/translator.go

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -30,12 +30,13 @@ var metricDuplicateTypes = []string{
3030
}
3131

3232
var renameMapForDcgm = map[string]string{
33-
"DCGM_FI_DEV_GPU_UTIL": containerinsightscommon.GpuUtilization,
34-
"DCGM_FI_DEV_FB_USED_PERCENT": containerinsightscommon.GpuMemUtilization,
35-
"DCGM_FI_DEV_FB_USED": containerinsightscommon.GpuMemUsed,
36-
"DCGM_FI_DEV_FB_TOTAL": containerinsightscommon.GpuMemTotal,
37-
"DCGM_FI_DEV_GPU_TEMP": containerinsightscommon.GpuTemperature,
38-
"DCGM_FI_DEV_POWER_USAGE": containerinsightscommon.GpuPowerDraw,
33+
"DCGM_FI_DEV_GPU_UTIL": containerinsightscommon.GpuUtilization,
34+
"DCGM_FI_DEV_FB_USED_PERCENT": containerinsightscommon.GpuMemUtilization,
35+
"DCGM_FI_DEV_FB_USED": containerinsightscommon.GpuMemUsed,
36+
"DCGM_FI_DEV_FB_TOTAL": containerinsightscommon.GpuMemTotal,
37+
"DCGM_FI_DEV_GPU_TEMP": containerinsightscommon.GpuTemperature,
38+
"DCGM_FI_DEV_POWER_USAGE": containerinsightscommon.GpuPowerDraw,
39+
"DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": containerinsightscommon.GpuTensorCoreUtilization,
3940
}
4041

4142
var renameMapForNeuronMonitor = map[string]string{

0 commit comments

Comments
 (0)