Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions internal/containerinsightscommon/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,17 @@ const (
DiskIOWrite = "Write"
DiskIOTotal = "Total"

GpuUtilization = "gpu_utilization"
GpuMemUtilization = "gpu_memory_utilization"
GpuMemUsed = "gpu_memory_used"
GpuMemTotal = "gpu_memory_total"
GpuTemperature = "gpu_temperature"
GpuPowerDraw = "gpu_power_draw"
GpuRequest = "gpu_request"
GpuLimit = "gpu_limit"
GpuTotal = "gpu_total"
GpuUniqueId = "UUID"
GpuUtilization = "gpu_utilization"
GpuMemUtilization = "gpu_memory_utilization"
GpuMemUsed = "gpu_memory_used"
GpuMemTotal = "gpu_memory_total"
GpuTemperature = "gpu_temperature"
GpuPowerDraw = "gpu_power_draw"
GpuRequest = "gpu_request"
GpuLimit = "gpu_limit"
GpuTotal = "gpu_total"
GpuUniqueID = "UUID"
GpuTensorCoreUtilization = "gpu_tensor_core_utilization"

NeuronCoreUtilization = "neuroncore_utilization"
NeuronCoreMemoryUtilizationTotal = "neuroncore_memory_usage_total"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ var ContainerGpuLabelFilter = map[string]map[string]interface{}{
containerinsightscommon.FullPodNameKey: nil,
containerinsightscommon.PodNameKey: nil,
containerinsightscommon.TypeService: nil,
containerinsightscommon.GpuUniqueId: nil,
containerinsightscommon.GpuUniqueID: nil,
containerinsightscommon.ContainerNamekey: nil,
containerinsightscommon.InstanceTypeKey: nil,
containerinsightscommon.VersionKey: nil,
Expand All @@ -57,7 +57,7 @@ var PodGpuLabelFilter = map[string]map[string]interface{}{
containerinsightscommon.FullPodNameKey: nil,
containerinsightscommon.PodNameKey: nil,
containerinsightscommon.TypeService: nil,
containerinsightscommon.GpuUniqueId: nil,
containerinsightscommon.GpuUniqueID: nil,
containerinsightscommon.InstanceTypeKey: nil,
containerinsightscommon.VersionKey: nil,
containerinsightscommon.SourcesKey: nil,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ exporters:
- node_gpu_limit
- node_gpu_usage_total
- node_gpu_reserved_capacity
- node_gpu_unreserved_capacity
- node_gpu_available_capacity
- dimensions:
- - ClusterName
- InstanceId
Expand Down Expand Up @@ -346,6 +348,7 @@ exporters:
- container_gpu_memory_used
- container_gpu_power_draw
- container_gpu_temperature
- container_gpu_tensor_core_utilization
- dimensions:
- - ClusterName
- - ClusterName
Expand All @@ -372,6 +375,7 @@ exporters:
- pod_gpu_memory_used
- pod_gpu_power_draw
- pod_gpu_temperature
- pod_gpu_tensor_core_utilization
- dimensions:
- - ClusterName
- - ClusterName
Expand All @@ -389,6 +393,7 @@ exporters:
- node_gpu_memory_used
- node_gpu_power_draw
- node_gpu_temperature
- node_gpu_tensor_core_utilization
- dimensions:
- - ClusterName
- - ClusterName
Expand Down Expand Up @@ -1226,6 +1231,48 @@ processors:
new_label: Type
new_value: NodeGPU
submatch_case: ""
- action: insert
aggregation_type: ""
include: DCGM_FI_PROF_PIPE_TENSOR_ACTIVE
match_type: ""
new_name: container_gpu_tensor_core_utilization
operations:
- action: add_label
aggregation_type: ""
experimental_scale: 0
label: ""
label_value: ""
new_label: Type
new_value: ContainerGPU
submatch_case: ""
- action: insert
aggregation_type: ""
include: DCGM_FI_PROF_PIPE_TENSOR_ACTIVE
match_type: ""
new_name: pod_gpu_tensor_core_utilization
operations:
- action: add_label
aggregation_type: ""
experimental_scale: 0
label: ""
label_value: ""
new_label: Type
new_value: PodGPU
submatch_case: ""
- action: insert
aggregation_type: ""
include: DCGM_FI_PROF_PIPE_TENSOR_ACTIVE
match_type: ""
new_name: node_gpu_tensor_core_utilization
operations:
- action: add_label
aggregation_type: ""
experimental_scale: 0
label: ""
label_value: ""
new_label: Type
new_value: NodeGPU
submatch_case: ""
- action: update
aggregation_type: ""
include: execution_status_total
Expand Down
5 changes: 4 additions & 1 deletion translator/translate/otel/exporter/awsemf/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ func getNodeMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDecla
"node_status_capacity_pods", "node_status_allocatable_pods",
}
if awscontainerinsight.AcceleratedComputeMetricsEnabled(conf) {
nodeMetrics = append(nodeMetrics, "node_gpu_limit", "node_gpu_usage_total", "node_gpu_reserved_capacity")
nodeMetrics = append(nodeMetrics, "node_gpu_limit", "node_gpu_usage_total", "node_gpu_reserved_capacity", "node_gpu_unreserved_capacity", "node_gpu_available_capacity")
}
if enhancedContainerInsightsEnabled {
return []*awsemfexporter.MetricDeclaration{
Expand Down Expand Up @@ -491,6 +491,7 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar
"container_gpu_memory_used",
"container_gpu_power_draw",
"container_gpu_temperature",
"container_gpu_tensor_core_utilization",
},
},
{
Expand All @@ -502,6 +503,7 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar
"pod_gpu_memory_used",
"pod_gpu_power_draw",
"pod_gpu_temperature",
"pod_gpu_tensor_core_utilization",
},
},
{
Expand All @@ -513,6 +515,7 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar
"node_gpu_memory_used",
"node_gpu_power_draw",
"node_gpu_temperature",
"node_gpu_tensor_core_utilization",
},
},
}...)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ func TestTranslator(t *testing.T) {
"node_cpu_usage_total", "node_cpu_limit", "node_memory_working_set", "node_memory_limit",
"node_status_condition_ready", "node_status_condition_disk_pressure", "node_status_condition_memory_pressure",
"node_status_condition_pid_pressure", "node_status_condition_network_unavailable", "node_status_condition_unknown",
"node_status_capacity_pods", "node_status_allocatable_pods", "node_gpu_limit", "node_gpu_usage_total", "node_gpu_reserved_capacity"},
"node_status_capacity_pods", "node_status_allocatable_pods", "node_gpu_limit", "node_gpu_usage_total", "node_gpu_reserved_capacity", "node_gpu_unreserved_capacity", "node_gpu_available_capacity"},
},
{
Dimensions: [][]string{
Expand Down Expand Up @@ -405,19 +405,19 @@ func TestTranslator(t *testing.T) {
{
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "GpuDevice"}},
MetricNameSelectors: []string{
"container_gpu_utilization", "container_gpu_memory_utilization", "container_gpu_memory_total", "container_gpu_memory_used", "container_gpu_power_draw", "container_gpu_temperature",
"container_gpu_utilization", "container_gpu_memory_utilization", "container_gpu_memory_total", "container_gpu_memory_used", "container_gpu_power_draw", "container_gpu_temperature", "container_gpu_tensor_core_utilization",
},
},
{
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "GpuDevice"}},
MetricNameSelectors: []string{
"pod_gpu_utilization", "pod_gpu_memory_utilization", "pod_gpu_memory_total", "pod_gpu_memory_used", "pod_gpu_power_draw", "pod_gpu_temperature",
"pod_gpu_utilization", "pod_gpu_memory_utilization", "pod_gpu_memory_total", "pod_gpu_memory_used", "pod_gpu_power_draw", "pod_gpu_temperature", "pod_gpu_tensor_core_utilization",
},
},
{
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId"}, {"ClusterName", "NodeName", "InstanceId", "InstanceType", "GpuDevice"}},
MetricNameSelectors: []string{
"node_gpu_utilization", "node_gpu_memory_utilization", "node_gpu_memory_total", "node_gpu_memory_used", "node_gpu_power_draw", "node_gpu_temperature",
"node_gpu_utilization", "node_gpu_memory_utilization", "node_gpu_memory_total", "node_gpu_memory_used", "node_gpu_power_draw", "node_gpu_temperature", "node_gpu_tensor_core_utilization",
},
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,13 @@ var metricDuplicateTypes = []string{
}

// renameMapForDcgm associates DCGM field names (the map keys) with the GPU
// metric-name constants declared in the containerinsightscommon package
// (the map values).
//
// NOTE(review): presumably consulted when DCGM metrics are renamed to their
// Container Insights names; the consumer is not visible here — confirm.
//
// Each key must be listed exactly once: duplicate constant keys in a map
// literal are rejected by the Go compiler.
var renameMapForDcgm = map[string]string{
	"DCGM_FI_DEV_GPU_UTIL":            containerinsightscommon.GpuUtilization,
	"DCGM_FI_DEV_FB_USED_PERCENT":     containerinsightscommon.GpuMemUtilization,
	"DCGM_FI_DEV_FB_USED":             containerinsightscommon.GpuMemUsed,
	"DCGM_FI_DEV_FB_TOTAL":            containerinsightscommon.GpuMemTotal,
	"DCGM_FI_DEV_GPU_TEMP":            containerinsightscommon.GpuTemperature,
	"DCGM_FI_DEV_POWER_USAGE":         containerinsightscommon.GpuPowerDraw,
	"DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": containerinsightscommon.GpuTensorCoreUtilization,
}

var renameMapForNeuronMonitor = map[string]string{
Expand Down
Loading