Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
0a0a8e3
Merge pull request #1 from aws/main
spanaik Jan 7, 2025
e6877b1
Merge branch 'aws:main' into main
spanaik Jan 26, 2025
d57685e
Add UltraServer support for CloudWatch agent
petruanica Feb 26, 2025
166704b
Add fmt changes
petruanica Feb 26, 2025
d172274
Merge branch 'main' into feat-ultraserver-tupperware-support
petruanica Mar 3, 2025
954d78f
Merge branch 'main' into feat-ultraserver-tupperware-support
petruanica Mar 4, 2025
06053ca
Merge branch 'aws:main' into feat-ultraserver-tupperware-support
petruanica Mar 4, 2025
e8eb480
Merge branch 'main' into feat-ultraserver-tupperware-support
petruanica Mar 5, 2025
a3d5799
Merge branch 'main' into feat-ultraserver-tupperware-support
petruanica Mar 7, 2025
9836c91
Merge branch 'main' into feat-ultraserver-tupperware-support
petruanica Mar 14, 2025
06f23cc
Merge branch 'main' into feat-ultraserver-tupperware-support
petruanica Mar 19, 2025
2987c0a
BugFix: Aggregate neuron core utilization across runtimes
spanaik Jun 3, 2025
ff44140
Fix formatting
spanaik Jun 3, 2025
c17b8f4
Merge branch 'main' into main
spanaik Jun 3, 2025
8a57d23
Merge branch 'main' into main
spanaik Jun 4, 2025
e96b50e
address comments
spanaik Jun 9, 2025
23c85e6
Merge branch 'main' into main
spanaik Jun 9, 2025
3656cd5
Merge branch 'main' into main
spanaik Jun 10, 2025
de718d9
Fixing Indentations and minor bug
spanaik Jun 13, 2025
0d6014f
Merge branch 'main' into main
spanaik Jun 13, 2025
87f2191
Merge branch 'main' into main
sky333999 Jun 13, 2025
438dd00
Fix flaky tests and Lint errors
spanaik Jun 16, 2025
06b244f
Merge branch 'main' into main
spanaik Jun 16, 2025
4bea1e3
lint fix
spanaik Jun 16, 2025
37b0078
fix: Core Utilization for LNC enabled neuron Instances
spanaik Jul 2, 2025
b687c6c
Merge pull request #1 from Reham77/main
petruanica Jul 3, 2025
3326e3f
Merge commit 'b687c6cbcfa50dc546ea2c4b79dd7336bc3d850d' into feat-ult…
petruanica Jul 3, 2025
36f0eb6
Merge branch 'main' into feat-ultraserver-tupperware-support
petruanica Jul 3, 2025
aca5dca
Merge branch 'main' into feat-ultraserver-tupperware-support
petruanica Jul 3, 2025
d723eed
Merge branch 'main' into feat-ultraserver-tupperware-support
petruanica Jul 30, 2025
eba55a6
Merge branch 'main' into feat-ultraserver-tupperware-support
petruanica Aug 12, 2025
de23e81
Merge branch 'main' into feat-ultraserver-tupperware-support
petruanica Aug 12, 2025
cbeaa97
Merge branch 'main' into feat-ultraserver-tupperware-support
petruanica Aug 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions internal/containerinsightscommon/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ const (
MetricType = "Type"
SourcesKey = "Sources"
GpuDeviceKey = "GpuDevice"
UltraServerKey = "UltraServer"

ClusterQueueNameKey = "ClusterQueue"
ClusterQueueStatusKey = "Status"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ const (
NeuronExecutionErrorsAggregatedMetric = containerinsightscommon.NeuronExecutionErrors + "_total"
NeuronDeviceHardwareEccEventsAggregatedMetric = containerinsightscommon.NeuronDeviceHardwareEccEvents + "_total"
NeuronCoreLabel = "neuroncore"
NeuronCorePerDevice = 2
NeuronCoresPerDeviceAttributeKey = "neuroncore_per_device_count"
)

type AwsNeuronMetricModifier struct {
Expand Down Expand Up @@ -324,6 +324,8 @@ func (md *AwsNeuronMetricModifier) aggregateCoreUtilizationMetrics(originalMetri
aggregatedMetric := setMetricMetadata(newMetricSlice.AppendEmpty(), originalMetric.Name(), originalMetric.Unit())
aggregateDatapoints := aggregatedMetric.SetEmptySum().DataPoints()
firstOriginalDatapoint := originalMetricDatapoints.At(0)
neuronCoresPerDevice, _ := firstOriginalDatapoint.Attributes().Get(NeuronCoresPerDeviceAttributeKey)
neuronCoresPerDeviceInt, _ := strconv.Atoi(neuronCoresPerDevice.Str())
// Populate the aggregated metric (already appended to newMetricSlice) with one datapoint per entry in aggregatedValuesPerCore
for aggregatedMetricMetadata, value := range aggregatedValuesPerCore {
datapoint := aggregateDatapoints.AppendEmpty()
Expand All @@ -333,7 +335,7 @@ func (md *AwsNeuronMetricModifier) aggregateCoreUtilizationMetrics(originalMetri
datapoint.Attributes().PutStr(NeuronCoreLabel, aggregatedMetricMetadata.coreID)
datapoint.Attributes().PutStr(NeuronCoreAttributeKey, "core"+aggregatedMetricMetadata.coreID)
coreID, _ := strconv.Atoi(aggregatedMetricMetadata.coreID)
datapoint.Attributes().PutStr(NeuronDeviceAttributeKey, "device"+strconv.Itoa(coreID/NeuronCorePerDevice))
datapoint.Attributes().PutStr(NeuronDeviceAttributeKey, "device"+strconv.Itoa(coreID/neuronCoresPerDeviceInt))
}
return newMetricSlice
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ var metricNameToMetricLayout = map[string]MetricDefinition{
NeuronDeviceRuntimeMemoryUsedBytes: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{1, 2}, SpecialAttributes: [][]string{{MemoryLocation, "host"}, {MemoryLocation, "neuron_device"}}, Unit: Bytes},
NeuronExecutionLatency: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{0, 0, 0, 0, 1, 0, 0}, SpecialAttributes: [][]string{{Percentile, "p0"}, {Percentile, "p1"}, {Percentile, "p100"}, {Percentile, "p25"}, {Percentile, "p50"}, {Percentile, "p75"}, {Percentile, "p99"}}, Unit: Seconds},
NeuronDeviceHwEccEvents: {MetricType: pmetric.MetricTypeSum, MetricValues: []float64{1, 2, 3, 4}, SpecialAttributes: [][]string{{NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "mem_ecc_corrected", PodName, DummyPod, RuntimeTag, "1"}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "mem_ecc_uncorrected", PodName, DummyPod, RuntimeTag, "1"}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "sram_ecc_corrected", PodName, DummyPod, RuntimeTag, "1"}, {NeuronDeviceIndex, "1", NeuronDevice, "1", EventType, "sram_ecc_uncorrected", PodName, DummyPod, RuntimeTag, "1"}}, Unit: Count},
NeuronCoreUtilizationRatio: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{1, 2, 3, 4, 5, 6}, SpecialAttributes: [][]string{{NeuronCore, "0", NeuronDevice, "0", NeuronCoreLabel, "0", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, "DEFAULT"}, {NeuronCore, "1", NeuronDevice, "0", NeuronCoreLabel, "1", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, "123"}, {NeuronCore, "2", NeuronDevice, "1", NeuronCoreLabel, "2", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, "123"}, {NeuronCore, "0", NeuronDevice, "0", NeuronCoreLabel, "0", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, "456"}, {NeuronCore, "1", NeuronDevice, "0", NeuronCoreLabel, "1", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, "456"}, {NeuronCore, "2", NeuronDevice, "1", NeuronCoreLabel, "2", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, "456"}}, Unit: Percent},
NeuronCoreUtilizationRatio: {MetricType: pmetric.MetricTypeGauge, MetricValues: []float64{1, 2, 3, 4, 5, 6}, SpecialAttributes: [][]string{{NeuronCore, "0", NeuronDevice, "0", NeuronCoreLabel, "0", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, "DEFAULT", NeuronCoresPerDeviceAttributeKey, "2"}, {NeuronCore, "1", NeuronDevice, "0", NeuronCoreLabel, "1", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, "123", NeuronCoresPerDeviceAttributeKey, "2"}, {NeuronCore, "2", NeuronDevice, "1", NeuronCoreLabel, "2", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, "123", NeuronCoresPerDeviceAttributeKey, "2"}, {NeuronCore, "0", NeuronDevice, "0", NeuronCoreLabel, "0", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, "456", NeuronCoresPerDeviceAttributeKey, "2"}, {NeuronCore, "1", NeuronDevice, "0", NeuronCoreLabel, "1", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, "456", NeuronCoresPerDeviceAttributeKey, "2"}, {NeuronCore, "2", NeuronDevice, "1", NeuronCoreLabel, "2", MemoryLocation, "None", PodName, DummyPod, RuntimeTag, "456", NeuronCoresPerDeviceAttributeKey, "2"}}, Unit: Percent},
}

func setupMetricModifier() *AwsNeuronMetricModifier {
Expand Down Expand Up @@ -319,9 +319,9 @@ func TestMetricModifierForNeuronCoreUtilizationUsageMetric(t *testing.T) {

expectedMetrics := map[string]pmetric.Metric{
NeuronCoreUtilizationRatio: metricsList.At(0),
"node_neuroncore_utilization": createExpectedMetric("node_neuroncore_utilization", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", NeuronCoreLabel: "0", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT"}, {NeuronCore: "core1", NeuronDevice: "device0", NeuronCoreLabel: "1", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT"}, {NeuronCore: "core2", NeuronDevice: "device1", NeuronCoreLabel: "2", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT"}}, []float64{4, 5, 6}, pmetric.MetricTypeSum, Percent),
"pod_neuroncore_utilization": createExpectedMetric("pod_neuroncore_utilization", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", NeuronCoreLabel: "0", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT"}, {NeuronCore: "core1", NeuronDevice: "device0", NeuronCoreLabel: "1", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT"}, {NeuronCore: "core2", NeuronDevice: "device1", NeuronCoreLabel: "2", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT"}}, []float64{4, 5, 6}, pmetric.MetricTypeSum, Percent),
"container_neuroncore_utilization": createExpectedMetric("container_neuroncore_utilization", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", NeuronCoreLabel: "0", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT"}, {NeuronCore: "core1", NeuronDevice: "device0", NeuronCoreLabel: "1", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT"}, {NeuronCore: "core2", NeuronDevice: "device1", NeuronCoreLabel: "2", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT"}}, []float64{4, 5, 6}, pmetric.MetricTypeSum, Percent),
"node_neuroncore_utilization": createExpectedMetric("node_neuroncore_utilization", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", NeuronCoreLabel: "0", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT", NeuronCoresPerDeviceAttributeKey: "2"}, {NeuronCore: "core1", NeuronDevice: "device0", NeuronCoreLabel: "1", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT", NeuronCoresPerDeviceAttributeKey: "2"}, {NeuronCore: "core2", NeuronDevice: "device1", NeuronCoreLabel: "2", Type: NodeAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT", NeuronCoresPerDeviceAttributeKey: "2"}}, []float64{4, 5, 6}, pmetric.MetricTypeSum, Percent),
"pod_neuroncore_utilization": createExpectedMetric("pod_neuroncore_utilization", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", NeuronCoreLabel: "0", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT", NeuronCoresPerDeviceAttributeKey: "2"}, {NeuronCore: "core1", NeuronDevice: "device0", NeuronCoreLabel: "1", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT", NeuronCoresPerDeviceAttributeKey: "2"}, {NeuronCore: "core2", NeuronDevice: "device1", NeuronCoreLabel: "2", Type: PodAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT", NeuronCoresPerDeviceAttributeKey: "2"}}, []float64{4, 5, 6}, pmetric.MetricTypeSum, Percent),
"container_neuroncore_utilization": createExpectedMetric("container_neuroncore_utilization", false, []map[string]string{{NeuronCore: "core0", NeuronDevice: "device0", NeuronCoreLabel: "0", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT", NeuronCoresPerDeviceAttributeKey: "2"}, {NeuronCore: "core1", NeuronDevice: "device0", NeuronCoreLabel: "1", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT", NeuronCoresPerDeviceAttributeKey: "2"}, {NeuronCore: "core2", NeuronDevice: "device1", NeuronCoreLabel: "2", Type: ContainerAWSNeuronCore, PodName: DummyPod, MemoryLocation: "None", RuntimeTag: "DEFAULT", NeuronCoresPerDeviceAttributeKey: "2"}}, []float64{4, 5, 6}, pmetric.MetricTypeSum, Percent),
}

assertModifiedMetric(t, metricsList, expectedMetrics)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,11 @@ var PodNeuronLabelFilter = map[string]map[string]interface{}{
pod_owners: nil,
containerinsightscommon.K8sLabelsKey: nil,
},
internal.Region: nil,
internal.SubnetId: nil,
internal.NeuronCore: nil,
containerinsightscommon.MetricType: nil,
internal.Region: nil,
internal.SubnetId: nil,
internal.NeuronCore: nil,
containerinsightscommon.MetricType: nil,
containerinsightscommon.UltraServerKey: nil,
}

var ContainerNeuronLabelFilter = map[string]map[string]interface{}{
Expand All @@ -130,10 +131,11 @@ var ContainerNeuronLabelFilter = map[string]map[string]interface{}{
pod_owners: nil,
containerinsightscommon.K8sLabelsKey: nil,
},
internal.Region: nil,
internal.SubnetId: nil,
internal.NeuronCore: nil,
containerinsightscommon.MetricType: nil,
internal.Region: nil,
internal.SubnetId: nil,
internal.NeuronCore: nil,
containerinsightscommon.MetricType: nil,
containerinsightscommon.UltraServerKey: nil,
}

var NodeNeuronLabelFilter = map[string]map[string]interface{}{
Expand All @@ -149,8 +151,9 @@ var NodeNeuronLabelFilter = map[string]map[string]interface{}{
containerinsightscommon.HostKey: nil,
containerinsightscommon.K8sLabelsKey: nil,
},
internal.Region: nil,
internal.SubnetId: nil,
internal.NeuronCore: nil,
containerinsightscommon.MetricType: nil,
internal.Region: nil,
internal.SubnetId: nil,
internal.NeuronCore: nil,
containerinsightscommon.MetricType: nil,
containerinsightscommon.UltraServerKey: nil,
}
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,8 @@ exporters:
- pod_neurondevice_hw_ecc_events_total
- dimensions:
- - ClusterName
- - ClusterName
- UltraServer
- - ClusterName
- InstanceId
- NodeName
Expand All @@ -504,6 +506,8 @@ exporters:
- node_neuroncore_memory_usage_tensors
- dimensions:
- - ClusterName
- - ClusterName
- UltraServer
- - ClusterName
- InstanceId
- NodeName
Expand All @@ -513,6 +517,8 @@ exporters:
- node_neuron_execution_latency
- dimensions:
- - ClusterName
- - ClusterName
- UltraServer
- - ClusterName
- InstanceId
- NodeName
Expand Down
6 changes: 3 additions & 3 deletions translator/translate/otel/exporter/awsemf/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -562,7 +562,7 @@ func getAwsNeuronMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.Metric
},
},
{
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "InstanceId", "NodeName"}, {"ClusterName", "InstanceType", "InstanceId", "NodeName", "NeuronDevice", "NeuronCore"}},
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "UltraServer"}, {"ClusterName", "InstanceId", "NodeName"}, {"ClusterName", "InstanceType", "InstanceId", "NodeName", "NeuronDevice", "NeuronCore"}},
MetricNameSelectors: []string{
"node_neuroncore_utilization",
"node_neuroncore_memory_usage_total",
Expand All @@ -574,15 +574,15 @@ func getAwsNeuronMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.Metric
},
},
{
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "InstanceId", "NodeName"}},
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "UltraServer"}, {"ClusterName", "InstanceId", "NodeName"}},
MetricNameSelectors: []string{
"node_neuron_execution_errors_total",
"node_neurondevice_runtime_memory_used_bytes",
"node_neuron_execution_latency",
},
},
{
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "InstanceId", "NodeName"}, {"ClusterName", "InstanceId", "NodeName", "NeuronDevice"}},
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "UltraServer"}, {"ClusterName", "InstanceId", "NodeName"}, {"ClusterName", "InstanceId", "NodeName", "NeuronDevice"}},
MetricNameSelectors: []string{
"node_neurondevice_hw_ecc_events_total",
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ func TestTranslator(t *testing.T) {
},
},
{
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "InstanceId", "NodeName"}, {"ClusterName", "InstanceType", "InstanceId", "NodeName", "NeuronDevice", "NeuronCore"}},
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "UltraServer"}, {"ClusterName", "InstanceId", "NodeName"}, {"ClusterName", "InstanceType", "InstanceId", "NodeName", "NeuronDevice", "NeuronCore"}},
MetricNameSelectors: []string{
"node_neuroncore_utilization",
"node_neuroncore_memory_usage_total",
Expand All @@ -469,15 +469,15 @@ func TestTranslator(t *testing.T) {
},
},
{
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "InstanceId", "NodeName"}},
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "UltraServer"}, {"ClusterName", "InstanceId", "NodeName"}},
MetricNameSelectors: []string{
"node_neuron_execution_errors_total",
"node_neurondevice_runtime_memory_used_bytes",
"node_neuron_execution_latency",
},
},
{
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "InstanceId", "NodeName"}, {"ClusterName", "InstanceId", "NodeName", "NeuronDevice"}},
Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "UltraServer"}, {"ClusterName", "InstanceId", "NodeName"}, {"ClusterName", "InstanceId", "NodeName", "NeuronDevice"}},
MetricNameSelectors: []string{
"node_neurondevice_hw_ecc_events_total",
},
Expand Down
Loading