Skip to content

Commit 04ef515

Browse files
committed
feat: add new RDS metrics to exporter
- Add BurstBalance metric for gp2 burst-bucket I/O credits
- Add CheckpointLag metric for WAL data consistency
- Add CPU credit metrics (Balance, Usage, Surplus)
- Add DiskQueueDepth metric for outstanding IOs
- Add EBS metrics (ByteBalance, IOBalance)
- Add Network metrics (Receive/Transmit Throughput)
- Add Replication metrics (OldestSlotLag)
- Add Latency metrics (Read/Write)
- Add TransactionLogsGeneration metric

Signed-off-by: Semyon Koshel <[email protected]>
1 parent 1bfd810 commit 04ef515

File tree

3 files changed

+330
-15
lines changed

3 files changed

+330
-15
lines changed

internal/app/cloudwatch/rds.go

Lines changed: 74 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,58 +27,103 @@ type CloudWatchMetrics struct {
2727
}
2828

2929
type RdsMetrics struct {
30+
BurstBalance *float64
31+
CheckpointLag *float64
32+
CPUCreditBalance *float64
33+
CPUCreditUsage *float64
34+
CPUSurplusCreditBalance *float64
35+
CPUSurplusCreditsCharged *float64
3036
CPUUtilization *float64
3137
DBLoad *float64
3238
DBLoadCPU *float64
3339
DBLoadNonCPU *float64
3440
DatabaseConnections *float64
41+
DiskQueueDepth *float64
42+
EBSByteBalance *float64
43+
EBSIOBalance *float64
3544
FreeStorageSpace *float64
3645
FreeableMemory *float64
3746
MaximumUsedTransactionIDs *float64
47+
NetworkReceiveThroughput *float64
48+
NetworkTransmitThroughput *float64
49+
OldestReplicationSlotLag *float64
50+
ReadLatency *float64
3851
ReadIOPS *float64
3952
ReadThroughput *float64
4053
ReplicaLag *float64
4154
ReplicationSlotDiskUsage *float64
4255
SwapUsage *float64
4356
TransactionLogsDiskUsage *float64
57+
TransactionLogsGeneration *float64
58+
WriteLatency *float64
4459
WriteIOPS *float64
4560
WriteThroughput *float64
4661
}
4762

4863
func (m *RdsMetrics) Update(field string, value float64) error {
4964
switch field {
65+
case "BurstBalance":
66+
m.BurstBalance = &value
67+
case "CheckpointLag":
68+
m.CheckpointLag = &value
69+
case "CPUCreditBalance":
70+
m.CPUCreditBalance = &value
71+
case "CPUCreditUsage":
72+
m.CPUCreditUsage = &value
73+
case "CPUSurplusCreditBalance":
74+
m.CPUSurplusCreditBalance = &value
75+
case "CPUSurplusCreditsCharged":
76+
m.CPUSurplusCreditsCharged = &value
77+
case "CPUUtilization":
78+
m.CPUUtilization = &value
5079
case "DBLoad":
5180
m.DBLoad = &value
5281
case "DBLoadCPU":
5382
m.DBLoadCPU = &value
5483
case "DBLoadNonCPU":
5584
m.DBLoadNonCPU = &value
56-
case "CPUUtilization":
57-
m.CPUUtilization = &value
5885
case "DatabaseConnections":
5986
m.DatabaseConnections = &value
87+
case "DiskQueueDepth":
88+
m.DiskQueueDepth = &value
89+
case "EBSByteBalance%":
90+
m.EBSByteBalance = &value
91+
case "EBSIOBalance%":
92+
m.EBSIOBalance = &value
6093
case "FreeStorageSpace":
6194
m.FreeStorageSpace = &value
6295
case "FreeableMemory":
6396
m.FreeableMemory = &value
64-
case "SwapUsage":
65-
m.SwapUsage = &value
66-
case "WriteIOPS":
67-
m.WriteIOPS = &value
97+
case "MaximumUsedTransactionIDs":
98+
m.MaximumUsedTransactionIDs = &value
99+
case "NetworkReceiveThroughput":
100+
m.NetworkReceiveThroughput = &value
101+
case "NetworkTransmitThroughput":
102+
m.NetworkTransmitThroughput = &value
103+
case "OldestReplicationSlotLag":
104+
m.OldestReplicationSlotLag = &value
105+
case "ReadLatency":
106+
m.ReadLatency = &value
68107
case "ReadIOPS":
69108
m.ReadIOPS = &value
109+
case "ReadThroughput":
110+
m.ReadThroughput = &value
70111
case "ReplicaLag":
71112
m.ReplicaLag = &value
72113
case "ReplicationSlotDiskUsage":
73114
m.ReplicationSlotDiskUsage = &value
74-
case "MaximumUsedTransactionIDs":
75-
m.MaximumUsedTransactionIDs = &value
76-
case "ReadThroughput":
77-
m.ReadThroughput = &value
78-
case "WriteThroughput":
79-
m.WriteThroughput = &value
115+
case "SwapUsage":
116+
m.SwapUsage = &value
80117
case "TransactionLogsDiskUsage":
81118
m.TransactionLogsDiskUsage = &value
119+
case "TransactionLogsGeneration":
120+
m.TransactionLogsGeneration = &value
121+
case "WriteLatency":
122+
m.WriteLatency = &value
123+
case "WriteIOPS":
124+
m.WriteIOPS = &value
125+
case "WriteThroughput":
126+
m.WriteThroughput = &value
82127
default:
83128
return fmt.Errorf("can't process '%s' metrics: %w", field, errUnknownMetric)
84129
}
@@ -87,23 +132,38 @@ func (m *RdsMetrics) Update(field string, value float64) error {
87132
}
88133

89134
// getCloudWatchMetricsName returns names of Cloudwatch metrics to collect
90-
func getCloudWatchMetricsName() [16]string {
91-
return [16]string{
135+
func getCloudWatchMetricsName() [31]string {
136+
return [31]string{
137+
"BurstBalance",
138+
"CheckpointLag",
139+
"CPUCreditBalance",
140+
"CPUCreditUsage",
141+
"CPUSurplusCreditBalance",
142+
"CPUSurplusCreditsCharged",
92143
"CPUUtilization",
93144
"DBLoad",
94145
"DBLoadCPU",
95146
"DBLoadNonCPU",
96147
"DatabaseConnections",
148+
"DiskQueueDepth",
149+
"EBSByteBalance%",
150+
"EBSIOBalance%",
97151
"FreeStorageSpace",
98152
"FreeableMemory",
99153
"MaximumUsedTransactionIDs",
154+
"NetworkReceiveThroughput",
155+
"NetworkTransmitThroughput",
156+
"OldestReplicationSlotLag",
157+
"ReadLatency",
100158
"ReadIOPS",
101159
"ReadThroughput",
102160
"ReplicaLag",
103161
"ReplicationSlotDiskUsage",
104162
"SwapUsage",
105163
"TransactionLogsDiskUsage",
164+
"TransactionLogsGeneration",
106165
"WriteIOPS",
166+
"WriteLatency",
107167
"WriteThroughput",
108168
}
109169
}

internal/app/cloudwatch/rds_test.go

Lines changed: 120 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,46 +15,106 @@ import (
1515
)
1616

1717
var db1ExpecteRdsMetrics = cloudwatch.RdsMetrics{
18+
BurstBalance: aws.Float64(100),
19+
CheckpointLag: aws.Float64(0),
20+
CPUCreditBalance: aws.Float64(100),
21+
CPUCreditUsage: aws.Float64(10),
22+
CPUSurplusCreditBalance: aws.Float64(0),
23+
CPUSurplusCreditsCharged: aws.Float64(0),
1824
CPUUtilization: aws.Float64(10),
1925
DBLoad: aws.Float64(1),
2026
DBLoadCPU: aws.Float64(2),
2127
DBLoadNonCPU: aws.Float64(4),
2228
DatabaseConnections: aws.Float64(42),
29+
DiskQueueDepth: aws.Float64(0),
30+
EBSByteBalance: aws.Float64(100),
31+
EBSIOBalance: aws.Float64(100),
2332
FreeStorageSpace: aws.Float64(5),
2433
FreeableMemory: aws.Float64(10),
2534
MaximumUsedTransactionIDs: aws.Float64(1000000),
35+
NetworkReceiveThroughput: aws.Float64(1000),
36+
NetworkTransmitThroughput: aws.Float64(1000),
37+
OldestReplicationSlotLag: aws.Float64(0),
38+
ReadLatency: aws.Float64(0.1),
2639
ReadIOPS: aws.Float64(100),
2740
ReadThroughput: aws.Float64(101),
2841
ReplicaLag: aws.Float64(42),
2942
ReplicationSlotDiskUsage: aws.Float64(100),
3043
SwapUsage: aws.Float64(10),
3144
TransactionLogsDiskUsage: aws.Float64(24),
45+
TransactionLogsGeneration: aws.Float64(100),
46+
WriteLatency: aws.Float64(0.1),
3247
WriteIOPS: aws.Float64(11),
3348
WriteThroughput: aws.Float64(12),
3449
}
3550

3651
var db2ExpecteRdsMetrics = cloudwatch.RdsMetrics{
52+
BurstBalance: aws.Float64(100),
53+
CheckpointLag: aws.Float64(0),
54+
CPUCreditBalance: aws.Float64(100),
55+
CPUCreditUsage: aws.Float64(10),
56+
CPUSurplusCreditBalance: aws.Float64(0),
57+
CPUSurplusCreditsCharged: aws.Float64(0),
3758
CPUUtilization: aws.Float64(40),
3859
DBLoad: aws.Float64(2),
3960
DBLoadCPU: aws.Float64(8),
4061
DBLoadNonCPU: aws.Float64(1),
4162
DatabaseConnections: aws.Float64(1000),
63+
DiskQueueDepth: aws.Float64(0),
64+
EBSByteBalance: aws.Float64(100),
65+
EBSIOBalance: aws.Float64(100),
4266
FreeStorageSpace: aws.Float64(10),
4367
FreeableMemory: aws.Float64(10),
4468
MaximumUsedTransactionIDs: aws.Float64(1000000),
69+
NetworkReceiveThroughput: aws.Float64(1000),
70+
NetworkTransmitThroughput: aws.Float64(1000),
71+
OldestReplicationSlotLag: aws.Float64(0),
72+
ReadLatency: aws.Float64(0.1),
4573
ReadIOPS: aws.Float64(100),
4674
ReadThroughput: aws.Float64(101),
4775
ReplicaLag: aws.Float64(42),
4876
ReplicationSlotDiskUsage: aws.Float64(100),
4977
SwapUsage: aws.Float64(10),
5078
TransactionLogsDiskUsage: aws.Float64(24),
79+
TransactionLogsGeneration: aws.Float64(100),
80+
WriteLatency: aws.Float64(0.1),
5181
WriteIOPS: aws.Float64(11),
5282
WriteThroughput: aws.Float64(12),
5383
}
5484

5585
// generateMockedMetricsForInstance returns cloudwatch API output for the instance
5686
func generateMockedMetricsForInstance(id int, m cloudwatch.RdsMetrics) []aws_cloudwatch_types.MetricDataResult {
5787
metrics := []aws_cloudwatch_types.MetricDataResult{
88+
{
89+
Id: aws.String(fmt.Sprintf("burstbalance_%d", id)),
90+
Label: aws.String("BurstBalance"),
91+
Values: []float64{*m.BurstBalance},
92+
},
93+
{
94+
Id: aws.String(fmt.Sprintf("checkpointlag_%d", id)),
95+
Label: aws.String("CheckpointLag"),
96+
Values: []float64{*m.CheckpointLag},
97+
},
98+
{
99+
Id: aws.String(fmt.Sprintf("cpucreditbalance_%d", id)),
100+
Label: aws.String("CPUCreditBalance"),
101+
Values: []float64{*m.CPUCreditBalance},
102+
},
103+
{
104+
Id: aws.String(fmt.Sprintf("cpucreditusage_%d", id)),
105+
Label: aws.String("CPUCreditUsage"),
106+
Values: []float64{*m.CPUCreditUsage},
107+
},
108+
{
109+
Id: aws.String(fmt.Sprintf("cpusurpluscreditbalance_%d", id)),
110+
Label: aws.String("CPUSurplusCreditBalance"),
111+
Values: []float64{*m.CPUSurplusCreditBalance},
112+
},
113+
{
114+
Id: aws.String(fmt.Sprintf("cpusurpluscreditscharged_%d", id)),
115+
Label: aws.String("CPUSurplusCreditsCharged"),
116+
Values: []float64{*m.CPUSurplusCreditsCharged},
117+
},
58118
{
59119
Id: aws.String(fmt.Sprintf("cpuutilization_%d", id)),
60120
Label: aws.String("CPUUtilization"),
@@ -80,6 +140,21 @@ func generateMockedMetricsForInstance(id int, m cloudwatch.RdsMetrics) []aws_clo
80140
Label: aws.String("DatabaseConnections"),
81141
Values: []float64{*m.DatabaseConnections},
82142
},
143+
{
144+
Id: aws.String(fmt.Sprintf("diskqueuedepth_%d", id)),
145+
Label: aws.String("DiskQueueDepth"),
146+
Values: []float64{*m.DiskQueueDepth},
147+
},
148+
{
149+
Id: aws.String(fmt.Sprintf("ebsbytebalance_%d", id)),
150+
Label: aws.String("EBSByteBalance%"),
151+
Values: []float64{*m.EBSByteBalance},
152+
},
153+
{
154+
Id: aws.String(fmt.Sprintf("ebsiobalance_%d", id)),
155+
Label: aws.String("EBSIOBalance%"),
156+
Values: []float64{*m.EBSIOBalance},
157+
},
83158
{
84159
Id: aws.String(fmt.Sprintf("freestoragespace_%d", id)),
85160
Label: aws.String("FreeStorageSpace"),
@@ -95,6 +170,26 @@ func generateMockedMetricsForInstance(id int, m cloudwatch.RdsMetrics) []aws_clo
95170
Label: aws.String("MaximumUsedTransactionIDs"),
96171
Values: []float64{*m.MaximumUsedTransactionIDs},
97172
},
173+
{
174+
Id: aws.String(fmt.Sprintf("networkreceivethroughput_%d", id)),
175+
Label: aws.String("NetworkReceiveThroughput"),
176+
Values: []float64{*m.NetworkReceiveThroughput},
177+
},
178+
{
179+
Id: aws.String(fmt.Sprintf("networktransmitthroughput_%d", id)),
180+
Label: aws.String("NetworkTransmitThroughput"),
181+
Values: []float64{*m.NetworkTransmitThroughput},
182+
},
183+
{
184+
Id: aws.String(fmt.Sprintf("oldestreplicationslotlag_%d", id)),
185+
Label: aws.String("OldestReplicationSlotLag"),
186+
Values: []float64{*m.OldestReplicationSlotLag},
187+
},
188+
{
189+
Id: aws.String(fmt.Sprintf("readlatency_%d", id)),
190+
Label: aws.String("ReadLatency"),
191+
Values: []float64{*m.ReadLatency},
192+
},
98193
{
99194
Id: aws.String(fmt.Sprintf("readiops_%d", id)),
100195
Label: aws.String("ReadIOPS"),
@@ -125,6 +220,16 @@ func generateMockedMetricsForInstance(id int, m cloudwatch.RdsMetrics) []aws_clo
125220
Label: aws.String("TransactionLogsDiskUsage"),
126221
Values: []float64{*m.TransactionLogsDiskUsage},
127222
},
223+
{
224+
Id: aws.String(fmt.Sprintf("transactionlogsgeneration_%d", id)),
225+
Label: aws.String("TransactionLogsGeneration"),
226+
Values: []float64{*m.TransactionLogsGeneration},
227+
},
228+
{
229+
Id: aws.String(fmt.Sprintf("writelatency_%d", id)),
230+
Label: aws.String("WriteLatency"),
231+
Values: []float64{*m.WriteLatency},
232+
},
128233
{
129234
Id: aws.String(fmt.Sprintf("writeiops_%d", id)),
130235
Label: aws.String("WriteIOPS"),
@@ -168,21 +273,35 @@ func TestGetDBInstanceTypeInformation(t *testing.T) {
168273
assert.Equal(t, float64(1), fetcher.GetStatistics().CloudWatchAPICall, "One call to Cloudwatch API")
169274

170275
for id, value := range instances {
171-
assert.Equal(t, value.DatabaseConnections, result.Instances[id].DatabaseConnections, "DatabaseConnections mismatch")
276+
assert.Equal(t, value.BurstBalance, result.Instances[id].BurstBalance, "BurstBalance mismatch")
277+
assert.Equal(t, value.CheckpointLag, result.Instances[id].CheckpointLag, "CheckpointLag mismatch")
278+
assert.Equal(t, value.CPUCreditBalance, result.Instances[id].CPUCreditBalance, "CPUCreditBalance mismatch")
279+
assert.Equal(t, value.CPUCreditUsage, result.Instances[id].CPUCreditUsage, "CPUCreditUsage mismatch")
280+
assert.Equal(t, value.CPUSurplusCreditBalance, result.Instances[id].CPUSurplusCreditBalance, "CPUSurplusCreditBalance mismatch")
281+
assert.Equal(t, value.CPUSurplusCreditsCharged, result.Instances[id].CPUSurplusCreditsCharged, "CPUSurplusCreditsCharged mismatch")
172282
assert.Equal(t, value.CPUUtilization, result.Instances[id].CPUUtilization, "CPU utilization mismatch")
173283
assert.Equal(t, value.DBLoad, result.Instances[id].DBLoad, "DBLoad mismatch")
174284
assert.Equal(t, value.DBLoadCPU, result.Instances[id].DBLoadCPU, "DBLoadCPU mismatch")
175285
assert.Equal(t, value.DBLoadNonCPU, result.Instances[id].DBLoadNonCPU, "DBLoadNonCPU mismatch")
176286
assert.Equal(t, value.DatabaseConnections, result.Instances[id].DatabaseConnections, "DatabaseConnections mismatch")
287+
assert.Equal(t, value.DiskQueueDepth, result.Instances[id].DiskQueueDepth, "DiskQueueDepth mismatch")
288+
assert.Equal(t, value.EBSByteBalance, result.Instances[id].EBSByteBalance, "EBSByteBalance mismatch")
289+
assert.Equal(t, value.EBSIOBalance, result.Instances[id].EBSIOBalance, "EBSIOBalance mismatch")
177290
assert.Equal(t, value.FreeStorageSpace, result.Instances[id].FreeStorageSpace, "FreeStorageSpace mismatch")
178291
assert.Equal(t, value.FreeableMemory, result.Instances[id].FreeableMemory, "FreeableMemory mismatch")
179292
assert.Equal(t, value.MaximumUsedTransactionIDs, result.Instances[id].MaximumUsedTransactionIDs, "MaximumUsedTransactionIDs mismatch")
293+
assert.Equal(t, value.NetworkReceiveThroughput, result.Instances[id].NetworkReceiveThroughput, "NetworkReceiveThroughput mismatch")
294+
assert.Equal(t, value.NetworkTransmitThroughput, result.Instances[id].NetworkTransmitThroughput, "NetworkTransmitThroughput mismatch")
295+
assert.Equal(t, value.OldestReplicationSlotLag, result.Instances[id].OldestReplicationSlotLag, "OldestReplicationSlotLag mismatch")
296+
assert.Equal(t, value.ReadLatency, result.Instances[id].ReadLatency, "ReadLatency mismatch")
180297
assert.Equal(t, value.ReadIOPS, result.Instances[id].ReadIOPS, "ReadIOPS mismatch")
181298
assert.Equal(t, value.ReadThroughput, result.Instances[id].ReadThroughput, "ReadThroughput mismatch")
182299
assert.Equal(t, value.ReplicaLag, result.Instances[id].ReplicaLag, "ReplicaLag mismatch")
183300
assert.Equal(t, value.ReplicationSlotDiskUsage, result.Instances[id].ReplicationSlotDiskUsage, "ReplicationSlotDiskUsage mismatch")
184301
assert.Equal(t, value.SwapUsage, result.Instances[id].SwapUsage, "SwapUsage mismatch")
185302
assert.Equal(t, value.TransactionLogsDiskUsage, result.Instances[id].TransactionLogsDiskUsage, "TransactionLogsDiskUsage mismatch")
303+
assert.Equal(t, value.TransactionLogsGeneration, result.Instances[id].TransactionLogsGeneration, "TransactionLogsGeneration mismatch")
304+
assert.Equal(t, value.WriteLatency, result.Instances[id].WriteLatency, "WriteLatency mismatch")
186305
assert.Equal(t, value.WriteIOPS, result.Instances[id].WriteIOPS, "WriteIOPS mismatch")
187306
assert.Equal(t, value.WriteThroughput, result.Instances[id].WriteThroughput, "WriteThroughput mismatch")
188307
}

0 commit comments

Comments
 (0)