Skip to content

Commit 416f662

Browse files
authored
feat(metrics): add error counters for comprehensive monitoring coverage (#3729)
Add missing error counter metrics. Ensure everything is in the unkey namespace. Add/update godoc comments
1 parent ffa3c0c commit 416f662

File tree

14 files changed

+211
-64
lines changed

14 files changed

+211
-64
lines changed

go/internal/services/keys/validation.go

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"context"
55
"database/sql"
66
"fmt"
7-
"strconv"
87
"strings"
98
"time"
109

@@ -16,7 +15,6 @@ import (
1615
"github.com/unkeyed/unkey/go/pkg/codes"
1716
"github.com/unkeyed/unkey/go/pkg/fault"
1817
"github.com/unkeyed/unkey/go/pkg/otel/tracing"
19-
"github.com/unkeyed/unkey/go/pkg/prometheus/metrics"
2018
"github.com/unkeyed/unkey/go/pkg/ptr"
2119
"github.com/unkeyed/unkey/go/pkg/rbac"
2220
)
@@ -44,26 +42,6 @@ func (k *KeyVerifier) withCredits(ctx context.Context, cost int32) error {
4442
k.setInvalid(StatusUsageExceeded, "Key usage limit exceeded.")
4543
}
4644

47-
// Emit Prometheus metrics for credits spent
48-
identityID := ""
49-
if k.Key.IdentityID.Valid {
50-
identityID = k.Key.IdentityID.String
51-
}
52-
53-
// Credits are deducted when usage is valid AND cost > 0
54-
deducted := usage.Valid && cost > 0
55-
actualCostDeducted := int32(0)
56-
if deducted {
57-
actualCostDeducted = cost
58-
}
59-
60-
metrics.KeyCreditsSpentTotal.WithLabelValues(
61-
k.AuthorizedWorkspaceID, // workspace_id
62-
k.Key.ID, // key_id
63-
identityID, // identity_id
64-
strconv.FormatBool(deducted), // deducted - whether credits were actually deducted
65-
).Add(float64(actualCostDeducted)) // Add the actual amount deducted, not the requested cost
66-
6745
return nil
6846
}
6947

go/pkg/circuitbreaker/lib.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/unkeyed/unkey/go/pkg/clock"
1010
"github.com/unkeyed/unkey/go/pkg/otel/logging"
1111
"github.com/unkeyed/unkey/go/pkg/otel/tracing"
12+
"github.com/unkeyed/unkey/go/pkg/prometheus/metrics"
1213
)
1314

1415
type CB[Res any] struct {
@@ -198,7 +199,7 @@ func (cb *CB[Res]) preflight(ctx context.Context) error {
198199
cb.resetStateAt = now.Add(cb.config.timeout)
199200
}
200201

201-
requests.WithLabelValues(cb.config.name, string(cb.state)).Inc()
202+
metrics.CircuitBreakerRequests.WithLabelValues(cb.config.name, string(cb.state)).Inc()
202203

203204
if cb.state == Open {
204205
return ErrTripped

go/pkg/circuitbreaker/metrics.go

Lines changed: 0 additions & 15 deletions
This file was deleted.

go/pkg/db/replica.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ func (r *Replica) ExecContext(ctx context.Context, query string, args ...interfa
4343
status = "error"
4444
}
4545

46-
metrics.DatabaseOperationLatency.WithLabelValues(r.mode, "exec", status).Observe(duration)
47-
metrics.DatabaseOperationTotal.WithLabelValues(r.mode, "exec", status).Inc()
46+
metrics.DatabaseOperationsLatency.WithLabelValues(r.mode, "exec", status).Observe(duration)
47+
metrics.DatabaseOperationsTotal.WithLabelValues(r.mode, "exec", status).Inc()
4848

4949
return result, err
5050
}
@@ -69,8 +69,8 @@ func (r *Replica) PrepareContext(ctx context.Context, query string) (*sql.Stmt,
6969
status = "error"
7070
}
7171

72-
metrics.DatabaseOperationLatency.WithLabelValues(r.mode, "prepare", status).Observe(duration)
73-
metrics.DatabaseOperationTotal.WithLabelValues(r.mode, "prepare", status).Inc()
72+
metrics.DatabaseOperationsLatency.WithLabelValues(r.mode, "prepare", status).Observe(duration)
73+
metrics.DatabaseOperationsTotal.WithLabelValues(r.mode, "prepare", status).Inc()
7474

7575
return stmt, err // nolint:sqlclosecheck
7676
}
@@ -95,8 +95,8 @@ func (r *Replica) QueryContext(ctx context.Context, query string, args ...interf
9595
status = "error"
9696
}
9797

98-
metrics.DatabaseOperationLatency.WithLabelValues(r.mode, "query", status).Observe(duration)
99-
metrics.DatabaseOperationTotal.WithLabelValues(r.mode, "query", status).Inc()
98+
metrics.DatabaseOperationsLatency.WithLabelValues(r.mode, "query", status).Observe(duration)
99+
metrics.DatabaseOperationsTotal.WithLabelValues(r.mode, "query", status).Inc()
100100

101101
return rows, err // nolint:sqlclosecheck
102102
}
@@ -119,8 +119,8 @@ func (r *Replica) QueryRowContext(ctx context.Context, query string, args ...int
119119
// QueryRowContext doesn't return an error, but we can still track timing
120120
status := "success"
121121

122-
metrics.DatabaseOperationLatency.WithLabelValues(r.mode, "query_row", status).Observe(duration)
123-
metrics.DatabaseOperationTotal.WithLabelValues(r.mode, "query_row", status).Inc()
122+
metrics.DatabaseOperationsLatency.WithLabelValues(r.mode, "query_row", status).Observe(duration)
123+
metrics.DatabaseOperationsTotal.WithLabelValues(r.mode, "query_row", status).Inc()
124124

125125
return row
126126
}
@@ -143,8 +143,8 @@ func (r *Replica) Begin(ctx context.Context) (*sql.Tx, error) {
143143
status = "error"
144144
}
145145

146-
metrics.DatabaseOperationLatency.WithLabelValues(r.mode, "begin", status).Observe(duration)
147-
metrics.DatabaseOperationTotal.WithLabelValues(r.mode, "begin", status).Inc()
146+
metrics.DatabaseOperationsLatency.WithLabelValues(r.mode, "begin", status).Observe(duration)
147+
metrics.DatabaseOperationsTotal.WithLabelValues(r.mode, "begin", status).Inc()
148148

149149
return tx, err
150150
}

go/pkg/prometheus/metrics/batch.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,21 @@ var (
8484
},
8585
[]string{"name"},
8686
)
87+
88+
// BatchItemsProcessedErrorsTotal tracks the total number of items that resulted in errors
89+
// during batch processing, labeled by batch name.
90+
// Use this counter to monitor error rates in batch processing and identify problematic batches.
91+
//
92+
// Example usage:
93+
// metrics.BatchItemsProcessedErrorsTotal.WithLabelValues("database_writes").Add(float64(errorCount))
94+
BatchItemsProcessedErrorsTotal = promauto.NewCounterVec(
95+
prometheus.CounterOpts{
96+
Namespace: "unkey",
97+
Subsystem: "batch",
98+
Name: "items_processed_errors_total",
99+
Help: "Total number of items processed through batches that resulted in an error",
100+
ConstLabels: constLabels,
101+
},
102+
[]string{"name"},
103+
)
87104
)

go/pkg/prometheus/metrics/buffer.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,20 @@ var (
4747
},
4848
[]string{"name", "drop"},
4949
)
50+
51+
// BufferErrorsTotal tracks the total number of buffer operation errors,
52+
// labeled by buffer name and state (the kind of error, e.g. "write_failed"). Use this counter to monitor buffer error rates.
53+
//
54+
// Example usage:
55+
// metrics.BufferErrorsTotal.WithLabelValues("batch_writer", "write_failed").Inc()
56+
BufferErrorsTotal = promauto.NewCounterVec(
57+
prometheus.CounterOpts{
58+
Namespace: "unkey",
59+
Subsystem: "buffer",
60+
Name: "errors_total",
61+
Help: "Total number of buffer operation errors by name and state.",
62+
ConstLabels: constLabels,
63+
},
64+
[]string{"name", "state"},
65+
)
5066
)

go/pkg/prometheus/metrics/cache.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ var (
1919
// metrics.CacheHits.WithLabelValues("user_profile")
2020
CacheReads = promauto.NewCounterVec(
2121
prometheus.CounterOpts{
22+
Namespace: "unkey",
2223
Subsystem: "cache",
2324
Name: "reads_total",
2425
Help: "Number of cache reads by resource type and hit status.",
@@ -35,6 +36,7 @@ var (
3536
// metrics.CacheWrites.WithLabelValues("user_profile").Set(float64(writeCount))
3637
CacheWrites = promauto.NewGaugeVec(
3738
prometheus.GaugeOpts{
39+
Namespace: "unkey",
3840
Subsystem: "cache",
3941
Name: "writes",
4042
Help: "Number of cache writes by resource type.",
@@ -52,6 +54,7 @@ var (
5254
// metrics.CacheDeleted.WithLabelValues("user_profile", "capacity").Add(float64(evictionCount))
5355
CacheDeleted = promauto.NewCounterVec(
5456
prometheus.CounterOpts{
57+
Namespace: "unkey",
5558
Subsystem: "cache",
5659
Name: "deleted_total",
5760
Help: "Number of cache entries deleted by resource type and reason.",
@@ -67,6 +70,7 @@ var (
6770
// metrics.CacheSize.WithLabelValues("user_profile").Set(float64(cacheSize))
6871
CacheSize = promauto.NewGaugeVec(
6972
prometheus.GaugeOpts{
73+
Namespace: "unkey",
7074
Subsystem: "cache",
7175
Name: "size",
7276
Help: "Current number of entries in the cache by resource type.",
@@ -82,6 +86,7 @@ var (
8286
// metrics.CacheCapacity.WithLabelValues("user_profile").Set(float64(cacheCapacity))
8387
CacheCapacity = promauto.NewGaugeVec(
8488
prometheus.GaugeOpts{
89+
Namespace: "unkey",
8590
Subsystem: "cache",
8691
Name: "capacity",
8792
Help: "Maximum capacity of the cache by resource type.",
@@ -97,11 +102,44 @@ var (
97102
// metrics.CacheRevalidations.WithLabelValues("user_profile").Inc()
98103
CacheRevalidations = promauto.NewCounterVec(
99104
prometheus.CounterOpts{
105+
Namespace: "unkey",
100106
Subsystem: "cache",
101107
Name: "revalidations_total",
102108
Help: "Total number of cache revalidations by resource type.",
103109
ConstLabels: constLabels,
104110
},
105111
[]string{"resource"},
106112
)
113+
114+
// CacheReadsErrorsTotal tracks the total number of cache read errors,
115+
// labeled by resource type. Use this counter to monitor cache read error rates.
116+
//
117+
// Example usage:
118+
// metrics.CacheReadsErrorsTotal.WithLabelValues("user_profile").Inc()
119+
CacheReadsErrorsTotal = promauto.NewCounterVec(
120+
prometheus.CounterOpts{
121+
Namespace: "unkey",
122+
Subsystem: "cache",
123+
Name: "reads_errors_total",
124+
Help: "Total number of cache read errors by resource type.",
125+
ConstLabels: constLabels,
126+
},
127+
[]string{"resource"},
128+
)
129+
130+
// CacheRevalidationsErrorsTotal tracks the total number of cache revalidation errors,
131+
// labeled by resource type. Use this counter to monitor cache revalidation error rates.
132+
//
133+
// Example usage:
134+
// metrics.CacheRevalidationsErrorsTotal.WithLabelValues("user_profile").Inc()
135+
CacheRevalidationsErrorsTotal = promauto.NewCounterVec(
136+
prometheus.CounterOpts{
137+
Namespace: "unkey",
138+
Subsystem: "cache",
139+
Name: "revalidations_errors_total",
140+
Help: "Total number of cache revalidation errors by resource type.",
141+
ConstLabels: constLabels,
142+
},
143+
[]string{"resource"},
144+
)
107145
)

go/pkg/prometheus/metrics/chproxy.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ var (
1818
// metrics.ChproxyRequestsTotal.WithLabelValues("verifications").Inc()
1919
ChproxyRequestsTotal = promauto.NewCounterVec(
2020
prometheus.CounterOpts{
21+
Namespace: "unkey",
2122
Subsystem: "chproxy",
2223
Name: "requests_total",
2324
Help: "Total number of ClickHouse proxy requests processed.",
@@ -26,18 +27,51 @@ var (
2627
[]string{"endpoint"},
2728
)
2829

30+
// ChproxyErrorsTotal tracks the total number of errors encountered by ClickHouse proxy,
31+
// labeled by endpoint. Use this counter to monitor error rates and identify problematic endpoints.
32+
//
33+
// Example usage:
34+
// metrics.ChproxyErrorsTotal.WithLabelValues("verifications").Inc()
35+
ChproxyErrorsTotal = promauto.NewCounterVec(
36+
prometheus.CounterOpts{
37+
Namespace: "unkey",
38+
Subsystem: "chproxy",
39+
Name: "errors_total",
40+
Help: "Total number of errors encountered by ClickHouse proxy.",
41+
ConstLabels: constLabels,
42+
},
43+
[]string{"endpoint"},
44+
)
45+
2946
// ChproxyRowsTotal tracks the total number of rows/events received in chproxy requests.
3047
// Use this counter to monitor data volume and ingestion patterns.
3148
//
3249
// Example usage:
3350
// metrics.ChproxyRowsTotal.WithLabelValues("verifications").Add(float64(len(events)))
3451
ChproxyRowsTotal = promauto.NewCounterVec(
3552
prometheus.CounterOpts{
53+
Namespace: "unkey",
3654
Subsystem: "chproxy",
3755
Name: "rows_total",
3856
Help: "Total number of rows/events processed by ClickHouse proxy.",
3957
ConstLabels: constLabels,
4058
},
4159
[]string{"endpoint"},
4260
)
61+
62+
// ChproxyRowsErrorsTotal tracks the total number of row processing errors in ClickHouse proxy,
63+
// labeled by endpoint. Use this counter to monitor row processing error rates.
64+
//
65+
// Example usage:
66+
// metrics.ChproxyRowsErrorsTotal.WithLabelValues("verifications").Inc()
67+
ChproxyRowsErrorsTotal = promauto.NewCounterVec(
68+
prometheus.CounterOpts{
69+
Namespace: "unkey",
70+
Subsystem: "chproxy",
71+
Name: "rows_errors_total",
72+
Help: "Total number of row processing errors in ClickHouse proxy.",
73+
ConstLabels: constLabels,
74+
},
75+
[]string{"endpoint"},
76+
)
4377
)

go/pkg/prometheus/metrics/circuitbreaker.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,23 @@ var (
1212
// Example usage:
1313
// metrics.CircuitBreakerRequests.WithLabelValues("my_circuit_breaker", "open").Inc()
1414
CircuitBreakerRequests = promauto.NewCounterVec(prometheus.CounterOpts{
15+
Namespace: "unkey",
1516
Subsystem: "circuitbreaker",
1617
Name: "requests_total",
1718
Help: "Tracks the number of requests made to the circuitbreaker by state.",
1819
ConstLabels: constLabels,
19-
}, []string{"name", "state"})
20+
}, []string{"service", "action"})
21+
22+
// CircuitBreakerErrorsTotal tracks the total number of circuit breaker errors,
23+
// labeled by service and action. Use this counter to monitor circuit breaker error rates.
24+
//
25+
// Example usage:
26+
// metrics.CircuitBreakerErrorsTotal.WithLabelValues("database", "timeout").Inc()
27+
CircuitBreakerErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
28+
Namespace: "unkey",
29+
Subsystem: "circuitbreaker",
30+
Name: "errors_total",
31+
Help: "Total number of circuit breaker errors by service and action.",
32+
ConstLabels: constLabels,
33+
}, []string{"service", "action"})
2034
)

go/pkg/prometheus/metrics/database.go

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,20 @@ import (
1212
)
1313

1414
var (
15-
// DatabaseOperationLatency tracks database operation latencies as a histogram,
15+
// DatabaseOperationsLatency tracks database operation latencies as a histogram,
1616
// labeled by replica type (rw/ro), operation type, and success status.
1717
// This collector uses predefined buckets optimized for typical database operation latencies.
1818
//
1919
// Example usage:
2020
// timer := prometheus.NewTimer(prometheus.ObserverFunc(func(v float64) {
21-
// metrics.DatabaseOperationLatency.WithLabelValues("rw", "exec", "success").Observe(v)
21+
// metrics.DatabaseOperationsLatency.WithLabelValues("rw", "exec", "success").Observe(v)
2222
// }))
2323
// defer timer.ObserveDuration()
24-
DatabaseOperationLatency = promauto.NewHistogramVec(
24+
DatabaseOperationsLatency = promauto.NewHistogramVec(
2525
prometheus.HistogramOpts{
26+
Namespace: "unkey",
2627
Subsystem: "database",
27-
Name: "operation_latency_seconds",
28+
Name: "operations_latency_seconds",
2829
Help: "Histogram of database operation latencies in seconds.",
2930
Buckets: latencyBuckets,
3031
ConstLabels: constLabels,
@@ -39,13 +40,28 @@ var (
3940
// Example usage:
4041
// metrics.DatabaseOperationsTotal.WithLabelValues("rw", "exec", "success").Inc()
4142
// metrics.DatabaseOperationsTotal.WithLabelValues("ro", "query", "error").Inc()
42-
DatabaseOperationTotal = promauto.NewCounterVec(
43+
DatabaseOperationsTotal = promauto.NewCounterVec(
4344
prometheus.CounterOpts{
45+
Namespace: "unkey",
4446
Subsystem: "database",
4547
Name: "operations_total",
4648
Help: "Total number of database operations processed.",
4749
ConstLabels: constLabels,
4850
},
4951
[]string{"replica", "operation", "status"},
5052
)
53+
54+
// DatabaseOperationsErrorsTotal tracks the total number of database operation errors,
55+
// labeled by replica type (rw/ro), and operation type.
56+
// Use this counter to monitor database error rates and identify problematic operations.
57+
//
58+
// Example usage:
59+
// metrics.DatabaseOperationsErrorsTotal.WithLabelValues("rw", "exec").Inc()
60+
DatabaseOperationsErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
61+
Namespace: "unkey",
62+
Subsystem: "database",
63+
Name: "operations_errors_total",
64+
Help: "Total number of database operation errors.",
65+
ConstLabels: constLabels,
66+
}, []string{"replica", "operation"})
5167
)

0 commit comments

Comments
 (0)