
Commit f562dff

feat(metrics): add request prompt, generation, max_tokens and success metrics
Signed-off-by: googs1025 <[email protected]>
1 parent 699452c commit f562dff


6 files changed: +230 -3 lines changed


manifests/config_with_fake.yaml

Lines changed: 2 additions & 1 deletion

@@ -7,7 +7,8 @@ time-to-first-token: 2000
 inter-token-latency: 1000
 kv-cache-transfer-latency: 100
 seed: 100100100
-fake-metrics:
+fake-metrics:
+  request-success-total: 20
   running-requests: 16
   waiting-requests: 3
   kv-cache-usage: 0.3

pkg/common/config.go

Lines changed: 2 additions & 0 deletions

@@ -186,6 +186,8 @@ type Metrics struct {
 	WaitingRequests int64 `yaml:"waiting-requests" json:"waiting-requests"`
 	// KVCacheUsagePercentage is the fraction of KV-cache blocks currently in use (from 0 to 1)
 	KVCacheUsagePercentage float32 `yaml:"kv-cache-usage" json:"kv-cache-usage"`
+	// RequestSuccessTotal is the number of inference requests that are successful
+	RequestSuccessTotal int64 `yaml:"request-success-total" json:"request-success-total"`
 }

 type LorasMetrics struct {

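A minimal sketch (not part of the commit) of how the new request-success-total key above maps onto the RequestSuccessTotal field through its yaml tag. The fakeMetrics stand-in type, the package main wrapper, and the use of gopkg.in/yaml.v3 are assumptions made for illustration only:

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// fakeMetrics is a stand-in for the simulator's fake-metrics block;
// the yaml tags mirror the Metrics struct in pkg/common/config.go.
type fakeMetrics struct {
	RunningRequests        int64   `yaml:"running-requests"`
	WaitingRequests        int64   `yaml:"waiting-requests"`
	KVCacheUsagePercentage float32 `yaml:"kv-cache-usage"`
	RequestSuccessTotal    int64   `yaml:"request-success-total"`
}

func main() {
	doc := []byte(`
request-success-total: 20
running-requests: 16
waiting-requests: 3
kv-cache-usage: 0.3
`)
	var m fakeMetrics
	if err := yaml.Unmarshal(doc, &m); err != nil {
		panic(err)
	}
	// Prints: {RunningRequests:16 WaitingRequests:3 KVCacheUsagePercentage:0.3 RequestSuccessTotal:20}
	fmt.Printf("%+v\n", m)
}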
pkg/llm-d-inference-sim/metrics.go

Lines changed: 128 additions & 1 deletion

@@ -20,6 +20,7 @@ package llmdinferencesim

 import (
 	"context"
+	"math"
 	"strconv"
 	"strings"
 	"sync"
@@ -94,6 +95,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}

+	s.requestPromptTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_prompt_tokens",
+			Help:      "Number of input prompt tokens in the request.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestPromptTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_prompt_tokens histogram register failed")
+		return err
+	}
+
+	s.requestGenerationTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_generation_tokens",
+			Help:      "Number of generation tokens processed.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestGenerationTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_generation_tokens histogram register failed")
+		return err
+	}
+
+	s.requestParamsMaxTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_params_max_tokens",
+			Help:      "Histogram of the max_tokens request parameter.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestParamsMaxTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_params_max_tokens histogram register failed")
+		return err
+	}
+
+	s.requestSuccessTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      "vllm:request_success_total",
+			Help:      "Count of successfully processed requests.",
+		},
+		[]string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
+	)
+	if err := s.registry.Register(s.requestSuccessTotal); err != nil {
+		s.logger.Error(err, "Prometheus request_success_total counter register failed")
+		return err
+	}
+
 	s.setInitialPrometheusMetrics()

 	return nil
@@ -102,16 +158,18 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 // setInitialPrometheusMetrics sends the default values to prometheus or
 // the fake metrics if set
 func (s *VllmSimulator) setInitialPrometheusMetrics() {
-	var nRunningReqs, nWaitingReqs, kvCacheUsage float64
+	var nRunningReqs, nWaitingReqs, kvCacheUsage, requestSuccessTotal float64
 	if s.config.FakeMetrics != nil {
 		nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
 		nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
 		kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
+		requestSuccessTotal = float64(s.config.FakeMetrics.RequestSuccessTotal)
 	}
 	modelName := s.getDisplayedModelName(s.config.Model)
 	s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
 	s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
 	s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
+	s.requestSuccessTotal.WithLabelValues(modelName, "stop").Add(requestSuccessTotal)

 	if s.config.FakeMetrics != nil && len(s.config.FakeMetrics.LoraMetrics) != 0 {
 		for _, metrics := range s.config.FakeMetrics.LoraMetrics {
@@ -198,6 +256,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
 	go s.runningRequestsUpdater(ctx)
 	go s.lorasUpdater(ctx)
 	go s.kvCacheUsageUpdater(ctx)
+	go s.recordRequestUpdater(ctx)
 }

 // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -282,3 +341,71 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
 		s.logger.Error(nil, "Zero model reference", "model", lora)
 	}
 }
+
+// recordRequestUpdater listens on requestSuccessChan and records request-level
+// metrics for each successfully completed request.
+func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) {
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case event := <-s.requestSuccessChan:
+			s.recordRequestMetricsOnSuccess(
+				event.PromptTokens,
+				event.GenerationTokens,
+				event.MaxTokens,
+				event.FinishReason,
+			)
+		}
+	}
+}
+
+// requestSuccessEvent represents the data associated with a successfully completed request,
+// which is sent through the requestSuccessChan for asynchronous metrics recording.
+type requestSuccessEvent struct {
+	// PromptTokens is the number of input (prompt) tokens in the request
+	PromptTokens int
+	// GenerationTokens is the number of generated (output) tokens in the response
+	GenerationTokens int
+	// MaxTokens is the maximum number of tokens allowed for generation (if specified in the request)
+	MaxTokens *int64
+	// FinishReason indicates why the generation stopped (e.g., "stop", "length", "tool_calls")
+	FinishReason string
+}
+
+// recordRequestMetricsOnSuccess records metrics for a successfully completed request
+func (s *VllmSimulator) recordRequestMetricsOnSuccess(promptTokens,
+	generationTokens int, maxTokens *int64, finishReason string) {
+	modelName := s.getDisplayedModelName(s.config.Model)
+	s.requestPromptTokens.WithLabelValues(modelName).Observe(float64(promptTokens))
+	s.requestGenerationTokens.WithLabelValues(modelName).Observe(float64(generationTokens))
+	if maxTokens != nil {
+		s.requestParamsMaxTokens.WithLabelValues(modelName).Observe(float64(*maxTokens))
+	}
+	s.requestSuccessTotal.WithLabelValues(modelName, finishReason).Inc()
+}
+
+// build125Buckets generates histogram buckets in powers of 10 scaled by [1, 2, 5].
+// This matches vLLM's build_1_2_5_buckets() in metrics.py.
+//
+// Reference: https://github.com/vllm-project/vllm/blob/main/vllm/engine/metrics.py#L175
+func build125Buckets(maxValue int) []float64 {
+	var buckets []float64
+	exponent := 0
+	mantissa := []int{1, 2, 5}
+
+	for {
+		complete := true
+		for _, m := range mantissa {
+			value := m * int(math.Pow10(exponent))
+			if value <= maxValue {
+				buckets = append(buckets, float64(value))
+				complete = false
+			}
+		}
+		if complete {
+			break
+		}
+		exponent++
+	}
+	return buckets
+}

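For a sense of what these additions produce at runtime, here is a self-contained sketch (not part of the commit) that mirrors build125Buckets and registers one of the new histograms with client_golang. The max-model-len of 1024 and the "my-model" label value are assumptions made for illustration:

package main

import (
	"fmt"
	"math"

	"github.com/prometheus/client_golang/prometheus"
)

// build125Buckets mirrors the helper added in metrics.go: bucket bounds are
// 1, 2 and 5 scaled by powers of 10, up to and including maxValue.
func build125Buckets(maxValue int) []float64 {
	var buckets []float64
	for exponent := 0; ; exponent++ {
		complete := true
		for _, m := range []int{1, 2, 5} {
			if v := m * int(math.Pow10(exponent)); v <= maxValue {
				buckets = append(buckets, float64(v))
				complete = false
			}
		}
		if complete {
			return buckets
		}
	}
}

func main() {
	// Assume a max-model-len of 1024; the resulting bounds are
	// [1 2 5 10 20 50 100 200 500 1000].
	fmt.Println(build125Buckets(1024))

	reg := prometheus.NewRegistry()
	hist := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "vllm:request_prompt_tokens",
		Help:    "Number of input prompt tokens in the request.",
		Buckets: build125Buckets(1024),
	}, []string{"model_name"})
	reg.MustRegister(hist)

	// One observation for a placeholder model label.
	hist.WithLabelValues("my-model").Observe(42)

	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		m := mf.GetMetric()[0]
		fmt.Printf("%s: %d explicit buckets, %d sample(s)\n",
			mf.GetName(), len(m.GetHistogram().GetBucket()), m.GetHistogram().GetSampleCount())
	}
}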
pkg/llm-d-inference-sim/metrics_test.go

Lines changed: 76 additions & 0 deletions

@@ -22,11 +22,13 @@ import (
 	"io"
 	"net/http"
 	"os"
+	"reflect"
 	"regexp"
 	"sort"
 	"strconv"
 	"strings"
 	"sync"
+	"testing"
 	"time"

 	"github.com/llm-d/llm-d-inference-sim/pkg/common"
@@ -572,3 +574,77 @@ func splitString(str string) []string {
 	}
 	return strings.Split(str, ",")
 }
+
+// TestBuild125Buckets tests the build125Buckets function with various inputs.
+func TestBuild125Buckets(t *testing.T) {
+	tests := []struct {
+		name     string
+		maxValue int
+		want     []float64
+	}{
+		{
+			name:     "max_value zero",
+			maxValue: 0,
+			want:     nil, // no bucket <= 0
+		},
+		{
+			name:     "max_value one",
+			maxValue: 1,
+			want:     []float64{1},
+		},
+		{
+			name:     "max_value five",
+			maxValue: 5,
+			want:     []float64{1, 2, 5},
+		},
+		{
+			name:     "max_value ten",
+			maxValue: 10,
+			want:     []float64{1, 2, 5, 10},
+		},
+		{
+			name:     "max_value 100",
+			maxValue: 100,
+			want:     []float64{1, 2, 5, 10, 20, 50, 100},
+		},
+		{
+			name:     "max_value 999",
+			maxValue: 999,
+			want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500},
+		},
+		{
+			name:     "max_value 1024",
+			maxValue: 1024,
+			want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000},
+		},
+		{
+			name:     "max_value 4096",
+			maxValue: 4096,
+			want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000},
+		},
+		{
+			name:     "max_value 32768",
+			maxValue: 32768,
+			want:     []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000},
+		},
+		{
+			name:     "max_value just below power of 10",
+			maxValue: 99,
+			want:     []float64{1, 2, 5, 10, 20, 50},
+		},
+		{
+			name:     "max_value negative",
+			maxValue: -1,
+			want:     nil, // no positive bucket <= -1
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := build125Buckets(tt.maxValue)
+			if !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("build125Buckets(%d) = %v, want %v", tt.maxValue, got, tt.want)
+			}
+		})
+	}
+}

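The table-driven test above can be run on its own with go test ./pkg/llm-d-inference-sim/ -run TestBuild125Buckets -v, using the package path shown in the diff.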
pkg/llm-d-inference-sim/simulator.go

Lines changed: 21 additions & 1 deletion

@@ -92,6 +92,8 @@ type VllmSimulator struct {
 	nRunningReqs int64
 	// runReqChan is a channel to update nRunningReqs
 	runReqChan chan int64
+	// requestSuccessChan is a channel for reporting successfully finished requests for metrics recording
+	requestSuccessChan chan requestSuccessEvent
 	// nWaitingReqs is the number of inference requests that are waiting to be processed
 	nWaitingReqs int64
 	// waitingReqChan is a channel to update nWaitingReqs
@@ -108,6 +110,14 @@ type VllmSimulator struct {
 	waitingRequests *prometheus.GaugeVec
 	// kvCacheUsagePercentage is prometheus gauge
 	kvCacheUsagePercentage *prometheus.GaugeVec
+	// requestPromptTokens is prometheus histogram for number of input (prompt) tokens in request
+	requestPromptTokens *prometheus.HistogramVec
+	// requestGenerationTokens is prometheus histogram for number of generated tokens in request
+	requestGenerationTokens *prometheus.HistogramVec
+	// requestParamsMaxTokens is prometheus histogram for 'max_tokens' parameter in request
+	requestParamsMaxTokens *prometheus.HistogramVec
+	// requestSuccessTotal is prometheus counter for total number of successful requests
+	requestSuccessTotal *prometheus.CounterVec
 	// channel for requests to be passed to workers
 	reqChan chan *openaiserverapi.CompletionReqCtx
 	// schema validator for tools parameters
@@ -597,9 +607,19 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
 				// in case this is prefill pod processing, return special finish reason
 				finishReason = common.RemoteDecodeFinishReason
 			}
-
 			s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData)
 		}
+		select {
+		case s.requestSuccessChan <- requestSuccessEvent{
+			PromptTokens:     usageData.PromptTokens,
+			GenerationTokens: usageData.CompletionTokens,
+			MaxTokens:        reqCtx.CompletionReq.GetMaxCompletionTokens(),
+			FinishReason:     finishReason,
+		}:
+		default:
+			// Non-blocking: if the channel is full, drop the event so request processing is not affected.
+			s.logger.V(1).Info("metricsChan full, dropping success event")
+		}
 	}
 	reqCtx.Wg.Done()
 }

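The select/default above is the producer side of the hand-off to recordRequestUpdater in metrics.go. Below is a self-contained sketch (not part of the commit) of the same non-blocking pattern; the channel capacity, field values, and package main wrapper are assumptions made for illustration:

package main

import (
	"context"
	"fmt"
	"time"
)

// requestSuccessEvent mirrors the struct added in metrics.go; only the fields
// needed for this sketch are shown.
type requestSuccessEvent struct {
	PromptTokens     int
	GenerationTokens int
	FinishReason     string
}

func main() {
	// Buffered channel: the request-processing worker never blocks on metrics recording.
	events := make(chan requestSuccessEvent, 8)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Consumer, analogous to recordRequestUpdater.
	go func() {
		for {
			select {
			case <-ctx.Done():
				return
			case ev := <-events:
				fmt.Printf("record: prompt=%d gen=%d reason=%s\n",
					ev.PromptTokens, ev.GenerationTokens, ev.FinishReason)
			}
		}
	}()

	// Producer side, analogous to reqProcessingWorker: non-blocking send,
	// dropping the event if the channel is full.
	ev := requestSuccessEvent{PromptTokens: 12, GenerationTokens: 34, FinishReason: "stop"}
	select {
	case events <- ev:
	default:
		fmt.Println("metrics channel full, dropping success event")
	}

	time.Sleep(50 * time.Millisecond) // give the consumer time to print
}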
pkg/vllm-api/vllm-models.go

Lines changed: 1 addition & 0 deletions

@@ -25,6 +25,7 @@ const (
 	PromLabelRunningLoraAdapters = "running_lora_adapters"
 	PromLabelMaxLora             = "max_lora"
 	PromLabelModelName           = "model_name"
+	PromLabelFinishReason        = "finish_reason"

 	VllmLoraRequestInfo    = "vllm:lora_requests_info"
 	VllmNumRequestsRunning = "vllm:num_requests_running"

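With the new finish_reason label, vllm:request_success_total exposes one series per (finish_reason, model_name) pair. For the fake-metrics example above, a scrape of /metrics would include a line of roughly this form (the model name here is a placeholder): vllm:request_success_total{finish_reason="stop",model_name="<configured-model>"} 20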