@@ -20,6 +20,7 @@ package llmdinferencesim

 import (
 	"context"
+	"math"
 	"strconv"
 	"strings"
 	"sync"
@@ -94,6 +95,61 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 		return err
 	}

+	s.requestPromptTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_prompt_tokens",
+			Help:      "Number of input prompt tokens in the request.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestPromptTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_prompt_tokens histogram register failed")
+		return err
+	}
+
+	s.requestGenerationTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_generation_tokens",
+			Help:      "Number of generation tokens processed.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestGenerationTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_generation_tokens histogram register failed")
+		return err
+	}
+
+	s.requestParamsMaxTokens = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: "",
+			Name:      "vllm:request_params_max_tokens",
+			Help:      "Histogram of the max_tokens request parameter.",
+			Buckets:   build125Buckets(s.config.MaxModelLen),
+		},
+		[]string{vllmapi.PromLabelModelName},
+	)
+	if err := s.registry.Register(s.requestParamsMaxTokens); err != nil {
+		s.logger.Error(err, "Prometheus request_params_max_tokens histogram register failed")
+		return err
+	}
+
+	s.requestSuccessTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: "",
+			Name:      "vllm:request_success_total",
+			Help:      "Count of successfully processed requests.",
+		},
+		[]string{vllmapi.PromLabelModelName, vllmapi.PromLabelFinishReason},
+	)
+	if err := s.registry.Register(s.requestSuccessTotal); err != nil {
+		s.logger.Error(err, "Prometheus request_success_total counter register failed")
+		return err
+	}
+
 	s.setInitialPrometheusMetrics()

 	return nil
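
A side note on the register-then-observe pattern used above: a collector can be added to a registry only once, which is why every Register call in this hunk is error-checked. The following is a minimal standalone sketch of that pattern, not part of the commit; the literal "model_name" label is an assumption standing in for vllmapi.PromLabelModelName, and the bucket list is hard-coded where the simulator calls build125Buckets.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	registry := prometheus.NewRegistry()

	promptTokens := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "vllm:request_prompt_tokens",
			Help:    "Number of input prompt tokens in the request.",
			Buckets: []float64{1, 2, 5, 10, 20, 50, 100, 200, 500, 1000},
		},
		[]string{"model_name"}, // assumed value of vllmapi.PromLabelModelName
	)
	if err := registry.Register(promptTokens); err != nil {
		// Register returns an error on duplicate or inconsistent
		// registration, which is why the simulator checks every call.
		fmt.Println("register failed:", err)
		return
	}

	// One request with a 42-token prompt: the observation lands in every
	// bucket with le >= 42 (50, 100, 200, 500, 1000, +Inf).
	promptTokens.WithLabelValues("my-model").Observe(42)

	metricFamilies, err := registry.Gather()
	if err != nil {
		fmt.Println("gather failed:", err)
		return
	}
	for _, mf := range metricFamilies {
		fmt.Println(mf.GetName()) // vllm:request_prompt_tokens
	}
}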
@@ -102,16 +158,18 @@ func (s *VllmSimulator) createAndRegisterPrometheus() error {
 // setInitialPrometheusMetrics sends the default values to prometheus or
 // the fake metrics if set
 func (s *VllmSimulator) setInitialPrometheusMetrics() {
-	var nRunningReqs, nWaitingReqs, kvCacheUsage float64
+	var nRunningReqs, nWaitingReqs, kvCacheUsage, requestSuccessTotal float64
 	if s.config.FakeMetrics != nil {
 		nRunningReqs = float64(s.config.FakeMetrics.RunningRequests)
 		nWaitingReqs = float64(s.config.FakeMetrics.WaitingRequests)
 		kvCacheUsage = float64(s.config.FakeMetrics.KVCacheUsagePercentage)
+		requestSuccessTotal = float64(s.config.FakeMetrics.RequestSuccessTotal)
 	}
 	modelName := s.getDisplayedModelName(s.config.Model)
 	s.runningRequests.WithLabelValues(modelName).Set(nRunningReqs)
 	s.waitingRequests.WithLabelValues(modelName).Set(nWaitingReqs)
 	s.kvCacheUsagePercentage.WithLabelValues(modelName).Set(kvCacheUsage)
+	s.requestSuccessTotal.WithLabelValues(modelName, "stop").Add(requestSuccessTotal)

 	if s.config.FakeMetrics != nil && len(s.config.FakeMetrics.LoraMetrics) != 0 {
 		for _, metrics := range s.config.FakeMetrics.LoraMetrics {
@@ -198,6 +256,7 @@ func (s *VllmSimulator) startMetricsUpdaters(ctx context.Context) {
 	go s.runningRequestsUpdater(ctx)
 	go s.lorasUpdater(ctx)
 	go s.kvCacheUsageUpdater(ctx)
+	go s.recordRequestUpdater(ctx)
 }

 // waitingRequestsUpdater updates the waiting requests metric by listening on the relevant channel
@@ -282,3 +341,71 @@ func (s *VllmSimulator) decrementLoraRefCount(lora string, theMap *sync.Map) {
 		s.logger.Error(nil, "Zero model reference", "model", lora)
 	}
 }
+
+// recordRequestUpdater listens on requestSuccessChan and records metrics for each successfully completed request
+func (s *VllmSimulator) recordRequestUpdater(ctx context.Context) {
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case event := <-s.requestSuccessChan:
+			s.recordRequestMetricsOnSuccess(
+				event.PromptTokens,
+				event.GenerationTokens,
+				event.MaxTokens,
+				event.FinishReason,
+			)
+		}
+	}
+}
+
+// requestSuccessEvent represents the data associated with a successfully completed request,
+// which is sent through the requestSuccessChan for asynchronous metrics recording.
+type requestSuccessEvent struct {
+	// PromptTokens is the number of input (prompt) tokens in the request
+	PromptTokens int
+	// GenerationTokens is the number of generated (output) tokens in the response
+	GenerationTokens int
+	// MaxTokens is the maximum number of tokens allowed for generation (if specified in the request)
+	MaxTokens *int64
+	// FinishReason indicates why the generation stopped (e.g., "stop", "length", "tool_calls")
+	FinishReason string
+}
+
375
+ // recordRequestMetricsOnSuccess records metrics for a successfully completed request
376
+ func (s * VllmSimulator ) recordRequestMetricsOnSuccess (promptTokens ,
377
+ generationTokens int , maxTokens * int64 , finishReason string ) {
378
+ modelName := s .getDisplayedModelName (s .config .Model )
379
+ s .requestPromptTokens .WithLabelValues (modelName ).Observe (float64 (promptTokens ))
380
+ s .requestGenerationTokens .WithLabelValues (modelName ).Observe (float64 (generationTokens ))
381
+ if maxTokens != nil {
382
+ s .requestParamsMaxTokens .WithLabelValues (modelName ).Observe (float64 (* maxTokens ))
383
+ }
384
+ s .requestSuccessTotal .WithLabelValues (modelName , finishReason ).Inc ()
385
+ }
+
+// build125Buckets generates histogram buckets in powers of 10 scaled by [1, 2, 5].
+// This matches vLLM's build_1_2_5_buckets() in metrics.py.
+//
+// Reference: https://github.com/vllm-project/vllm/blob/main/vllm/engine/metrics.py#L175
+func build125Buckets(maxValue int) []float64 {
+	var buckets []float64
+	exponent := 0
+	mantissa := []int{1, 2, 5}
+
+	for {
+		complete := true
+		for _, m := range mantissa {
+			value := m * int(math.Pow10(exponent))
+			if value <= maxValue {
+				buckets = append(buckets, float64(value))
+				complete = false
+			}
+		}
+		if complete {
+			break
+		}
+		exponent++
+	}
+	return buckets
+}
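
To make the bucket layout concrete, here is build125Buckets run standalone. The function body is copied from the diff above; the main wrapper and the sample context-window sizes are added purely for illustration.

package main

import (
	"fmt"
	"math"
)

// Copied from the diff: powers of 10 scaled by 1, 2 and 5, capped at maxValue.
func build125Buckets(maxValue int) []float64 {
	var buckets []float64
	exponent := 0
	mantissa := []int{1, 2, 5}
	for {
		complete := true
		for _, m := range mantissa {
			value := m * int(math.Pow10(exponent))
			if value <= maxValue {
				buckets = append(buckets, float64(value))
				complete = false
			}
		}
		if complete {
			break
		}
		exponent++
	}
	return buckets
}

func main() {
	// With a 1024-token context window:
	fmt.Println(build125Buckets(1024))
	// Prints: [1 2 5 10 20 50 100 200 500 1000]

	// With a 4096-token context window:
	fmt.Println(build125Buckets(4096))
	// Prints: [1 2 5 10 20 50 100 200 500 1000 2000]
}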
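The diff adds only the consumer side of requestSuccessChan; the send site lives elsewhere in the simulator's request handling and is not shown here. The following self-contained sketch illustrates the intended flow under that assumption: the local channel stands in for the VllmSimulator field, and all field values are illustrative.

package main

import "fmt"

// requestSuccessEvent mirrors the struct added in the diff.
type requestSuccessEvent struct {
	PromptTokens     int
	GenerationTokens int
	MaxTokens        *int64
	FinishReason     string
}

func main() {
	// Stand-in for s.requestSuccessChan; in the simulator this is a field
	// on VllmSimulator drained by the recordRequestUpdater goroutine.
	requestSuccessChan := make(chan requestSuccessEvent, 1)

	// Producer side (hypothetical: the real send site is not in this diff).
	maxTokens := int64(256)
	requestSuccessChan <- requestSuccessEvent{
		PromptTokens:     128,
		GenerationTokens: 56,
		MaxTokens:        &maxTokens, // nil when the request sets no max_tokens
		FinishReason:     "stop",
	}

	// Consumer side, as recordRequestUpdater would see it before calling
	// recordRequestMetricsOnSuccess.
	event := <-requestSuccessChan
	fmt.Printf("prompt=%d generated=%d finish=%s\n",
		event.PromptTokens, event.GenerationTokens, event.FinishReason)
}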