
Commit 8b162c9

[V1][Metrics] Add iteration_tokens_total histogram from V0
Basing bucket sizes on cudagraph capture sizes was introduced in PRs vllm-project#11031 and vllm-project#12243.

Signed-off-by: Mark McLoughlin <[email protected]>
1 parent 83481ce commit 8b162c9
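
As a rough orientation for the new metric (an illustration, not part of the commit): each engine step observes the total of prompt and generation tokens it processed in that step, so prefill-heavy steps land in large buckets while pure decode steps land near the running batch size. The numbers below are hypothetical.

# Hypothetical figures for illustration only.
num_requests = 2
prompt_tokens_per_request = 100

# A prefill step that schedules both prompts together observes their full length.
prefill_observation = num_requests * prompt_tokens_per_request + 0   # 200

# A later decode step emits one new token per running request.
decode_observation = 0 + num_requests                                 # 2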

File tree

3 files changed: +35 -8 lines changed


tests/entrypoints/openai/test_metrics.py

Lines changed: 10 additions & 3 deletions
@@ -96,9 +96,14 @@ async def client(server):
     [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
      ("_count", _NUM_REQUESTS)],
     "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
-    "vllm:request_params_max_tokens":
-    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-     ("_count", _NUM_REQUESTS)],
+    "vllm:request_params_max_tokens": [
+        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+        ("_count", _NUM_REQUESTS)
+    ],
+    "vllm:iteration_tokens_total":
+    [("_sum", _NUM_REQUESTS *
+      (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST)),
+     ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)],
     "vllm:prompt_tokens": [("_total",
                             _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
     "vllm:generation_tokens": [
@@ -197,6 +202,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:request_params_max_tokens_sum",
     "vllm:request_params_max_tokens_bucket",
     "vllm:request_params_max_tokens_count",
+    "vllm:iteration_tokens_total",
     "vllm:num_preemptions_total",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
@@ -223,6 +229,7 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:gpu_prefix_cache_hits",
     "vllm:prompt_tokens_total",
     "vllm:generation_tokens_total",
+    "vllm:iteration_tokens_total",
     "vllm:request_success_total",
     "vllm:request_prompt_tokens_sum",
     "vllm:request_prompt_tokens_bucket",

vllm/v1/engine/async_llm.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ def __init__(
         if self.log_stats:
             self.stat_loggers.extend([
                 LoggingStatLogger(),
-                PrometheusStatLogger(vllm_config.model_config),
+                PrometheusStatLogger(vllm_config),
             ])
 
         # Tokenizer (+ ensure liveness if running in another process).

vllm/v1/metrics/loggers.py

Lines changed: 24 additions & 4 deletions
@@ -7,7 +7,7 @@
 import numpy as np
 import prometheus_client
 
-from vllm.config import ModelConfig
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
 from vllm.v1.engine import FinishReason
@@ -92,13 +92,13 @@ def log(self, scheduler_stats: SchedulerStats,
 
 class PrometheusStatLogger(StatLoggerBase):
 
-    def __init__(self, model_config: ModelConfig):
+    def __init__(self, vllm_config: VllmConfig):
         self._unregister_vllm_metrics()
 
         labelnames = ["model_name"]
-        labelvalues = [model_config.served_model_name]
+        labelvalues = [vllm_config.model_config.served_model_name]
 
-        max_model_len = model_config.max_model_len
+        max_model_len = vllm_config.model_config.max_model_len
 
         self.gauge_scheduler_running = prometheus_client.Gauge(
             name="vllm:num_requests_running",
@@ -162,6 +162,13 @@ def __init__(self, model_config: ModelConfig):
             buckets=build_1_2_5_buckets(max_model_len),
             labelnames=labelnames).labels(*labelvalues)
 
+        self.histogram_iteration_tokens = \
+            prometheus_client.Histogram(
+                name="vllm:iteration_tokens_total",
+                documentation="Histogram of number of tokens per engine_step.",
+                buckets=build_cudagraph_buckets(vllm_config),
+                labelnames=labelnames).labels(*labelvalues)
+
         self.histogram_time_to_first_token = \
             prometheus_client.Histogram(
                 name="vllm:time_to_first_token_seconds",
@@ -237,6 +244,9 @@ def log(self, scheduler_stats: SchedulerStats,
         self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
         self.counter_generation_tokens.inc(
             iteration_stats.num_generation_tokens)
+        self.histogram_iteration_tokens.observe(
+            iteration_stats.num_prompt_tokens + \
+            iteration_stats.num_generation_tokens)
 
         for finished_request in iteration_stats.finished_requests:
             self.counter_request_success[finished_request.finish_reason].inc()
@@ -293,3 +303,13 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
         [1, 2, 5, 10, 20, 50, 100]
     """
     return build_buckets([1, 2, 5], max_value)
+
+
+def build_cudagraph_buckets(vllm_config: VllmConfig) -> List[int]:
+    if not vllm_config.model_config.enforce_eager:
+        buckets = vllm_config.compilation_config.\
+            cudagraph_capture_sizes.copy()
+        buckets.sort()
+        return buckets
+    else:
+        return [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]
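
A standalone sketch of the bucket-selection idea above, with made-up capture sizes (real values come from vllm_config.compilation_config.cudagraph_capture_sizes) and a throwaway metric name:

# Sketch only; capture sizes and the model name are hypothetical.
from typing import List

import prometheus_client


def cudagraph_buckets_sketch(capture_sizes: List[int],
                             enforce_eager: bool) -> List[int]:
    # Mirror build_cudagraph_buckets: sorted capture sizes when cudagraphs
    # are in use, otherwise a fixed fallback list.
    if not enforce_eager:
        return sorted(capture_sizes)
    return [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]


buckets = cudagraph_buckets_sketch([1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
                                   enforce_eager=False)

histogram = prometheus_client.Histogram(
    name="iteration_tokens_sketch",
    documentation="Tokens processed per engine step (sketch).",
    buckets=buckets,
    labelnames=["model_name"]).labels("example-model")

histogram.observe(96)  # e.g. one engine step that handled 96 tokens in total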
