@@ -7,7 +7,7 @@
 import numpy as np
 import prometheus_client

-from vllm.config import ModelConfig
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
 from vllm.v1.engine import FinishReason
@@ -92,13 +92,13 @@ def log(self, scheduler_stats: SchedulerStats,

 class PrometheusStatLogger(StatLoggerBase):

-    def __init__(self, model_config: ModelConfig):
+    def __init__(self, vllm_config: VllmConfig):
         self._unregister_vllm_metrics()

         labelnames = ["model_name"]
-        labelvalues = [model_config.served_model_name]
+        labelvalues = [vllm_config.model_config.served_model_name]

-        max_model_len = model_config.max_model_len
+        max_model_len = vllm_config.model_config.max_model_len

         self.gauge_scheduler_running = prometheus_client.Gauge(
             name="vllm:num_requests_running",
@@ -162,6 +162,13 @@ def __init__(self, model_config: ModelConfig):
                 buckets=build_1_2_5_buckets(max_model_len),
                 labelnames=labelnames).labels(*labelvalues)

+        self.histogram_iteration_tokens = \
+            prometheus_client.Histogram(
+                name="vllm:iteration_tokens_total",
+                documentation="Histogram of number of tokens per engine_step.",
+                buckets=build_cudagraph_buckets(vllm_config),
+                labelnames=labelnames).labels(*labelvalues)
+
         self.histogram_time_to_first_token = \
             prometheus_client.Histogram(
                 name="vllm:time_to_first_token_seconds",
@@ -237,6 +244,9 @@ def log(self, scheduler_stats: SchedulerStats,
         self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
         self.counter_generation_tokens.inc(
             iteration_stats.num_generation_tokens)
+        self.histogram_iteration_tokens.observe(
+            iteration_stats.num_prompt_tokens + \
+            iteration_stats.num_generation_tokens)

         for finished_request in iteration_stats.finished_requests:
             self.counter_request_success[finished_request.finish_reason].inc()
@@ -293,3 +303,13 @@ def build_1_2_5_buckets(max_value: int) -> List[int]:
     [1, 2, 5, 10, 20, 50, 100]
     """
     return build_buckets([1, 2, 5], max_value)
+
+
+def build_cudagraph_buckets(vllm_config: VllmConfig) -> List[int]:
+    if not vllm_config.model_config.enforce_eager:
+        buckets = vllm_config.compilation_config.\
+            cudagraph_capture_sizes.copy()
+        buckets.sort()
+        return buckets
+    else:
+        return [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]
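
For reviewers who want to poke at the bucket selection outside vLLM, here is a minimal, runnable sketch of what build_cudagraph_buckets does. The _ModelConfig/_CompilationConfig/_VllmConfig stubs are hypothetical stand-ins for vLLM's real config objects, carrying only the three attributes the patch reads, and the capture sizes are made-up values; the function body mirrors the one added above.

from dataclasses import dataclass, field
from typing import List


# Hypothetical stand-ins for vLLM's real config objects, for illustration only.
@dataclass
class _ModelConfig:
    enforce_eager: bool = False


@dataclass
class _CompilationConfig:
    # Sizes that CUDA graphs were captured for; may be stored largest-first.
    cudagraph_capture_sizes: List[int] = field(
        default_factory=lambda: [512, 256, 128, 64, 32, 16, 8, 4, 2, 1])


@dataclass
class _VllmConfig:
    model_config: _ModelConfig = field(default_factory=_ModelConfig)
    compilation_config: _CompilationConfig = field(
        default_factory=_CompilationConfig)


def build_cudagraph_buckets(vllm_config: _VllmConfig) -> List[int]:
    # Same logic as the patch: reuse the cudagraph capture sizes as histogram
    # bucket edges, sorted ascending because prometheus_client requires it.
    if not vllm_config.model_config.enforce_eager:
        buckets = vllm_config.compilation_config.cudagraph_capture_sizes.copy()
        buckets.sort()
        return buckets
    # Eager mode captures no graphs, so fall back to a fixed ladder.
    return [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]


config = _VllmConfig()
print(build_cudagraph_buckets(config))
# -> [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]

config.model_config.enforce_eager = True
print(build_cudagraph_buckets(config))
# -> [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]

The explicit sort matters: prometheus_client raises if histogram buckets are not in increasing order, and the capture-size list is not guaranteed to be ascending.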
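And a hedged end-to-end look at the metric itself: the snippet below registers the same histogram on a throwaway registry and records one engine step, mirroring the observe() call added to log(). The bucket list is the patch's eager-mode fallback; "my-model" is a placeholder label value, not anything vLLM sets.

import prometheus_client

# Throwaway registry so the example cannot collide with a live server.
registry = prometheus_client.CollectorRegistry()

histogram_iteration_tokens = prometheus_client.Histogram(
    name="vllm:iteration_tokens_total",
    documentation="Histogram of number of tokens per engine_step.",
    buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096],
    labelnames=["model_name"],
    registry=registry).labels("my-model")

# One engine step: say, 37 prompt tokens prefilled plus 5 decode tokens.
histogram_iteration_tokens.observe(37 + 5)

# 42 falls into the le="64" bucket (and, cumulatively, every larger one).
print(prometheus_client.generate_latest(registry).decode())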