@@ -1,7 +1,7 @@
 import time
 from typing import Iterable, List, Optional, Type, Union

-from transformers import PreTrainedTokenizer
+from transformers import GenerationConfig, PreTrainedTokenizer

 import vllm
 from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
@@ -34,6 +34,17 @@
 _LOCAL_LOGGING_INTERVAL_SEC = 5


+def _load_generation_config_dict(model_config: ModelConfig):
+    try:
+        return GenerationConfig.from_pretrained(
+            model_config.model,
+            revision=model_config.revision,
+        ).to_diff_dict()
+    except OSError:
+        # Not found.
+        return {}
+
+
 class LLMEngine:
     """An LLM engine that receives requests and generates texts.

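For reference, the helper added above leans on two Hugging Face `transformers` APIs: `GenerationConfig.from_pretrained` loads the model's `generation_config.json` (raising `OSError` when the checkpoint ships without one, hence the empty-dict fallback), and `to_diff_dict()` keeps only the fields that differ from the library defaults. A minimal sketch of the kind of dict the engine ends up caching; the model name is purely illustrative:

```python
from transformers import GenerationConfig

# Illustrative model name; any checkpoint that ships a generation_config.json
# behaves the same way.
gen_cfg = GenerationConfig.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

# to_diff_dict() drops fields that still equal the transformers defaults,
# leaving something like {'bos_token_id': 1, 'eos_token_id': 2}.
print(gen_cfg.to_diff_dict())
```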
@@ -124,6 +135,8 @@ def __init__(
         self._init_tokenizer()
         self.detokenizer = Detokenizer(self.tokenizer)
         self.seq_counter = Counter()
+        self.generation_config_fields = _load_generation_config_dict(
+            model_config)

         self.model_executor = executor_class(
             model_config=model_config,
@@ -391,6 +404,8 @@ def add_request(
         # inject the eos token id into the sampling_params to support min_tokens
         # processing
         sampling_params.eos_token_id = seq.eos_token_id
+        sampling_params.update_from_generation_config(
+            self.generation_config_fields)

         # Create the sequence group.
         seq_group = SequenceGroup(request_id, [seq], sampling_params,
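The call added above is the point where the cached generation-config fields influence each request. The sketch below only illustrates the idea; it is not vLLM's actual `SamplingParams.update_from_generation_config` implementation. It uses a hypothetical `ToySamplingParams` class that folds an `eos_token_id` declared in `generation_config.json` into the request's stop tokens:

```python
# Hypothetical sketch of the idea, NOT vLLM's real SamplingParams:
# fold generation-config defaults into per-request sampling settings.
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


@dataclass
class ToySamplingParams:
    eos_token_id: Optional[int] = None
    stop_token_ids: List[int] = field(default_factory=list)

    def update_from_generation_config(
            self, generation_config: Dict[str, Any]) -> None:
        # generation_config.json may declare one eos token or a list of them;
        # treat them as extra stop tokens for this request.
        eos_ids: Union[int, List[int], None] = generation_config.get("eos_token_id")
        if eos_ids is None:
            return
        if isinstance(eos_ids, int):
            eos_ids = [eos_ids]
        self.stop_token_ids = sorted(set(self.stop_token_ids) | set(eos_ids))


params = ToySamplingParams(eos_token_id=2)
params.update_from_generation_config({"eos_token_id": [2, 32000]})
print(params.stop_token_ids)  # [2, 32000]
```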
@@ -435,7 +450,7 @@ def _process_model_outputs(
             scheduled_seq_groups: List[SequenceGroup],
             ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]:
         """Apply the model output to the sequences in the scheduled seq groups.
-
+
         Returns RequestOutputs that can be returned to the client.
         """
