vllm/entrypoints/llm.py (10 changes: 6 additions & 4 deletions)
@@ -244,6 +244,7 @@ def __init__(
             engine_args, usage_context=UsageContext.LLM_CLASS)

         self.request_counter = Counter()
+        self.default_sampling_params: Union[dict[str, Any], None] = None

     @staticmethod
     def get_engine_class() -> type[LLMEngine]:
@@ -268,10 +269,11 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
             tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer)

     def get_default_sampling_params(self) -> SamplingParams:
-        diff_sampling_param = (
-            self.llm_engine.model_config.get_diff_sampling_param())
-        if diff_sampling_param:
-            return SamplingParams.from_optional(**diff_sampling_param)
+        if self.default_sampling_params is None:
+            self.default_sampling_params = (
+                self.llm_engine.model_config.get_diff_sampling_param())
+        if self.default_sampling_params:
+            return SamplingParams.from_optional(**self.default_sampling_params)
         return SamplingParams()

     @overload
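Note on the `llm.py` change: `get_default_sampling_params()` now lazily caches the dict returned by `get_diff_sampling_param()` in `self.default_sampling_params`, so the model's generation-config overrides are read once per `LLM` instance rather than on every call. A minimal usage sketch (the model name and prompt are illustrative placeholders, not part of this PR):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model

# First call populates llm.default_sampling_params from the model's
# generation config; later calls reuse the cached dict.
params: SamplingParams = llm.get_default_sampling_params()
params.max_tokens = 32

outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)
```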
vllm/entrypoints/openai/serving_chat.py (14 changes: 6 additions & 8 deletions)
@@ -105,10 +105,11 @@ def __init__(
                                 "been registered") from e

         self.enable_prompt_tokens_details = enable_prompt_tokens_details
-        diff_sampling_param = self.model_config.get_diff_sampling_param()
-        if diff_sampling_param:
+        self.default_sampling_params = (
+            self.model_config.get_diff_sampling_param())
+        if self.default_sampling_params:
             logger.info("Overwriting default chat sampling param with: %s",
-                        diff_sampling_param)
+                        self.default_sampling_params)

     async def create_chat_completion(
         self,
@@ -210,17 +211,14 @@ async def create_chat_completion(
                 sampling_params: Union[SamplingParams, BeamSearchParams]
                 default_max_tokens = self.max_model_len - len(
                     engine_prompt["prompt_token_ids"])
-                # Build default sampling params
-                default_sampling_params = (
-                    self.model_config.get_diff_sampling_param())
                 if request.use_beam_search:
                     sampling_params = request.to_beam_search_params(
-                        default_max_tokens, default_sampling_params)
+                        default_max_tokens, self.default_sampling_params)
                 else:
                     sampling_params = request.to_sampling_params(
                         default_max_tokens,
                         self.model_config.logits_processor_pattern,
-                        default_sampling_params)
+                        self.default_sampling_params)

                 self._log_inputs(request_id,
                                  request_prompts[i],
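The serving-side files (serving_chat.py above, serving_completion.py and serving_transcription.py below) apply the same idea: `get_diff_sampling_param()` is called once in `__init__` and stored on `self.default_sampling_params`, and the per-request path reuses that attribute. A standalone sketch of the pattern, using simplified stand-in classes rather than the actual vLLM ones:

```python
from typing import Any, Optional


class FakeModelConfig:
    """Stand-in for vLLM's ModelConfig."""

    def get_diff_sampling_param(self) -> dict[str, Any]:
        # In vLLM this reads the model's generation config; assume it is
        # not free, so we only want to call it once.
        return {"temperature": 0.2, "top_p": 0.9}


class ServingStub:
    """Stand-in for the OpenAI serving classes touched by this PR."""

    def __init__(self, model_config: FakeModelConfig) -> None:
        self.model_config = model_config
        # Computed once here instead of on every request.
        self.default_sampling_params = model_config.get_diff_sampling_param()

    def build_request_params(
            self, overrides: Optional[dict[str, Any]] = None) -> dict[str, Any]:
        # Per-request path: merge cached defaults with request overrides.
        params = dict(self.default_sampling_params)
        params.update(overrides or {})
        return params


server = ServingStub(FakeModelConfig())
print(server.build_request_params({"top_p": 0.5}))
# -> {'temperature': 0.2, 'top_p': 0.5}
```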
vllm/entrypoints/openai/serving_completion.py (14 changes: 6 additions & 8 deletions)
@@ -51,11 +51,12 @@ def __init__(
                          models=models,
                          request_logger=request_logger,
                          return_tokens_as_token_ids=return_tokens_as_token_ids)
-        diff_sampling_param = self.model_config.get_diff_sampling_param()
-        if diff_sampling_param:
+        self.default_sampling_params = (
+            self.model_config.get_diff_sampling_param())
+        if self.default_sampling_params:
             logger.info(
                 "Overwriting default completion sampling param with: %s",
-                diff_sampling_param)
+                self.default_sampling_params)

     async def create_completion(
         self,
@@ -119,17 +120,14 @@ async def create_completion(
                 sampling_params: Union[SamplingParams, BeamSearchParams]
                 default_max_tokens = self.max_model_len - len(
                     engine_prompt["prompt_token_ids"])
-                # Build default sampling params
-                default_sampling_params = (
-                    self.model_config.get_diff_sampling_param())
                 if request.use_beam_search:
                     sampling_params = request.to_beam_search_params(
-                        default_max_tokens, default_sampling_params)
+                        default_max_tokens, self.default_sampling_params)
                 else:
                     sampling_params = request.to_sampling_params(
                         default_max_tokens,
                         self.model_config.logits_processor_pattern,
-                        default_sampling_params)
+                        self.default_sampling_params)

                 request_id_item = f"{request_id}-{i}"

vllm/entrypoints/openai/serving_transcription.py (10 changes: 5 additions & 5 deletions)
@@ -161,11 +161,12 @@ def __init__(
                          request_logger=request_logger,
                          return_tokens_as_token_ids=return_tokens_as_token_ids)

-        diff_sampling_param = self.model_config.get_diff_sampling_param()
-        if diff_sampling_param:
+        self.default_sampling_params = (
+            self.model_config.get_diff_sampling_param())
+        if self.default_sampling_params:
             logger.info(
                 "Overwriting default completion sampling param with: %s",
-                diff_sampling_param)
+                self.default_sampling_params)

     async def _preprocess_transcription(
         self,
@@ -273,9 +274,8 @@ async def create_transcription(
         try:
             # TODO(rob): subtract len of tokenized prompt.
             default_max_tokens = self.model_config.max_model_len
-            default_params = self.model_config.get_diff_sampling_param()
             sampling_params = request.to_sampling_params(
-                default_max_tokens, default_params)
+                default_max_tokens, self.default_sampling_params)

             self._log_inputs(
                 request_id,