@@ -585,6 +585,8 @@ def _validate_input(
585
585
(EmbeddingChatRequest , EmbeddingCompletionRequest ,
586
586
ScoreRequest , RerankRequest , ClassificationRequest )):
587
587
588
+ # Note: input length can be up to the entire model context length
589
+ # since these requests don't generate tokens.
588
590
if token_num > self .max_model_len :
589
591
operations : dict [type [AnyRequest ], str ] = {
590
592
ScoreRequest : "score" ,
@@ -613,21 +615,24 @@ def _validate_input(
613
615
max_tokens = request .max_completion_tokens or request .max_tokens
614
616
else :
615
617
max_tokens = getattr (request , "max_tokens" , None )
616
- if max_tokens is None :
617
- if token_num >= self .max_model_len :
618
- raise ValueError (
619
- f"This model's maximum context length is "
620
- f"{ self .max_model_len } tokens. However, you requested "
621
- f"{ token_num } tokens in the messages, "
622
- f"Please reduce the length of the messages." )
623
- elif token_num + max_tokens > self .max_model_len :
618
+
619
+ # Note: input length can be up to model context length - 1 for
620
+ # completion-like requests.
621
+ if token_num >= self .max_model_len :
624
622
raise ValueError (
625
623
f"This model's maximum context length is "
626
- f"{ self .max_model_len } tokens. However, you requested "
627
- f"{ max_tokens + token_num } tokens "
628
- f"({ token_num } in the messages, "
629
- f"{ max_tokens } in the completion). "
630
- f"Please reduce the length of the messages or completion." )
624
+ f"{ self .max_model_len } tokens. However, your request has "
625
+ f"{ token_num } input tokens. Please reduce the length of "
626
+ "the input messages." )
627
+
628
+ if max_tokens is not None and \
629
+ token_num + max_tokens > self .max_model_len :
630
+ raise ValueError (
631
+ "'max_tokens' or 'max_completion_tokens' is too large: "
632
+ f"{ max_tokens } . This model's maximum context length is "
633
+ f"{ self .max_model_len } tokens and your request has "
634
+ f"{ token_num } input tokens ({ max_tokens } > { self .max_model_len } "
635
+ f" - { token_num } )." )
631
636
632
637
return TextTokensPrompt (prompt = input_text , prompt_token_ids = input_ids )
633
638
0 commit comments