56 changes: 56 additions & 0 deletions python/sglang/srt/entrypoints/http_server.py
@@ -53,19 +53,25 @@
from sglang.srt.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    CompletionRequest,
    DetokenizeRequest,
    EmbeddingRequest,
    ErrorResponse,
    ModelCard,
    ModelList,
    ResponsesRequest,
    ScoringRequest,
    TokenizeRequest,
    V1RerankReqInput,
)
from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompletion
from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
from sglang.srt.entrypoints.openai.serving_rerank import OpenAIServingRerank
from sglang.srt.entrypoints.openai.serving_score import OpenAIServingScore
from sglang.srt.entrypoints.openai.serving_tokenize import (
    OpenAIServingDetokenize,
    OpenAIServingTokenize,
)
from sglang.srt.function_call.function_call_parser import FunctionCallParser
from sglang.srt.managers.io_struct import (
    AbortReq,
@@ -148,6 +154,12 @@ async def lifespan(fast_api_app: FastAPI):
    fast_api_app.state.openai_serving_rerank = OpenAIServingRerank(
        _global_state.tokenizer_manager
    )
    fast_api_app.state.openai_serving_tokenize = OpenAIServingTokenize(
        _global_state.tokenizer_manager
    )
    fast_api_app.state.openai_serving_detokenize = OpenAIServingDetokenize(
        _global_state.tokenizer_manager
    )

    server_args: ServerArgs = fast_api_app.server_args

@@ -896,6 +908,50 @@ async def openai_v1_embeddings(request: EmbeddingRequest, raw_request: Request):
    )


@app.post(
    "/v1/tokenize",
    response_class=ORJSONResponse,
    dependencies=[Depends(validate_json_request)],
)
async def openai_v1_tokenize(request: TokenizeRequest, raw_request: Request):
    """OpenAI-compatible tokenization endpoint."""
    return await raw_request.app.state.openai_serving_tokenize.handle_request(
        request, raw_request
    )


@app.post(
    "/v1/detokenize",
    response_class=ORJSONResponse,
    dependencies=[Depends(validate_json_request)],
)
async def openai_v1_detokenize(request: DetokenizeRequest, raw_request: Request):
    """OpenAI-compatible detokenization endpoint."""
    return await raw_request.app.state.openai_serving_detokenize.handle_request(
        request, raw_request
    )


@app.post(
    "/tokenize",
    response_class=ORJSONResponse,
    dependencies=[Depends(validate_json_request)],
    include_in_schema=False,
)
async def openai_tokenize(request: TokenizeRequest, raw_request: Request):
    return await openai_v1_tokenize(request, raw_request)


@app.post(
    "/detokenize",
    response_class=ORJSONResponse,
    dependencies=[Depends(validate_json_request)],
    include_in_schema=False,
)
async def openai_detokenize(request: DetokenizeRequest, raw_request: Request):
    return await openai_v1_detokenize(request, raw_request)


@app.get("/v1/models", response_class=ORJSONResponse)
async def available_models():
"""Show available models. OpenAI-compatible endpoint."""
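With the routes above registered, a client can hit /v1/tokenize and /v1/detokenize directly. Below is a minimal sketch of a round trip; the base URL, port, and model name are placeholders for whatever your SGLang server is actually serving, and the token ids returned will depend on that model's tokenizer.

```python
import requests

# Placeholder address and model name -- substitute your own deployment's values.
BASE_URL = "http://localhost:30000"
MODEL = "your-served-model-name"

# Tokenize a single prompt (add_special_tokens defaults to True).
tok = requests.post(
    f"{BASE_URL}/v1/tokenize",
    json={"model": MODEL, "prompt": "Hello, world!"},
).json()
print(tok["tokens"], tok["count"], tok["max_model_len"])

# Feed the ids back through /v1/detokenize to recover the text.
detok = requests.post(
    f"{BASE_URL}/v1/detokenize",
    json={"model": MODEL, "tokens": tok["tokens"], "skip_special_tokens": True},
).json()
print(detok["text"])
```

The unversioned /tokenize and /detokenize aliases accept the same payloads; they simply forward to the /v1 handlers and are excluded from the OpenAPI schema.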
38 changes: 38 additions & 0 deletions python/sglang/srt/entrypoints/openai/protocol.py
@@ -629,12 +629,50 @@ class RerankResponse(BaseModel):
    meta_info: Optional[dict] = None


class TokenizeRequest(BaseModel):
    """Request schema for the /tokenize endpoint."""

    model: str
    prompt: Union[str, List[str]]

JustinTong0323 (Collaborator), Aug 31, 2025: Shall we keep the batched option? cc @slin1237 @CatherineSue

Collaborator reply: I think we can keep it as this is not an official OpenAI endpoint, and it directly uses the tokenizer, so no performance or compatibility concerns.

    add_special_tokens: bool = Field(
        default=True,
        description="whether to add model-specific special tokens (e.g. BOS/EOS) during encoding.",
    )


class TokenizeResponse(BaseModel):
    """Response schema for the /tokenize endpoint."""

    tokens: Union[List[int], List[List[int]]]
    count: Union[int, List[int]]
    max_model_len: int


class DetokenizeRequest(BaseModel):
    """Request schema for the /detokenize endpoint."""

    model: str
    tokens: Union[List[int], List[List[int]]]
    skip_special_tokens: bool = Field(
        default=True,
        description="whether to exclude special tokens (e.g. padding or EOS) during decoding.",
    )


class DetokenizeResponse(BaseModel):
    """Response schema for the /detokenize endpoint."""

    text: Union[str, List[str]]


OpenAIServingRequest = Union[
    ChatCompletionRequest,
    CompletionRequest,
    EmbeddingRequest,
    ScoringRequest,
    V1RerankReqInput,
    TokenizeRequest,
    DetokenizeRequest,
]


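To make the batched option discussed in the review thread above concrete, here is a sketch of the shapes these schemas allow. The token ids and model name are purely illustrative, not output from a real tokenizer.

```python
from sglang.srt.entrypoints.openai.protocol import (
    DetokenizeRequest,
    TokenizeRequest,
    TokenizeResponse,
)

# Batched tokenization: a list of prompts yields a list of token lists
# and a per-prompt count.
batch_req = TokenizeRequest(
    model="placeholder-model",
    prompt=["first prompt", "second prompt"],
    add_special_tokens=False,
)

# Illustrative response shape only -- the ids are made up.
batch_resp = TokenizeResponse(
    tokens=[[101, 2034, 102], [101, 2117, 102]],
    count=[3, 3],
    max_model_len=4096,
)

# The same nesting applies in the other direction.
batch_detok = DetokenizeRequest(
    model="placeholder-model",
    tokens=batch_resp.tokens,
    skip_special_tokens=True,
)
```

The single-prompt form is the same, with `prompt` as a `str`, `tokens` as a flat `List[int]`, and `count` as a single `int`.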
144 changes: 144 additions & 0 deletions python/sglang/srt/entrypoints/openai/serving_tokenize.py
@@ -0,0 +1,144 @@
import logging
from http import HTTPStatus
from typing import List, Union

from fastapi import Request

from sglang.srt.entrypoints.openai.protocol import (
    DetokenizeRequest,
    DetokenizeResponse,
    ErrorResponse,
    TokenizeRequest,
    TokenizeResponse,
)
from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase

logger = logging.getLogger(__name__)


class OpenAIServingTokenize(OpenAIServingBase):
    """Handler for /v1/tokenize requests"""

    def _request_id_prefix(self) -> str:
        return "tok-"

    def _convert_to_internal_request(
        self, request: TokenizeRequest
    ) -> tuple[TokenizeRequest, TokenizeRequest]:
        return request, request

    async def _handle_non_streaming_request(
        self,
        adapted_request: TokenizeRequest,
        request: TokenizeRequest,
        raw_request: Request,
    ) -> Union[TokenizeResponse, ErrorResponse]:
        try:
            tokenizer = self.tokenizer_manager.tokenizer
            max_model_len = getattr(tokenizer, "model_max_length", -1)

            if isinstance(request.prompt, str):
                token_ids = tokenizer.encode(
                    request.prompt,
                    add_special_tokens=request.add_special_tokens,
                )
                tokens = token_ids
                count = len(token_ids)
            elif isinstance(request.prompt, list):
                token_ids_list = [
                    tokenizer.encode(
                        text, add_special_tokens=request.add_special_tokens
                    )
                    for text in request.prompt
                ]
                tokens = token_ids_list
                count = [len(ids) for ids in token_ids_list]
            else:
                return self.create_error_response(
                    f"Invalid prompt type: {type(request.prompt)}. Expected str or List[str]."
                )

            return TokenizeResponse(
                tokens=tokens, count=count, max_model_len=max_model_len
            )
        except Exception as e:
            logger.error("Error during tokenization", exc_info=True)
            return self.create_error_response(
                f"Internal server error during tokenization: {e}",
                err_type="InternalServerError",
                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            )


class OpenAIServingDetokenize(OpenAIServingBase):
    """Handler for /v1/detokenize requests"""

    def _request_id_prefix(self) -> str:
        return "detok-"

    def _convert_to_internal_request(
        self, request: DetokenizeRequest
    ) -> tuple[DetokenizeRequest, DetokenizeRequest]:
        return request, request

    async def _handle_non_streaming_request(
        self,
        adapted_request: DetokenizeRequest,
        request: DetokenizeRequest,
        raw_request: Request,
    ) -> Union[DetokenizeResponse, ErrorResponse]:
        try:
            tokenizer = self.tokenizer_manager.tokenizer

            if (
                isinstance(request.tokens, list)
                and request.tokens
                and isinstance(request.tokens[0], int)
            ):
                if not all(isinstance(t, int) for t in request.tokens):

Collaborator: nit: Should this be better moved to _validate_request?

                    return self.create_error_response(
                        "Invalid input: 'tokens' must be a list of integers."
                    )
                tokens_to_decode = [int(t) for t in request.tokens]
Collaborator: nit: Why do we need int(t) here? I assume the above if check already makes sure tokens are int?

                text = tokenizer.decode(
                    tokens_to_decode, skip_special_tokens=request.skip_special_tokens
                )
                text_out: Union[str, List[str]] = text
            elif (
                isinstance(request.tokens, list)
                and request.tokens
                and isinstance(request.tokens[0], list)
            ):
                texts: List[str] = []
                for token_list in request.tokens:
                    if not all(isinstance(t, int) for t in token_list):
                        return self.create_error_response(
                            f"Invalid input: Sublist in 'tokens' must contain only integers. Found: {token_list}"
                        )
                    decoded_text = tokenizer.decode(
                        [int(t) for t in token_list],
                        skip_special_tokens=request.skip_special_tokens,
                    )
                    texts.append(decoded_text)
                text_out = texts
            elif isinstance(request.tokens, list) and not request.tokens:
                text_out = ""
            else:
                return self.create_error_response(
                    f"Invalid tokens type: {type(request.tokens)}. Expected List[int] or List[List[int]]."
                )

            return DetokenizeResponse(text=text_out)
        except Exception as e:
            logger.error("Error during detokenization", exc_info=True)
            if "decode" in str(e).lower():
                return self.create_error_response(
                    f"Error decoding tokens: {e}. Input tokens might be invalid for the model.",
                    err_type="DecodeError",
                    status_code=HTTPStatus.BAD_REQUEST,
                )
            return self.create_error_response(
                f"Internal server error during detokenization: {e}",
                err_type="InternalServerError",
                status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
            )
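On the reviewer's suggestion that the integer check belongs in `_validate_request`: a minimal sketch of what that could look like, assuming `OpenAIServingBase` exposes a `_validate_request` hook that returns an error message string (or None) before the request is dispatched, as the comment implies. The class name below is hypothetical and this is an illustration of the suggestion, not code from the PR.

```python
from typing import Optional

from sglang.srt.entrypoints.openai.protocol import DetokenizeRequest
from sglang.srt.entrypoints.openai.serving_base import OpenAIServingBase


class OpenAIServingDetokenizeSketch(OpenAIServingBase):
    # _request_id_prefix, _convert_to_internal_request, and the handler
    # would stay as in the diff above; only the validation moves here.

    def _validate_request(self, request: DetokenizeRequest) -> Optional[str]:
        """Reject malformed token payloads before they reach the tokenizer."""
        tokens = request.tokens
        if isinstance(tokens, list) and tokens and isinstance(tokens[0], list):
            # Batched form: every sublist must contain only integers.
            for token_list in tokens:
                if not all(isinstance(t, int) for t in token_list):
                    return "Invalid input: sublists in 'tokens' must contain only integers."
        elif isinstance(tokens, list):
            # Flat form: every element must be an integer.
            if not all(isinstance(t, int) for t in tokens):
                return "Invalid input: 'tokens' must be a list of integers."
        return None
```

Note that since `tokens` is declared as `Union[List[int], List[List[int]]]`, Pydantic already rejects or coerces non-integer entries at parse time, which is also likely why the explicit `int(t)` casts the second comment asks about are redundant.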