opendatahub-io
diff --git a/‎tests/tgis/__init__.py b/‎tests/tgis/__init__.py
diff --git a/‎tests/tgis/test_hub.py
Lines changed: 50 additions & 0 deletions b/‎tests/tgis/test_hub.py
Lines changed: 50 additions & 0 deletions
diff --git a/‎vllm/entrypoints/openai/api_server.py
Lines changed: 29 additions & 0 deletions b/‎vllm/entrypoints/openai/api_server.py
Lines changed: 29 additions & 0 deletions
diff --git a/‎vllm/scripts.py
Lines changed: 190 additions & 0 deletions b/‎vllm/scripts.py
Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,50 @@
+from pathlib import Path
+
+import pytest
+from huggingface_hub.utils import LocalEntryNotFoundError
+
+from vllm.tgis_utils.hub import (convert_files, download_weights, weight_files,
+                                 weight_hub_files)
+
+
+def test_convert_files():
+    model_id = "bigscience/bloom-560m"
+    local_pt_files = download_weights(model_id, extension=".bin")
+    local_pt_files = [Path(p) for p in local_pt_files]
+    local_st_files = [
+        p.parent / f"{p.stem.removeprefix('pytorch_')}.safetensors"
+        for p in local_pt_files
+    ]
+    convert_files(local_pt_files, local_st_files, discard_names=[])
+
+    found_st_files = weight_files(model_id)
+
+    assert all([str(p) in found_st_files for p in local_st_files])
+
+
+def test_weight_hub_files():
+    filenames = weight_hub_files("bigscience/bloom-560m")
+    assert filenames == ["model.safetensors"]
+
+
+def test_weight_hub_files_llm():
+    filenames = weight_hub_files("bigscience/bloom")
+    assert filenames == [
+        f"model_{i:05d}-of-00072.safetensors" for i in range(1, 73)
+    ]
+
+
+def test_weight_hub_files_empty():
+    filenames = weight_hub_files("bigscience/bloom", ".errors")
+    assert filenames == []
+
+
+def test_download_weights():
+    files = download_weights("bigscience/bloom-560m")
+    local_files = weight_files("bigscience/bloom-560m")
+    assert files == local_files
+
+
+def test_weight_files_error():
+    with pytest.raises(LocalEntryNotFoundError):
+        weight_files("bert-base-uncased")
@@ -18,6 +18,7 @@
 import vllm.envs as envs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.entrypoints.grpc.grpc_server import start_grpc_server
 from vllm.entrypoints.openai.cli_args import make_arg_parser
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -34,6 +35,7 @@
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from vllm.logger import init_logger
+from vllm.tgis_utils.args import add_tgis_args, postprocess_tgis_args
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser
 from vllm.version import __version__ as VLLM_VERSION
@@ -46,6 +48,7 @@
 openai_serving_chat: OpenAIServingChat
 openai_serving_completion: OpenAIServingCompletion
 openai_serving_embedding: OpenAIServingEmbedding
+async_llm_engine: AsyncLLMEngine
 
 logger = init_logger('vllm.entrypoints.openai.api_server')
 
@@ -65,8 +68,15 @@ async def _force_log():
         _running_tasks.add(task)
         task.add_done_callback(_running_tasks.remove)
 
+    grpc_server = await start_grpc_server(async_llm_engine, args)
+
     yield
 
+    logger.info("Gracefully stopping gRPC server")
+    await grpc_server.stop(30)  #TODO configurable grace
+    await grpc_server.wait_for_termination()
+    logger.info("gRPC server stopped")
+
 
 router = APIRouter()
 
@@ -220,6 +230,16 @@ def run_server(args, llm_engine=None):
     global engine, engine_args
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
+
+    # Enforce pixel values as image input type for vision language models
+    # when serving with API server
+    if engine_args.image_input_type is not None and \
+        engine_args.image_input_type.upper() != "PIXEL_VALUES":
+        raise ValueError(
+            f"Invalid image_input_type: {engine_args.image_input_type}. "
+            "Only --image-input-type 'pixel_values' is supported for serving "
+            "vision language models with the vLLM API server.")
+
     engine = (llm_engine
               if llm_engine is not None else AsyncLLMEngine.from_engine_args(
                   engine_args, usage_context=UsageContext.OPENAI_API_SERVER))
@@ -241,6 +261,7 @@ def run_server(args, llm_engine=None):
     global openai_serving_chat
     global openai_serving_completion
     global openai_serving_embedding
+    global async_llm_engine
 
     openai_serving_chat = OpenAIServingChat(engine, model_config,
                                             served_model_names,
@@ -252,6 +273,11 @@ def run_server(args, llm_engine=None):
         args.prompt_adapters)
     openai_serving_embedding = OpenAIServingEmbedding(engine, model_config,
                                                       served_model_names)
+
+    # 🌶️🌶️🌶️ Sets the engine for the TGIS gRPC server.
+    # Do not delete on merge conflicts!
+    async_llm_engine = engine
+
     app.root_path = args.root_path
 
     logger.info("Available routes are:")
@@ -278,5 +304,8 @@ def run_server(args, llm_engine=None):
     parser = FlexibleArgumentParser(
         description="vLLM OpenAI-Compatible RESTful API server.")
     parser = make_arg_parser(parser)
+    parser = add_tgis_args(parser)
     args = parser.parse_args()
+    args = postprocess_tgis_args(args)
+
     run_server(args)
@@ -3,6 +3,7 @@
 import os
 import signal
 import sys
+from pathlib import Path
 from typing import Optional
 
 from openai import OpenAI
@@ -49,6 +50,19 @@ def interactive_cli(args: argparse.Namespace) -> None:
         chat(args.system_prompt, model_name, openai_client)
 
 
+def tgis_cli(args: argparse.Namespace) -> None:
+    registrer_signal_handlers()
+
+    if args.command == "download-weights":
+        download_weights(args.model_name, args.revision, args.token,
+                         args.extension, args.auto_convert)
+    elif args.command == "convert-to-safetensors":
+        convert_to_safetensors(args.model_name, args.revision)
+    elif args.command == "convert-to-fast-tokenizer":
+        convert_to_fast_tokenizer(args.model_name, args.revision,
+                                  args.output_path)
+
+
 def complete(model_name: str, client: OpenAI) -> None:
     print("Please enter prompt to complete:")
     while True:
@@ -82,6 +96,151 @@ def chat(system_prompt: Optional[str], model_name: str,
         print(output)
 
 
+def download_weights(
+    model_name: str,
+    revision: Optional[str] = None,
+    token: Optional[str] = None,
+    extension: str = ".safetensors",
+    auto_convert: bool = True,
+) -> None:
+    from vllm.tgis_utils import hub
+
+    print(extension)
+    meta_exts = [".json", ".py", ".model", ".md"]
+
+    extensions = extension.split(",")
+
+    if len(extensions) == 1 and extensions[0] not in meta_exts:
+        extensions.extend(meta_exts)
+
+    files = hub.download_weights(model_name,
+                                 extensions,
+                                 revision=revision,
+                                 auth_token=token)
+
+    if auto_convert and ".safetensors" in extensions:
+        if not hub.local_weight_files(hub.get_model_path(model_name, revision),
+                                      ".safetensors"):
+            if ".bin" not in extensions:
+                print(".safetensors weights not found, \
+                    downloading pytorch weights to convert...")
+                hub.download_weights(model_name,
+                                     ".bin",
+                                     revision=revision,
+                                     auth_token=token)
+
+            print(".safetensors weights not found, \
+                    converting from pytorch weights...")
+            convert_to_safetensors(model_name, revision)
+        elif not any(f.endswith(".safetensors") for f in files):
+            print(".safetensors weights not found on hub, \
+                    but were found locally. Remove them first to re-convert")
+    if auto_convert:
+        convert_to_fast_tokenizer(model_name, revision)
+
+
+def convert_to_safetensors(
+    model_name: str,
+    revision: Optional[str] = None,
+):
+    from vllm.tgis_utils import hub
+
+    # Get local pytorch file paths
+    model_path = hub.get_model_path(model_name, revision)
+    local_pt_files = hub.local_weight_files(model_path, ".bin")
+    local_pt_index_files = hub.local_index_files(model_path, ".bin")
+    if len(local_pt_index_files) > 1:
+        print(
+            f"Found more than one .bin.index.json file: {local_pt_index_files}"
+        )
+        return
+    if not local_pt_files:
+        print("No pytorch .bin files found to convert")
+        return
+
+    local_pt_files = [Path(f) for f in local_pt_files]
+    local_pt_index_file = local_pt_index_files[
+        0] if local_pt_index_files else None
+
+    # Safetensors final filenames
+    local_st_files = [
+        p.parent / f"{p.stem.removeprefix('pytorch_')}.safetensors"
+        for p in local_pt_files
+    ]
+
+    if any(os.path.exists(p) for p in local_st_files):
+        print("Existing .safetensors weights found, \
+                remove them first to reconvert")
+        return
+
+    try:
+        import transformers
+
+        config = transformers.AutoConfig.from_pretrained(
+            model_name,
+            revision=revision,
+        )
+        architecture = config.architectures[0]
+
+        class_ = getattr(transformers, architecture)
+
+        # Name for this variable depends on transformers version
+        discard_names = getattr(class_, "_tied_weights_keys", [])
+        discard_names.extend(
+            getattr(class_, "_keys_to_ignore_on_load_missing", []))
+
+    except Exception:
+        discard_names = []
+
+    if local_pt_index_file:
+        local_pt_index_file = Path(local_pt_index_file)
+        st_prefix = local_pt_index_file.stem.removeprefix(
+            "pytorch_").removesuffix(".bin.index")
+        local_st_index_file = (local_pt_index_file.parent /
+                               f"{st_prefix}.safetensors.index.json")
+
+        if os.path.exists(local_st_index_file):
+            print("Existing .safetensors.index.json file found, \
+                    remove it first to reconvert")
+            return
+
+        hub.convert_index_file(local_pt_index_file, local_st_index_file,
+                               local_pt_files, local_st_files)
+
+    # Convert pytorch weights to safetensors
+    hub.convert_files(local_pt_files, local_st_files, discard_names)
+
+
+def convert_to_fast_tokenizer(
+    model_name: str,
+    revision: Optional[str] = None,
+    output_path: Optional[str] = None,
+):
+    from vllm.tgis_utils import hub
+
+    # Check for existing "tokenizer.json"
+    model_path = hub.get_model_path(model_name, revision)
+
+    if os.path.exists(os.path.join(model_path, "tokenizer.json")):
+        print(f"Model {model_name} already has a fast tokenizer")
+        return
+
+    if output_path is not None:
+        if not os.path.isdir(output_path):
+            print(f"Output path {output_path} must exist and be a directory")
+            return
+    else:
+        output_path = model_path
+
+    import transformers
+
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name,
+                                                           revision=revision)
+    tokenizer.save_pretrained(output_path)
+
+    print(f"Saved tokenizer to {output_path}")
+
+
 def _add_query_options(
         parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
     parser.add_argument(
@@ -142,6 +301,37 @@ def main():
               "used for models that support system prompts."))
     chat_parser.set_defaults(dispatch_function=interactive_cli, command="chat")
 
+    download_weights_parser = subparsers.add_parser(
+        "download-weights",
+        help=("Download the weights of a given model"),
+        usage="vllm download-weights <model_name> [options]")
+    download_weights_parser.add_argument("model_name")
+    download_weights_parser.add_argument("--revision")
+    download_weights_parser.add_argument("--token")
+    download_weights_parser.add_argument("--extension", default=".safetensors")
+    download_weights_parser.add_argument("--auto_convert", default=True)
+    download_weights_parser.set_defaults(dispatch_function=tgis_cli,
+                                         command="download-weights")
+
+    convert_to_safetensors_parser = subparsers.add_parser(
+        "convert-to-safetensors",
+        help=("Convert model weights to safetensors"),
+        usage="vllm convert-to-safetensors <model_name> [options]")
+    convert_to_safetensors_parser.add_argument("model_name")
+    convert_to_safetensors_parser.add_argument("--revision")
+    convert_to_safetensors_parser.set_defaults(
+        dispatch_function=tgis_cli, command="convert-to-safetensors")
+
+    convert_to_fast_tokenizer_parser = subparsers.add_parser(
+        "convert-to-fast-tokenizer",
+        help=("Convert to fast tokenizer"),
+        usage="vllm convert-to-fast-tokenizer <model_name> [options]")
+    convert_to_fast_tokenizer_parser.add_argument("model_name")
+    convert_to_fast_tokenizer_parser.add_argument("--revision")
+    convert_to_fast_tokenizer_parser.add_argument("--output_path")
+    convert_to_fast_tokenizer_parser.set_defaults(
+        dispatch_function=tgis_cli, command="convert-to-fast-tokenizer")
+
     args = parser.parse_args()
     # One of the sub commands should be executed.
     if hasattr(args, "dispatch_function"):