
Commit 2ade48f

Models/gemma 3 (#47)

* wip: revamp model registration
* fix gemma3 for causal LM
* gemma3
* update dockerfile

1 parent 4349939 · commit 2ade48f


67 files changed: +6354 −2398 lines changed

docker/Dockerfile.x86_64-cuda

Lines changed: 1 addition & 3 deletions
@@ -16,9 +16,7 @@ WORKDIR /scratchpad
 
 COPY . /scratchpad
 
-RUN git clone -b v0.1.6 https://github.com/flashinfer-ai/flashinfer.git --recursive && \
-    cd flashinfer/python && \
-    pip install --no-build-isolation --verbose --editable .
+RUN pip install flashinfer-python==0.2.3 -i https://flashinfer.ai/whl/cu124/torch2.5/
 
 RUN git clone https://github.com/eth-easl/triteia.git && \
     cd triteia && \
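The image now installs a prebuilt flashinfer wheel instead of building v0.1.6 from source. A minimal sanity-check sketch, assuming it is run inside the built image; the distribution name matches the pin above:

# Hypothetical post-build check that the pinned wheel is what got installed.
from importlib.metadata import version

assert version("flashinfer-python") == "0.2.3"
print("flashinfer-python", version("flashinfer-python"))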

meta/requirements.txt

Lines changed: 6 additions & 2 deletions
@@ -6,7 +6,7 @@ requests
 uvicorn
 zmq
 huggingface_hub
-transformers==4.46.3
+transformers
 outlines==0.0.46
 uvloop
 nvidia-ml-py
@@ -23,4 +23,8 @@ orjson
 xgrammar>=0.1.13
 nvidia-cuda-nvrtc-cu12
 cuda-python
-setproctitle
+setproctitle
+torch-memory-saver
+sgl-kernel
+decord
+soundfile

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ max-line-length = 120
 
 [project.scripts]
 sp = "scratchpad.cli.sp:app"
-
+spc = "scratchpad.cli.spc:app"
 [tool.setuptools]
 packages = ["scratchpad"]
 

scratchpad/cli/handlers/chat.py

Lines changed: 2 additions & 1 deletion
@@ -57,7 +57,7 @@ def __init__(
         self.cache_prompt = cache_prompt
         self.headers = {"Content-Type": "application/json"}
         self.chat_history = []
-        self.model_name = ""
+        self.model_name = model_name
         self.console = Console()
         self.client = openai.OpenAI(api_key="test", base_url=self.serveraddr + "/v1")
         # TODO: Gracefully handle user input history file.
@@ -77,6 +77,7 @@ def chat_generator(self, prompt):
             "seed": self.seed,
             "model": self.model_name,
         }
+        print(f"Payload: {payload}")
         try:
             response = self.client.chat.completions.create(**payload)
             for chunk in response:
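With this fix the model name passed to ChatHandler reaches the OpenAI-compatible request instead of an empty string, and the new print statement exposes the outgoing payload. A minimal sketch of the payload after the change; the model id and seed below are placeholders, not values from the repo:

# Hypothetical payload mirroring chat_generator(): "model" now carries the
# name given at construction time instead of "".
payload = {
    "messages": [{"role": "user", "content": "hello"}],
    "seed": 42,
    "model": "google/gemma-3-4b-it",  # placeholder model id
}
print(f"Payload: {payload}")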

scratchpad/cli/sp.py

Lines changed: 3 additions & 2 deletions
@@ -1,5 +1,5 @@
 import typer
-from scratchpad.server import dataclass_to_cli, ServerArgs, launch_server
+from scratchpad.server import dataclass_to_cli, ServerArgs
 from .handlers import ChatHandler, benchmark_quality
 
 app = typer.Typer()
@@ -13,6 +13,7 @@ def serve(
 ):
     """Spin up the server"""
     from scratchpad.server.args import global_args
+    from scratchpad.server import launch_server
     import multiprocessing as mp
 
     mp.set_start_method("spawn", force=True)
@@ -30,7 +31,7 @@ def version():
 @app.command()
 def chat(
     model: str,
-    backend: str = "http://localhost:8080",
+    backend: str = "http://localhost:3000",
 ):
     chat_handler = ChatHandler(server_addr=backend, model_name=model)
     chat_handler.chat()
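Moving the launch_server import into the serve body defers the server's import cost until it is actually needed, which keeps lightweight commands such as version and chat fast to start (a plausible reading of the change, not stated in the commit). A self-contained sketch of the pattern with a stand-in module:

import typer

app = typer.Typer()


@app.command()
def serve():
    # Deferred import: the heavy dependency is loaded only when `serve` runs.
    import json as launch_server_standin  # stand-in for scratchpad.server.launch_server

    typer.echo(f"would launch via {launch_server_standin.__name__}")


@app.command()
def version():
    typer.echo("0.1.0")


if __name__ == "__main__":
    app()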

scratchpad/cli/spc.py

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+import typer
+
+app = typer.Typer()
+
+
+@app.command()
+def chat(
+    model: str,
+    backend: str = "http://localhost:3000",
+):
+    from .handlers import ChatHandler
+
+    print(f"Chatting with model: {model}, backend: {backend}")
+    chat_handler = ChatHandler(server_addr=backend, model_name=model)
+    chat_handler.chat()
+
+
+@app.command()
+def version():
+    """Print the version"""
+    typer.echo("0.1.0")
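Together with the new spc entry under [project.scripts] in pyproject.toml, this file becomes the spc console script. A sketch of exercising a Typer app like this one without installing the script, using typer's test runner; the app below mirrors spc.py but stubs out ChatHandler:

import typer
from typer.testing import CliRunner

app = typer.Typer()


@app.command()
def chat(model: str, backend: str = "http://localhost:3000"):
    # Stub: the real command constructs ChatHandler and starts a session.
    print(f"Chatting with model: {model}, backend: {backend}")


@app.command()
def version():
    """Print the version"""
    typer.echo("0.1.0")


runner = CliRunner()
assert runner.invoke(app, ["version"]).output.strip() == "0.1.0"
assert "backend: http://localhost:3000" in runner.invoke(app, ["chat", "my-model"]).output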

scratchpad/config/model_config.py

Lines changed: 14 additions & 23 deletions
@@ -1,6 +1,6 @@
 from enum import IntEnum, auto
 from typing import Optional
-from typing import List
+from typing import List, Set
 from transformers import PretrainedConfig
 
 from scratchpad.utils import get_config, get_context_length
@@ -70,6 +70,9 @@ def __init__(
             self.hf_config.architectures, is_embedding
         )
         self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
+        self.is_multimodal_gen = False
+        self.is_image_gen = False
+        self.is_audio_model = False
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
         if context_length is not None:
             self.context_len = context_length
@@ -82,38 +85,26 @@
             "head_dim",
             self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads,
         )
-
-        # FIXME: temporary special judge for deepseek v2 MLA architecture
-        if "DeepseekV2ForCausalLM" in self.hf_config.architectures:
-            self.head_dim = 256
-            self.attention_arch = AttentionArch.MLA
-            self.kv_lora_rank = self.hf_config.kv_lora_rank
-            self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
-        elif "MiniCPM3ForCausalLM" in self.hf_config.architectures:
-            self.head_dim = 128
-            self.attention_arch = AttentionArch.MLA
-            self.kv_lora_rank = self.hf_config.kv_lora_rank
-            self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
-        else:
-            self.attention_arch = AttentionArch.MHA
-
+        self.attention_arch = AttentionArch.MHA
         self.num_attention_heads = self.hf_text_config.num_attention_heads
         self.num_key_value_heads = getattr(
             self.hf_text_config, "num_key_value_heads", None
         )
-
-        # for Dbrx and MPT models
-        if self.hf_config.model_type in ["dbrx", "mpt"]:
-            self.num_key_value_heads = getattr(
-                self.hf_config.attn_config, "kv_n_heads", None
-            )
-
         if self.num_key_value_heads is None:
             self.num_key_value_heads = self.num_attention_heads
         self.hidden_size = self.hf_text_config.hidden_size
         self.num_hidden_layers = self.hf_text_config.num_hidden_layers
         self.vocab_size = self.hf_text_config.vocab_size
         self.is_encoder_decoder = self.hf_config.model_type in ["mllama"]
+        self.hf_eos_token_id = self.get_hf_eos_token_id()
+        self.image_token_id = getattr(self.hf_config, "image_token_id", None)
+
+    def get_hf_eos_token_id(self) -> Optional[Set[int]]:
+        eos_ids = getattr(self.hf_config, "eos_token_id", None)
+        if eos_ids:
+            # it can be either int or list of int
+            eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
+        return eos_ids
 
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
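The new get_hf_eos_token_id() helper normalizes the HF config's eos_token_id, which some models store as a single int and others as a list of ints. A standalone sketch of the same normalization, detached from ModelConfig (normalize_eos_token_id is a hypothetical wrapper, not a name from the repo):

from typing import List, Optional, Set, Union


def normalize_eos_token_id(eos_ids: Union[int, List[int], None]) -> Optional[Set[int]]:
    # An int becomes a one-element set, a list becomes a set, missing stays None.
    if eos_ids:
        eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
    return eos_ids


assert normalize_eos_token_id(2) == {2}
assert normalize_eos_token_id([1, 106]) == {1, 106}
assert normalize_eos_token_id(None) is None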

scratchpad/config/utils.py

Lines changed: 31 additions & 2 deletions
@@ -26,9 +26,13 @@ def _get_and_verify_dtype(
     dtype = dtype.lower()
     if dtype == "auto":
         if config_dtype == torch.float32:
-            if config.model_type == "gemma2":
+            if config.model_type.startswith("gemma"):
+                if config.model_type == "gemma":
+                    gemma_version = ""
+                else:
+                    gemma_version = config.model_type[5]
                 logger.info(
-                    "For Gemma 2, we downcast float32 to bfloat16 instead "
+                    f"For Gemma {gemma_version}, we downcast float32 to bfloat16 instead "
                     "of float16 by default. Please specify `dtype` if you "
                     "want to use float16."
                 )
@@ -65,6 +69,13 @@ def _get_and_verify_dtype(
     return torch_dtype
 
 
+def get_min_sliding_window(sliding_window: Union[int, list[Optional[int]]]) -> int:
+    if isinstance(sliding_window, list):
+        return min(s for s in sliding_window if s is not None)
+
+    return sliding_window
+
+
 def _get_and_verify_max_len(
     hf_config: PretrainedConfig,
     max_model_len: Optional[int],
@@ -216,3 +227,21 @@ def get_served_model_name(
     if isinstance(served_model_name, list):
         return served_model_name[0]
     return served_model_name
+
+
+multimodal_model_archs = [
+    "Gemma3ForConditionalGeneration",
+    "MllamaForConditionalGeneration",
+    "Qwen2VLForConditionalGeneration",
+    "Qwen2_5_VLForConditionalGeneration",
+]
+
+
+def is_multimodal_model(model_architectures: List[str]):
+    if any(
+        multi_model_arch in model_architectures
+        for multi_model_arch in multimodal_model_archs
+    ):
+        return True
+    else:
+        return False
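Two behaviors introduced above are easy to check in isolation: the Gemma version digit is read from model_type by position (index 5 of "gemma2"/"gemma3", empty for plain "gemma"), and is_multimodal_model() is a plain membership test over architecture names. A self-contained sketch (gemma_version is a hypothetical wrapper around the inline logic in the diff):

from typing import List

# Same architecture allowlist as the diff above.
multimodal_model_archs = [
    "Gemma3ForConditionalGeneration",
    "MllamaForConditionalGeneration",
    "Qwen2VLForConditionalGeneration",
    "Qwen2_5_VLForConditionalGeneration",
]


def gemma_version(model_type: str) -> str:
    # "gemma" has no version suffix; "gemma2"/"gemma3" carry the digit at index 5.
    return "" if model_type == "gemma" else model_type[5]


def is_multimodal_model(model_architectures: List[str]) -> bool:
    return any(arch in model_architectures for arch in multimodal_model_archs)


assert gemma_version("gemma") == ""
assert gemma_version("gemma3") == "3"
assert is_multimodal_model(["Gemma3ForConditionalGeneration"])
assert not is_multimodal_model(["LlamaForCausalLM"])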

scratchpad/config/vllm_model_config.py

Lines changed: 9 additions & 19 deletions
@@ -1,17 +1,14 @@
 import enum
-from dataclasses import dataclass, field, fields
-import contextlib
 from typing import (
-    TYPE_CHECKING,
     Any,
     Dict,
     List,
     Mapping,
     Optional,
     Union,
 )
-from pathlib import Path
 import torch
+from pathlib import Path
 from transformers import PretrainedConfig, AutoConfig
 from scratchpad.utils import (
     logger,
@@ -20,12 +17,17 @@
     get_hf_image_processor_config,
 )
 from scratchpad.config.modality_config import MultiModalConfig
-from scratchpad.nn.models import ModelRegistry
-from .utils import _get_and_verify_dtype, _get_and_verify_max_len, get_served_model_name
 import huggingface_hub
 from huggingface_hub import file_exists, try_to_load_from_cache
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
+from .utils import (
+    _get_and_verify_dtype,
+    _get_and_verify_max_len,
+    get_served_model_name,
+    is_multimodal_model,
+)
+
 
 class ConfigFormat(str, enum.Enum):
     AUTO = "auto"
@@ -307,15 +309,14 @@ def __init__(
         self._verify_tokenizer_mode()
 
         self.override_neuron_config = None
-        self._verify_embedding_mode()
         self._verify_cuda_graph()
         self._verify_bnb_config()
 
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]
     ) -> Optional["MultiModalConfig"]:
         architectures = getattr(self.hf_config, "architectures", [])
-        if any(ModelRegistry.is_multimodal_model(arch) for arch in architectures):
+        if is_multimodal_model(architectures):
             return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
         else:
             if limit_mm_per_prompt:
@@ -333,12 +334,6 @@ def _verify_tokenizer_mode(self) -> None:
             )
         self.tokenizer_mode = tokenizer_mode
 
-    def _verify_embedding_mode(self) -> None:
-        architectures = getattr(self.hf_config, "architectures", [])
-        self.embedding_mode = any(
-            ModelRegistry.is_embedding_model(arch) for arch in architectures
-        )
-
     def _parse_quant_hf_config(self):
         quant_cfg = getattr(self.hf_config, "quantization_config", None)
         if quant_cfg is None:
@@ -526,11 +521,6 @@ def is_encoder_decoder_model(self) -> bool:
             )
         )
 
-    @property
-    def is_embedding_model(self) -> bool:
-        """Extract the embedding model flag."""
-        return self.embedding_mode
-
     @property
     def is_multimodal_model(self) -> bool:
         return self.multimodal_config is not None

scratchpad/constrained/xgrammar_backend.py

Lines changed: 4 additions & 7 deletions
@@ -1,6 +1,4 @@
-import logging
 from typing import List, Tuple
-
 import torch
 from xgrammar import (
     CompiledGrammar,
@@ -13,8 +11,7 @@
 )
 
 from .base_backend import BaseGrammarObject, BaseGrammarBackend
-
-logger = logging.getLogger(__name__)
+from scratchpad.utils import logger
 
 
 MAX_ROLLBACK_TOKENS = 200
@@ -104,23 +101,23 @@ def init_value_impl(self, key: Tuple[str, str]) -> XGrammarGrammar:
                 else:
                     ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
             except RuntimeError as e:
-                logging.warning(
+                logger.warning(
                     f"Skip invalid json_schema: json_schema={key_string}, {e=}"
                 )
                 return None
         elif key_type == "ebnf":
            try:
                ctx = self.grammar_compiler.compile_grammar(key_string)
            except RuntimeError as e:
-                logging.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
+                logger.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
                return None
        elif key_type == "regex":
            try:
                ctx = self.grammar_compiler.compile_grammar(
                    Grammar.from_regex(key_string)
                )
            except RuntimeError as e:
-                logging.warning(f"Skip invalid regex: regex={key_string}, {e=}")
+                logger.warning(f"Skip invalid regex: regex={key_string}, {e=}")
                return None
        else:
            raise ValueError(f"Invalid key_type: {key_type}")
