Skip to content

Commit e7a1dea

Browse files
authored
Make sentencepiece and tiktoken optional deps (#337)
Sentencepiece has not had a new release in over a year, and users are running into easily fixable install issues like CMake minimum versions, missing headers, and missing wheels for newer python versions. See recent issues: https://github.com/google/sentencepiece/issues, like this one for example: google/sentencepiece#1113 Tiktoken hasn't caused any particular issues, but I'm including it because it's used in the same capacity here. Both dependencies are only actually used if users are using those tokenizer types, so it seems pointless to depend on them directly. This updates the support to silently skip over them if they are not installed.
1 parent b87ed7f commit e7a1dea

File tree

2 files changed

+15
-4
lines changed

2 files changed

+15
-4
lines changed

pyproject.toml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@ keywords = ["machine learning", "inference"]
1515
requires-python = ">=3.8, <4"
1616
dependencies = [
1717
"pydantic",
18-
"sentencepiece",
19-
"tiktoken",
2018
"torch>=1.10.0",
2119
"transformers>=4.38.0",
2220
"triton; platform_system == 'Linux' and platform_machine == 'x86_64'",

python/xgrammar/tokenizer_info.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,15 @@
44
from enum import Enum
55
from typing import Any, Dict, List, Optional, Union
66

7-
import sentencepiece
8-
import tiktoken
7+
try:
8+
import sentencepiece
9+
except ImportError:
10+
sentencepiece = None
11+
try:
12+
import tiktoken
13+
except ImportError:
14+
tiktoken = None
15+
916
from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
1017

1118
from .base import XGRObject, _core
@@ -95,6 +102,9 @@ def __init__(
95102

96103
@staticmethod
97104
def _is_tiktoken_tokenizer(tokenizer: PreTrainedTokenizerBase) -> bool:
105+
if tiktoken is None:
106+
return False
107+
98108
# helper to check if tokenizer is a tiktoken tokenizer
99109
has_tiktoken_encoding = hasattr(tokenizer, "tokenizer") and isinstance(
100110
tokenizer.tokenizer, tiktoken.Encoding
@@ -110,6 +120,9 @@ def _is_tiktoken_tokenizer(tokenizer: PreTrainedTokenizerBase) -> bool:
110120

111121
@staticmethod
112122
def _is_sentencepiece_tokenizer(tokenizer: PreTrainedTokenizerBase) -> bool:
123+
if sentencepiece is None:
124+
return False
125+
113126
# helper to check if tokenizer is a sentence piece tokenizer
114127
has_sp_model_attr = hasattr(tokenizer, "sp_model") and isinstance(
115128
tokenizer.sp_model, sentencepiece.SentencePieceProcessor

0 commit comments

Comments
 (0)