Skip to content

Commit e7a1dea

Browse files
authored
Make sentencepiece and tiktoken optional deps (#337)
Sentencepiece has not had a new release in over a year, and users are running into easily fixable install issues like CMake minimum versions, missing headers, and missing wheels for newer python versions. See recent issues: https://github.com/google/sentencepiece/issues, like this one for example: google/sentencepiece#1113 Tiktoken hasn't caused any particular issues, but I'm including it because it's used in the same capacity here. Both dependencies are only actually used if users are using those tokenizer types, so it seems pointless to depend on them directly. This updates the support to silently skip over them if they are not installed.
1 parent b87ed7f commit e7a1dea

File tree

2 files changed

+15
-4
lines changed

2 files changed

+15
-4
lines changed

pyproject.toml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@ keywords = ["machine learning", "inference"]
1515
requires-python = ">=3.8, <4"
1616
dependencies = [
1717
"pydantic",
18-
"sentencepiece",
19-
"tiktoken",
2018
"torch>=1.10.0",
2119
"transformers>=4.38.0",
2220
"triton; platform_system == 'Linux' and platform_machine == 'x86_64'",

python/xgrammar/tokenizer_info.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,15 @@
44
from enum import Enum
55
from typing import Any, Dict, List, Optional, Union
66

7-
import sentencepiece
8-
import tiktoken
7+
try:
8+
import sentencepiece
9+
except ImportError:
10+
sentencepiece = None
11+
try:
12+
import tiktoken
13+
except ImportError:
14+
tiktoken = None
15+
916
from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
1017

1118
from .base import XGRObject, _core
@@ -95,6 +102,9 @@ def __init__(
95102

96103
@staticmethod
97104
def _is_tiktoken_tokenizer(tokenizer: PreTrainedTokenizerBase) -> bool:
105+
if tiktoken is None:
106+
return False
107+
98108
# helper to check if tokenizer is a tiktoken tokenizer
99109
has_tiktoken_encoding = hasattr(tokenizer, "tokenizer") and isinstance(
100110
tokenizer.tokenizer, tiktoken.Encoding
@@ -110,6 +120,9 @@ def _is_tiktoken_tokenizer(tokenizer: PreTrainedTokenizerBase) -> bool:
110120

111121
@staticmethod
112122
def _is_sentencepiece_tokenizer(tokenizer: PreTrainedTokenizerBase) -> bool:
123+
if sentencepiece is None:
124+
return False
125+
113126
# helper to check if tokenizer is a sentence piece tokenizer
114127
has_sp_model_attr = hasattr(tokenizer, "sp_model") and isinstance(
115128
tokenizer.sp_model, sentencepiece.SentencePieceProcessor

0 commit comments

Comments
 (0)