Commit 604d004

Ahajha and Ubospica authored
Bind fill_next_token_bitmask against nb::ndarray (#338)
Nanobind has built-in support for the DLPack standard, which lets us accept anything adhering to the DLPack spec here rather than just PyTorch objects. See https://nanobind.readthedocs.io/en/latest/ndarray.html

This does add a dependency on numpy, but only for a single type, which does seem a little overkill. If I push on #233 a little, though, these definitions should only be needed in the stub files and become unnecessary in the `.py` files, so ideally this is a temporary dependency.

My goal is to reduce the dependency on PyTorch a bit by making the code more general. I don't know if (or even think that) we can remove it entirely, but this seems worthwhile to do.

The annotation on the `nb::ndarray` class will actually check at runtime that the parameter has the desired properties (in this case, that it's on the CPU and holds int32_t values), and will simply fail to call the function if not. This might be a slight breaking change in terms of which exception actually gets raised, but I think this is reasonable.

A few related changes:

- Converted a few function signatures to take DLTensors by `const&` instead of `*`; I didn't see a good reason for the pointer.
- Added an anonymous namespace in `nanobind.cc`, just to keep things hygienic.

---------

Signed-off-by: Ubospica <[email protected]>
Co-authored-by: Ubospica <[email protected]>
Co-authored-by: Yixin Dong <[email protected]>
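To illustrate the intended effect, here is a sketch (not part of the commit itself; it assumes an already-constructed xgr.GrammarMatcher `matcher` and xgr.TokenizerInfo `tokenizer_info`, built as in the test added below):

import math

import numpy as np

# Bitmask layout used throughout this PR: one int32 word covers 32 tokens.
n_words = math.ceil(tokenizer_info.vocab_size / 32)

# NumPy arrays implement the DLPack protocol, so a plain CPU int32 array
# should now be accepted directly, where previously only torch.Tensor worked.
np_bitmask = np.zeros((1, n_words), dtype=np.int32)
matcher.fill_next_token_bitmask(np_bitmask)

# An array with the wrong device, dtype, or ndim fails the runtime checks in
# the binding and raises RuntimeError.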
1 parent d8906a2 commit 604d004

6 files changed: +78 -38 lines changed

cpp/nanobind/nanobind.cc

Lines changed: 40 additions & 1 deletion
@@ -4,6 +4,7 @@
  */

 #include <nanobind/nanobind.h>
+#include <nanobind/ndarray.h>
 #include <nanobind/stl/optional.h>
 #include <nanobind/stl/pair.h>
 #include <nanobind/stl/string.h>
@@ -20,7 +21,8 @@
 #include "xgrammar/exception.h"

 namespace nb = nanobind;
-using namespace xgrammar;
+
+namespace xgrammar {

 std::vector<std::string> CommonEncodedVocabType(
     const nb::typed<nb::list, std::variant<std::string, nb::bytes>> encoded_vocab
@@ -39,6 +41,39 @@ std::vector<std::string> CommonEncodedVocabType(
   return encoded_vocab_strs;
 }

+bool GrammarMatcher_FillNextTokenBitmask(
+    GrammarMatcher& matcher, nb::ndarray<> arr, int32_t index, bool debug_print
+) {
+  if (arr.ndim() != 1 && arr.ndim() != 2) {
+    throw std::runtime_error("token_bitmask tensor must be 1D or 2D");
+  }
+
+  // 2. Device: ensure the tensor is on CPU
+  if (arr.device_type() != nb::device::cpu::value) {
+    throw std::runtime_error("token_bitmask array must be on CPU");
+  }
+
+  // 3. Data type: ensure 32-bit integers
+  if (arr.dtype() != nb::dtype<int32_t>()) {
+    throw std::runtime_error("token_bitmask array must be int32");
+  }
+
+  // Under the hood these are stored with the same standard (DLPack), but nanobind
+  // defines its own types, and doesn't expose a way to just get the object directly.
+  // We'll just do some pointer hackery to get there, rather than build the type back up manually:
+
+  // The data in an ndarray is defined as:
+  // detail::ndarray_handle* m_handle = nullptr;
+  // dlpack::dltensor m_dltensor;
+  // Assert this, then skip over m_handle and reinterpret m_dltensor.
+  static_assert(sizeof(arr) == sizeof(void*) + sizeof(nb::dlpack::dltensor));
+
+  DLTensor* bitmask_dltensor_ptr =
+      reinterpret_cast<::DLTensor*>(reinterpret_cast<char*>(&arr) + sizeof(void*));
+
+  return matcher.FillNextTokenBitmask(bitmask_dltensor_ptr, index, debug_print);
+}
+
 std::vector<nanobind::bytes> TokenizerInfo_GetDecodedVocab(const TokenizerInfo& tokenizer) {
   const auto& decoded_vocab = tokenizer.GetDecodedVocab();
   std::vector<nanobind::bytes> py_result;
@@ -55,6 +90,10 @@ static void RegisterRuntimeError(nb::module_& m, const char* name) {
   static_cast<void>(nb::exception<T>{m, name, PyExc_RuntimeError});
 }

+}  // namespace xgrammar
+
+using namespace xgrammar;
+
 NB_MODULE(xgrammar_bindings, m) {
   RegisterRuntimeError<DeserializeFormatError>(m, "DeserializeFormatError");
   RegisterRuntimeError<DeserializeVersionError>(m, "DeserializeVersionError");
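Worth noting about the pointer arithmetic above: the static_assert pins the size assumption (an nb::ndarray is exactly one handle pointer plus a DLPack tensor), so a nanobind release that changes the object's size breaks the build instead of silently reinterpreting garbage. A same-size reordering of the two members would still slip past the assert; that residual risk is the cost of the shortcut until nanobind exposes the underlying DLTensor directly.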

cpp/nanobind/python_methods.cc

Lines changed: 0 additions & 21 deletions
@@ -41,27 +41,6 @@ int TokenizerInfo_GetVocabType(const TokenizerInfo& tokenizer) {
   return static_cast<int>(tokenizer.GetVocabType());
 }

-bool GrammarMatcher_FillNextTokenBitmask(
-    GrammarMatcher& matcher,
-    intptr_t token_bitmask_ptr,
-    std::vector<int64_t> shape,
-    int32_t index,
-    bool debug_print
-) {
-  XGRAMMAR_CHECK(shape.size() == 1 || shape.size() == 2) << "token_bitmask tensor must be 1D or 2D";
-
-  DLTensor bitmask_dltensor{
-      reinterpret_cast<void*>(token_bitmask_ptr),
-      DLDevice{kDLCPU, 0},
-      static_cast<int32_t>(shape.size()),
-      GetBitmaskDLType(),
-      shape.data(),
-      nullptr,
-      0
-  };
-  return matcher.FillNextTokenBitmask(&bitmask_dltensor, index, debug_print);
-}
-
 std::vector<int> Testing_DebugGetMaskedTokensFromBitmask(
     intptr_t token_bitmask_ptr, std::vector<int64_t> shape, int32_t vocab_size, int32_t index
 ) {

cpp/nanobind/python_methods.h

Lines changed: 0 additions & 8 deletions
@@ -28,14 +28,6 @@ TokenizerInfo TokenizerInfo_Init(

 int TokenizerInfo_GetVocabType(const TokenizerInfo& tokenizer);

-bool GrammarMatcher_FillNextTokenBitmask(
-    GrammarMatcher& matcher,
-    intptr_t token_bitmask_ptr,
-    std::vector<int64_t> shape,
-    int32_t index,
-    bool debug_print
-);
-
 std::vector<int> Testing_DebugGetMaskedTokensFromBitmask(
     intptr_t token_bitmask_ptr, std::vector<int64_t> shape, int32_t vocab_size, int32_t index
 );

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ dependencies = [
     "triton; platform_system == 'Linux' and platform_machine == 'x86_64'",
     "mlx-lm; platform_system == 'Darwin' and platform_machine == 'arm64'",
     "ninja",
+    "numpy",
     "typing-extensions>=4.9.0",
 ]

python/xgrammar/matcher.py

Lines changed: 5 additions & 8 deletions
@@ -7,6 +7,7 @@
 from typing import List, Optional, Tuple, Union

 import torch
+from numpy.typing import ArrayLike

 from .base import XGRObject, _core
 from .compiler import CompiledGrammar
@@ -281,7 +282,7 @@ def accept_string(self, input_str: Union[str, bytes], *, debug_print: bool = Fal
         return self._handle.accept_string(input_str, debug_print)

     def fill_next_token_bitmask(
-        self, bitmask: torch.Tensor, index: int = 0, *, debug_print: bool = False
+        self, bitmask: ArrayLike, index: int = 0, *, debug_print: bool = False
     ) -> bool:
         """Fill the bitmask for the next token prediction. The input bitmask can be generated
         by allocate_token_bitmask, and must be on CPU. bitmask[index] will be filled with the
@@ -309,15 +310,11 @@ def fill_next_token_bitmask(
         Raises
         ------
         RuntimeError
+            If the bitmask is invalid (not on CPU, not int32, shape mismatch).
+
             If the recursion depth is exceeded.
         """
-        if bitmask.device.type != "cpu":
-            raise ValueError("bitmask should be on CPU.")
-        if bitmask.dtype != bitmask_dtype:
-            raise ValueError(f"bitmask should be of type {bitmask_dtype}.")
-        return self._handle.fill_next_token_bitmask(
-            bitmask.data_ptr(), list(bitmask.shape), index, debug_print
-        )
+        return self._handle.fill_next_token_bitmask(bitmask, index, debug_print)

     def find_jump_forward_string(self) -> str:
         """Find the jump-forward string for jump-forward decoding. This is the longest string that

tests/python/test_grammar_matcher_basic.py

Lines changed: 32 additions & 0 deletions
@@ -1,5 +1,6 @@
 """Test the basic functionality of GrammarMatcher."""

+import math
 import sys
 from typing import List, Optional, Union

@@ -15,6 +16,8 @@
     _is_grammar_accept_string,
 )

+_is_cuda_available = torch.cuda.is_available()
+
 json_grammar = xgr.Grammar.builtin_json_grammar()


@@ -363,5 +366,34 @@ def test_override_stop_tokens(tokenizer_path: str, override_stop_tokens: List[in
     assert matcher_2.stop_token_ids == override_stop_tokens


+def test_fill_next_token_bitmask_errors():
+    # llama 3.1 8b
+    tokenizer = AutoTokenizer.from_pretrained(
+        "meta-llama/Meta-Llama-3-8B-Instruct", use_fast=True, trust_remote_code=True
+    )
+    tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer)
+    matcher = _get_matcher_from_grammar_and_tokenizer_info(json_grammar, tokenizer_info)
+
+    bitmask1 = torch.zeros(1, math.ceil(tokenizer_info.vocab_size / 32) - 1, dtype=torch.int32)
+    with pytest.raises(RuntimeError):
+        matcher.fill_next_token_bitmask(bitmask1)
+
+    bitmask2 = torch.zeros(1, math.ceil(tokenizer_info.vocab_size / 32), dtype=torch.int32)
+    with pytest.raises(RuntimeError):
+        matcher.fill_next_token_bitmask(bitmask2, index=1)
+
+    bitmask3 = torch.zeros(1, math.ceil(tokenizer_info.vocab_size / 32), dtype=torch.float32)
+    with pytest.raises(RuntimeError):
+        matcher.fill_next_token_bitmask(bitmask3)
+
+    if _is_cuda_available:
+        bitmask3 = torch.zeros(1, math.ceil(tokenizer_info.vocab_size / 32), 1, dtype=torch.int32)
+        with pytest.raises(RuntimeError):
+            matcher.fill_next_token_bitmask(bitmask3)
+
+    bitmask_correct = torch.zeros(1, math.ceil(tokenizer_info.vocab_size / 32), dtype=torch.int32)
+    matcher.fill_next_token_bitmask(bitmask_correct)
+
+
 if __name__ == "__main__":
     pytest.main(sys.argv)
