Commit 4ccf711

Bind fill_next_token_bitmask against nb::ndarray

1 parent: b87ed7f

File tree

8 files changed: +45, -43 lines

cpp/grammar_matcher.cc

Lines changed: 6 additions & 4 deletions

@@ -270,7 +270,9 @@ class GrammarMatcher::Impl : public GrammarMatcherBase {

   bool AcceptString(const std::string& input_str, bool debug_print = false);

-  bool FillNextTokenBitmask(DLTensor* next_token_bitmask, int index, bool debug_print = false);
+  bool FillNextTokenBitmask(
+      const DLTensor& next_token_bitmask, int index, bool debug_print = false
+  );

   std::string FindJumpForwardString();

@@ -493,13 +495,13 @@ bool GrammarMatcher::Impl::IsTokenBitmaskAllTrue(int32_t* bitmask_data_ptr) {
 }

 bool GrammarMatcher::Impl::FillNextTokenBitmask(
-    DLTensor* next_token_bitmask, int index, bool debug_print
+    const DLTensor& next_token_bitmask, int index, bool debug_print
 ) {
   XGRAMMAR_CHECK(!IsStopTokenAccepted())
       << "GrammarMatcher has terminated after accepting the stop token, but is trying to "
          "find the next token mask";
   int32_t* bitmask_data_ptr =
-      CheckAndGetBitmaskPtr(*next_token_bitmask, tokenizer_info_.GetVocabSize(), index);
+      CheckAndGetBitmaskPtr(next_token_bitmask, tokenizer_info_.GetVocabSize(), index);
   const auto& sorted_decoded_vocab = tokenizer_info_.GetSortedDecodedVocab();
   const auto& adaptive_token_mask_cache = compiled_grammar_->adaptive_token_mask_cache;
   const auto& latest_stack_tops = stack_tops_history_.GetLatest();

@@ -851,7 +853,7 @@ bool GrammarMatcher::AcceptString(const std::string& input_str, bool debug_print
 }

 bool GrammarMatcher::FillNextTokenBitmask(
-    DLTensor* next_token_bitmask, int index, bool debug_print
+    const DLTensor& next_token_bitmask, int index, bool debug_print
 ) {
   return pimpl_->FillNextTokenBitmask(next_token_bitmask, index, debug_print);
 }

cpp/nanobind/nanobind.cc

Lines changed: 31 additions & 0 deletions

@@ -4,6 +4,7 @@
  */

 #include <nanobind/nanobind.h>
+#include <nanobind/ndarray.h>
 #include <nanobind/stl/optional.h>
 #include <nanobind/stl/pair.h>
 #include <nanobind/stl/string.h>

@@ -22,6 +23,8 @@
 namespace nb = nanobind;
 using namespace xgrammar;

+namespace {
+
 std::vector<std::string> CommonEncodedVocabType(
     const nb::typed<nb::list, std::variant<std::string, nb::bytes>> encoded_vocab
 ) {

@@ -39,6 +42,32 @@ std::vector<std::string> CommonEncodedVocabType(
   return encoded_vocab_strs;
 }

+bool GrammarMatcher_FillNextTokenBitmask(
+    GrammarMatcher& matcher,
+    nb::ndarray<int32_t, nb::device::cpu> arr,
+    int32_t index,
+    bool debug_print
+) {
+  if (arr.ndim() != 1 && arr.ndim() != 2) {
+    throw nb::type_error("token_bitmask tensor must be 1D or 2D");
+  }
+
+  // Under the hood these are stored with the same standard (DLPack), but nanobind
+  // defines its own types, and doesn't expose a way to just get the object directly.
+  // We'll just do some pointer hackery to get there, rather than build the type back up manually:
+
+  // The data in an ndarray is defined as:
+  //   detail::ndarray_handle* m_handle = nullptr;
+  //   dlpack::dltensor m_dltensor;
+  // Assert this, then skip over m_handle and reinterpret m_dltensor.
+  static_assert(sizeof(arr) == sizeof(void*) + sizeof(nb::dlpack::dltensor));
+
+  const DLTensor& bitmask_dltensor =
+      *reinterpret_cast<::DLTensor*>(reinterpret_cast<char*>(&arr) + sizeof(void*));
+
+  return matcher.FillNextTokenBitmask(bitmask_dltensor, index, debug_print);
+}
+
 std::vector<nanobind::bytes> TokenizerInfo_GetDecodedVocab(const TokenizerInfo& tokenizer) {
   const auto& decoded_vocab = tokenizer.GetDecodedVocab();
   std::vector<nanobind::bytes> py_result;

@@ -49,6 +78,8 @@ std::vector<nanobind::bytes> TokenizerInfo_GetDecodedVocab(const TokenizerInfo&
   return py_result;
 }

+}  // namespace
+
 NB_MODULE(xgrammar_bindings, m) {
   auto pyTokenizerInfo = nb::class_<TokenizerInfo>(m, "TokenizerInfo");
   pyTokenizerInfo
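The comment in the added binding weighs this layout trick against rebuilding the tensor "manually" from nanobind's public accessors. For reference, a minimal sketch of that manual route is shown below. It is not part of this commit; the helper name BuildBitmaskDLTensor is made up here, and it assumes nanobind's documented ndarray accessors (data(), ndim(), shape_ptr(), stride_ptr()) together with the headers already included above.

// Illustrative alternative (not from this commit): rebuild the DLTensor from
// the ndarray's public accessors instead of reinterpreting its object layout.
// Assumes nanobind's data()/ndim()/shape_ptr()/stride_ptr() accessors.
DLTensor BuildBitmaskDLTensor(nb::ndarray<int32_t, nb::device::cpu>& arr) {
  DLTensor tensor{};
  tensor.data = arr.data();                              // int32_t* buffer
  tensor.device = DLDevice{kDLCPU, 0};                   // the binding already restricts to CPU
  tensor.ndim = static_cast<int32_t>(arr.ndim());
  tensor.dtype.code = kDLInt;                            // int32, one lane
  tensor.dtype.bits = 32;
  tensor.dtype.lanes = 1;
  tensor.shape = const_cast<int64_t*>(arr.shape_ptr());  // shape/strides remain owned by `arr`
  tensor.strides = const_cast<int64_t*>(arr.stride_ptr());
  tensor.byte_offset = 0;
  return tensor;  // only valid while `arr` stays alive
}

The committed version skips this bookkeeping and instead leans on the static_assert above to catch any future change to the ndarray layout at compile time.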

cpp/nanobind/python_methods.cc

Lines changed: 0 additions & 21 deletions

@@ -39,27 +39,6 @@ int TokenizerInfo_GetVocabType(const TokenizerInfo& tokenizer) {
   return static_cast<int>(tokenizer.GetVocabType());
 }

-bool GrammarMatcher_FillNextTokenBitmask(
-    GrammarMatcher& matcher,
-    intptr_t token_bitmask_ptr,
-    std::vector<int64_t> shape,
-    int32_t index,
-    bool debug_print
-) {
-  XGRAMMAR_CHECK(shape.size() == 1 || shape.size() == 2) << "token_bitmask tensor must be 1D or 2D";
-
-  DLTensor bitmask_dltensor{
-      reinterpret_cast<void*>(token_bitmask_ptr),
-      DLDevice{kDLCPU, 0},
-      static_cast<int32_t>(shape.size()),
-      GetBitmaskDLType(),
-      shape.data(),
-      nullptr,
-      0
-  };
-  return matcher.FillNextTokenBitmask(&bitmask_dltensor, index, debug_print);
-}
-
 std::vector<int> Testing_DebugGetMaskedTokensFromBitmask(
     intptr_t token_bitmask_ptr, std::vector<int64_t> shape, int32_t vocab_size, int32_t index
 ) {

cpp/nanobind/python_methods.h

Lines changed: 0 additions & 8 deletions

@@ -27,14 +27,6 @@ TokenizerInfo TokenizerInfo_Init(

 int TokenizerInfo_GetVocabType(const TokenizerInfo& tokenizer);

-bool GrammarMatcher_FillNextTokenBitmask(
-    GrammarMatcher& matcher,
-    intptr_t token_bitmask_ptr,
-    std::vector<int64_t> shape,
-    int32_t index,
-    bool debug_print
-);
-
 std::vector<int> Testing_DebugGetMaskedTokensFromBitmask(
     intptr_t token_bitmask_ptr, std::vector<int64_t> shape, int32_t vocab_size, int32_t index
 );

include/xgrammar/matcher.h

Lines changed: 3 additions & 1 deletion

@@ -107,7 +107,9 @@ class GrammarMatcher {
   * and with shape (GetBitmaskSize(),) and dtype int32.
   * \return Whether the bitmask need to be applied (not all-true).
   */
-  bool FillNextTokenBitmask(DLTensor* next_token_bitmask, int index = 0, bool debug_print = false);
+  bool FillNextTokenBitmask(
+      const DLTensor& next_token_bitmask, int index = 0, bool debug_print = false
+  );

   /*!
   * \brief Find the jump-forward string for jump-forward decoding. This is the longest string that
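For context, the doc comment above says the bitmask buffer is int32 with shape (GetBitmaskSize(),) and that the return value tells the caller whether the mask needs to be applied. A caller-side sketch of the new by-reference signature follows; it is not from this commit, the helper name FillMaskIntoBuffer is made up, and the ceil(vocab_size / 32) sizing is an assumption standing in for GetBitmaskSize(). The DLTensor setup mirrors the pattern in web/src/xgrammar_binding.cc further down.

#include <dlpack/dlpack.h>
#include <xgrammar/matcher.h>

#include <vector>

// Illustrative caller-side sketch (not from this commit): wrap a caller-owned
// int32 buffer in a DLTensor and pass it by const reference, as the new
// signature expects.
bool FillMaskIntoBuffer(xgrammar::GrammarMatcher& matcher, int vocab_size,
                        std::vector<int32_t>& storage) {
  // Assumption: one bit per token, packed into int32 words, i.e.
  // ceil(vocab_size / 32) entries, standing in for GetBitmaskSize().
  storage.assign((vocab_size + 31) / 32, 0);
  int64_t shape[1] = {static_cast<int64_t>(storage.size())};

  DLTensor bitmask{};
  bitmask.data = storage.data();
  bitmask.device = DLDevice{kDLCPU, 0};
  bitmask.ndim = 1;
  bitmask.dtype.code = kDLInt;
  bitmask.dtype.bits = 32;
  bitmask.dtype.lanes = 1;
  bitmask.shape = shape;
  bitmask.strides = nullptr;  // contiguous, per the DLPack convention
  bitmask.byte_offset = 0;

  // True means the mask actually constrains the vocabulary (it is not
  // all-true) and should be applied to the logits.
  return matcher.FillNextTokenBitmask(bitmask, /*index=*/0);
}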

pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -22,6 +22,7 @@ dependencies = [
   "triton; platform_system == 'Linux' and platform_machine == 'x86_64'",
   "mlx-lm; platform_system == 'Darwin' and platform_machine == 'arm64'",
   "ninja",
+  "numpy",
 ]
 dynamic = ["version"]

python/xgrammar/matcher.py

Lines changed: 3 additions & 8 deletions

@@ -6,6 +6,7 @@
 from typing import List, Optional, Tuple, Union

 import torch
+from numpy.typing import ArrayLike

 from .base import XGRObject, _core
 from .compiler import CompiledGrammar

@@ -269,7 +270,7 @@ def accept_string(self, input_str: Union[str, bytes], *, debug_print: bool = Fal
         return self._handle.accept_string(input_str, debug_print)

     def fill_next_token_bitmask(
-        self, bitmask: torch.Tensor, index: int = 0, *, debug_print: bool = False
+        self, bitmask: ArrayLike, index: int = 0, *, debug_print: bool = False
     ) -> bool:
         """Fill the bitmask for the next token prediction. The input bitmask can be generated
         by allocate_token_bitmask, and must be on CPU. bitmask[index] will be filled with the

@@ -299,13 +300,7 @@ def fill_next_token_bitmask(
         RuntimeError
             If the recursion depth is exceeded.
         """
-        if bitmask.device.type != "cpu":
-            raise ValueError("bitmask should be on CPU.")
-        if bitmask.dtype != bitmask_dtype:
-            raise ValueError(f"bitmask should be of type {bitmask_dtype}.")
-        return self._handle.fill_next_token_bitmask(
-            bitmask.data_ptr(), list(bitmask.shape), index, debug_print
-        )
+        return self._handle.fill_next_token_bitmask(bitmask, index, debug_print)

     def find_jump_forward_string(self) -> str:
         """Find the jump-forward string for jump-forward decoding. This is the longest string that

web/src/xgrammar_binding.cc

Lines changed: 1 addition & 1 deletion

@@ -85,7 +85,7 @@ std::vector<int32_t> GrammarMatcher_GetNextTokenBitmask(GrammarMatcher& matcher,
   tensor.strides = &strides[0];
   tensor.byte_offset = 0;
   // 3. Populate tensor, hence result
-  matcher.FillNextTokenBitmask(&tensor);
+  matcher.FillNextTokenBitmask(tensor);
   return result;
 }
