
Commit 591dff9

Fix and improve apply_token_bitmask benchmark script (#391)
Currently, the bench script is not runnable (the import `from xgrammar.kernels import apply_token_bitmask_inplace_kernels` is not found).

# Change

- Update the script to make it runnable
- Kick off multiple setups in a single run, so a benchmark report can be created in one shot

# Usage

```bash
(xgrammar) Fri Aug 08 22:18:25 [/data/users/jialino/gitrepos/xgrammar] python3 examples/benchmark/bench_apply_token_bitmask_inplace.py
Running cmake --build & --install in /data/users/jialino/gitrepos/xgrammar/build
ninja: no work to do.
-- Install configuration: "RelWithDebInfo"
-- Up-to-date: /home/jialino/uv_env/xgrammar/lib64/python3.12/site-packages/xgrammar/./xgrammar_bindings.cpython-312-x86_64-linux-gnu.so
W0808 22:18:51.578000 320509 torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation.
W0808 22:18:51.578000 320509 torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [01:13<00:00, 4.92s/it]
| Batch | Vocab | Masked cnt | Torch Compile | Triton |
| size | size | | Baseline us | us (speedup) |
|--------:|--------:|-------------:|----------------:|----------------:|
| 1 | 128000 | 1 | 6.04 | 5.52 (1.09x) |
| 1 | 128000 | 64000 | 5.96 | 6.16 (0.97x) |
| 1 | 128000 | 127000 | 6.01 | 6.27 (0.96x) |
| 8 | 128000 | 1 | 10.90 | 6.04 (1.81x) |
| 8 | 128000 | 64000 | 10.90 | 7.76 (1.40x) |
| 8 | 128000 | 127000 | 10.91 | 8.02 (1.36x) |
| 64 | 128000 | 1 | 48.72 | 13.36 (3.65x) |
| 64 | 128000 | 64000 | 48.74 | 46.35 (1.05x) |
| 64 | 128000 | 127000 | 48.74 | 33.26 (1.47x) |
| 512 | 128000 | 1 | 350.11 | 67.43 (5.19x) |
| 512 | 128000 | 64000 | 347.57 | 330.76 (1.05x) |
| 512 | 128000 | 127000 | 345.73 | 250.06 (1.38x) |
| 4096 | 128000 | 1 | 2903.81 | 494.67 (5.87x) |
| 4096 | 128000 | 64000 | 2855.70 | 2516.79 (1.13x) |
| 4096 | 128000 | 127000 | 2720.98 | 1936.44 (1.41x) |
```

Signed-off-by: Jialin Ouyang <[email protected]>
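For context, a minimal sketch of the masking step the benchmark times, based on the imports and helpers the updated script uses (assumes a CUDA device and an xgrammar build with the Triton kernel available; the call shapes mirror how the script invokes the kernels):

```python
# Minimal sketch: build a token bitmask and apply it in place with the Triton kernel.
import torch

from xgrammar.kernels.apply_token_bitmask_inplace_triton import apply_token_bitmask_inplace_triton
from xgrammar.testing import _bool_mask_to_bitmask

batch_size, vocab_size = 8, 128000
logits = torch.randn(batch_size, vocab_size, device="cuda")

# Allow every token except id 0 in each row.
bool_mask = torch.ones(batch_size, vocab_size, dtype=torch.bool, device="cuda")
bool_mask[:, 0] = False

# Pack the boolean mask into int32 words (32 tokens per word), as the script does.
bitmask = _bool_mask_to_bitmask(bool_mask)

# Disallowed logits are set to -inf in place; an optional `indices` argument
# restricts masking to selected batch rows (what the script's --stride exercises).
apply_token_bitmask_inplace_triton(logits, bitmask)
assert torch.isinf(logits[:, 0]).all()
```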
1 parent 77b1888 commit 591dff9

File tree

2 files changed: +108 -76 lines changed


examples/benchmark/README.md

Lines changed: 20 additions & 38 deletions
@@ -21,44 +21,26 @@ python3 bench_grammar_compile_mask_gen.py [-h] [--backend {xgrammar,outlines,lmf
 
 #### Run
 ```bash
-python3 bench_apply_token_bitmask_inplace.py [-h] [--impl {cuda,triton}]
-                                             [--batch_size BATCH_SIZE] [--vocab_size VOCAB_SIZE]
-                                             [--masked_cnt MASKED_CNT] [--stride STRIDE]
-                                             [--logits_dtype {float32,float16,bfloat16}]
-                                             [--warmup WARMUP] [--rep REP]
+python3 examples/benchmark/bench_apply_token_bitmask_inplace.py
 ```
 
 #### Results
-
-| GPU | Batch size | Vocab size | Masked cnt | Triton (μs) | CUDA (μs) | Speedup |
-|:--------------:|-----------:|-----------:|-----------:|-------------:|----------:|--------:|
-| H100 80GB HBM3 | 1 | 128k | 1k | 5.95 | 6.57 | 0.91x |
-| | 1 | 128k | 64k | 6.38 | 6.46 | 0.99x |
-| | 1 | 128k | 127k | 6.69 | 6.48 | 1.03x |
-| | 8 | 128k | 1k | 6.77 | 6.94 | 0.98x |
-| | 8 | 128k | 64k | 8.05 | 9.19 | 0.88x |
-| | 8 | 128k | 127k | 8.49 | 8.08 | 1.05x |
-| | 64 | 128k | 1k | 14.97 | 13.82 | 1.08x |
-| | 64 | 128k | 64k | 43.13 | 30.98 | 1.39x |
-| | 64 | 128k | 127k | 33.85 | 21.43 | 1.58x |
-| | 512 | 128k | 1k | 82.65 | 61.13 | 1.35x |
-| | 512 | 128k | 64k | 293.51 | 194.06 | 1.51x |
-| | 512 | 128k | 127k | 240.11 | 119.77 | 2.00x |
-| | 4096 | 128k | 1k | 566.17 | 417.33 | 1.36x |
-| | 4096 | 128k | 64k | 2198.59 | 1491.79 | 1.47x |
-| | 4096 | 128k | 127k | 1812.39 | 897.17 | 2.02x |
-| A100 SXM4 80GB | 1 | 128k | 1k | 8.32 | 7.97 | 1.04x |
-| | 1 | 128k | 64k | 9.26 | 8.24 | 1.12x |
-| | 1 | 128k | 127k | 8.81 | 8.71 | 1.01x |
-| | 8 | 128k | 1k | 9.56 | 10.31 | 0.93x |
-| | 8 | 128k | 64k | 12.72 | 13.22 | 0.96x |
-| | 8 | 128k | 127k | 13.45 | 11.27 | 1.19x |
-| | 64 | 128k | 1k | 22.95 | 25.57 | 0.90x |
-| | 64 | 128k | 64k | 58.52 | 56.47 | 1.04x |
-| | 64 | 128k | 127k | 44.83 | 39.29 | 1.14x |
-| | 512 | 128k | 1k | 132.92 | 108.60 | 1.22x |
-| | 512 | 128k | 64k | 362.08 | 349.54 | 1.04x |
-| | 512 | 128k | 127k | 306.75 | 233.20 | 1.32x |
-| | 4096 | 128k | 1k | 955.99 | 777.94 | 1.23x |
-| | 4096 | 128k | 64k | 2756.63 | 2707.57 | 1.02x |
-| | 4096 | 128k | 127k | 2472.82 | 1782.41 | 1.39x |
+H100
+| Batch | Vocab | Masked cnt | Torch Compile | Triton |
+| size | size | | Baseline us | us (speedup) |
+|--------:|--------:|-------------:|----------------:|----------------:|
+| 1 | 128000 | 1 | 6.04 | 5.52 (1.09x) |
+| 1 | 128000 | 64000 | 5.96 | 6.16 (0.97x) |
+| 1 | 128000 | 127000 | 6.01 | 6.27 (0.96x) |
+| 8 | 128000 | 1 | 10.90 | 6.04 (1.81x) |
+| 8 | 128000 | 64000 | 10.90 | 7.76 (1.40x) |
+| 8 | 128000 | 127000 | 10.91 | 8.02 (1.36x) |
+| 64 | 128000 | 1 | 48.72 | 13.36 (3.65x) |
+| 64 | 128000 | 64000 | 48.74 | 46.35 (1.05x) |
+| 64 | 128000 | 127000 | 48.74 | 33.26 (1.47x) |
+| 512 | 128000 | 1 | 350.11 | 67.43 (5.19x) |
+| 512 | 128000 | 64000 | 347.57 | 330.76 (1.05x) |
+| 512 | 128000 | 127000 | 345.73 | 250.06 (1.38x) |
+| 4096 | 128000 | 1 | 2903.81 | 494.67 (5.87x) |
+| 4096 | 128000 | 64000 | 2855.70 | 2516.79 (1.13x) |
+| 4096 | 128000 | 127000 | 2720.98 | 1936.44 (1.41x) |
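For reference, the speedup shown in parentheses divides the first (baseline) implementation's time by each other implementation's time. A minimal sketch of that row formatting, reusing the format string from the updated script with example values from the table above:

```python
# One report row: baseline time first, then each other implementation's time
# annotated with speedup = baseline_us / impl_us.
all_us = [48.72, 13.36]  # batch 64, masked cnt 1: Torch Compile baseline, Triton
row = [f"{all_us[0]:.2f}", *[f"{us:.2f} ({all_us[0] / us:>4.2f}x)" for us in all_us[1:]]]
print(row)  # ['48.72', '13.36 (3.65x)']
```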

examples/benchmark/bench_apply_token_bitmask_inplace.py

Lines changed: 88 additions & 38 deletions
@@ -12,38 +12,57 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
+from itertools import product
+from typing import Any
 
 import torch
+from tabulate import tabulate
+from tqdm import tqdm
 from triton.testing import do_bench
 
-from xgrammar.kernels import apply_token_bitmask_inplace_kernels
+from xgrammar.kernels.apply_token_bitmask_inplace_cuda import apply_token_bitmask_inplace_cuda
+from xgrammar.kernels.apply_token_bitmask_inplace_torch_compile import (
+    apply_token_bitmask_inplace_torch_compile,
+)
+from xgrammar.kernels.apply_token_bitmask_inplace_triton import apply_token_bitmask_inplace_triton
 from xgrammar.testing import _bool_mask_to_bitmask
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--impl", type=str, choices=["cuda", "triton"], default="cuda")
-    parser.add_argument("--batch_size", type=int, default=4096)
-    parser.add_argument("--vocab_size", type=int, default=128000)
-    parser.add_argument("--masked_cnt", type=int, default=1024)
-    parser.add_argument("--stride", type=int, default=1)
-    parser.add_argument(
-        "--logits_dtype", type=str, choices=["float32", "float16", "bfloat16"], default="float32"
-    )
-    parser.add_argument("--warmup", type=int, default=500)
-    parser.add_argument("--rep", type=int, default=2000)
-    args = parser.parse_args()
+IMPL_TORCH_COMPILE: str = "Torch Compile"
+IMPL_TRITON: str = "Triton"
+IMPL_CUDA: str = "CUDA"
+
+ALL_IMPLS: list[str] = [IMPL_TORCH_COMPILE, IMPL_TRITON, IMPL_CUDA]
+
+
+def bench_single_impl(
+    impl: str,
+    logits: torch.Tensor,
+    bitmask: torch.Tensor,
+    logits_expected: torch.Tensor,
+    kwargs: dict[str, Any],
+    args: argparse.Namespace,
+) -> float:
+    if impl == IMPL_TORCH_COMPILE:
+        f = lambda: apply_token_bitmask_inplace_torch_compile(logits, bitmask, **kwargs)
+    elif impl == IMPL_TRITON:
+        f = lambda: apply_token_bitmask_inplace_triton(logits, bitmask, **kwargs)
+    else:
+        f = lambda: apply_token_bitmask_inplace_cuda(logits, bitmask, **kwargs)
+
+    f()
+    torch.testing.assert_close(logits, logits_expected.to("cuda"))
+
+    torch.cuda.synchronize()
+    exec_time = do_bench(f, warmup=args.warmup, rep=args.rep)
+    return exec_time * 1000
 
+
+def bench_single_setup(batch_size: int, masked_cnt: int, args: argparse.Namespace) -> list[float]:
     vocab_size = args.vocab_size
-    batch_size = args.batch_size
-    bitmask_size = (vocab_size + 32 - 1) // 32
-    masked_cnt = args.masked_cnt
     stride = args.stride
     logits_dtype = getattr(torch, args.logits_dtype)
-
     logits = torch.randn(batch_size, vocab_size, dtype=logits_dtype, device="cuda")
-
     if masked_cnt >= vocab_size:
         bool_mask = torch.zeros(batch_size, vocab_size, dtype=torch.bool, device="cuda")
     else:
@@ -55,29 +74,60 @@
         bool_mask.scatter_(1, masked_positions, False)
         assert (bool_mask.sum(dim=-1) + masked_cnt == vocab_size).all().item()
     bitmask = _bool_mask_to_bitmask(bool_mask)
-
     masked_batch_ids = torch.arange(0, batch_size, stride, dtype=torch.int32, device="cuda")
     kwargs = {} if stride == 1 else {"indices": masked_batch_ids}
 
-    logits_expected = logits.clone()
-    logits_expected[masked_batch_ids] = torch.masked_fill(
-        logits_expected[masked_batch_ids], ~bool_mask[masked_batch_ids], float("-inf")
+    logits_copies = [logits.clone() for _ in range(len(args.impl))]
+    logits[masked_batch_ids] = torch.masked_fill(
+        logits[masked_batch_ids], ~bool_mask[masked_batch_ids], float("-inf")
     )
+    return [
+        bench_single_impl(impl, logits_copy, bitmask, logits, kwargs, args)
+        for impl, logits_copy in zip(args.impl, logits_copies)
+    ]
 
-    if args.impl == "cuda":
-        if "cuda" not in apply_token_bitmask_inplace_kernels:
-            raise ImportError("CUDA is not installed")
-        f = lambda: apply_token_bitmask_inplace_kernels["cuda"](logits, bitmask, **kwargs)
-    elif args.impl == "triton":
-        if "triton" not in apply_token_bitmask_inplace_kernels:
-            raise ImportError("Triton is not installed")
-        f = lambda: apply_token_bitmask_inplace_kernels["triton"](logits, bitmask, **kwargs)
 
-    f()
-    torch.testing.assert_close(logits, logits_expected.to("cuda"))
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--impl", type=str, nargs="*", choices=ALL_IMPLS, default=[IMPL_TORCH_COMPILE, IMPL_TRITON]
+    )
+    parser.add_argument("--batch-size", type=int, nargs="*", default=[1, 8, 64, 512, 4096])
+    parser.add_argument("--vocab-size", type=int, default=128000)
+    parser.add_argument("--masked-cnt", type=int, nargs="*", default=[1, 64000, 127000])
+    parser.add_argument("--stride", type=int, default=1)
+    parser.add_argument(
+        "--logits_dtype", type=str, choices=["float32", "float16", "bfloat16"], default="float32"
+    )
+    parser.add_argument("--warmup", type=int, default=500)
+    parser.add_argument("--rep", type=int, default=2000)
+    args = parser.parse_args()
 
-    torch.cuda.synchronize()
-    exec_time = do_bench(f, warmup=args.warmup, rep=args.rep)
-    exec_time *= 10**3
+    data_rows = []
+    for batch_size, masked_cnt in tqdm(list(product(args.batch_size, args.masked_cnt))):
+        all_us = bench_single_setup(batch_size, masked_cnt, args)
+        data_rows.append(
+            [
+                batch_size,
+                args.vocab_size,
+                masked_cnt,
+                f"{all_us[0]:.2f}",
+                *[f"{us:.2f} ({all_us[0]/us:>4.2f}x)" for us in all_us[1:]],
+            ]
+        )
 
-    print(f"Implementation: {args.impl}\t| Execution time (μs): {exec_time:.4f}")
+    print(
+        tabulate(
+            data_rows,
+            headers=[
+                "Batch\nsize",
+                "Vocab\nsize",
+                "Masked cnt",
+                f"{args.impl[0]}\nBaseline us",
+                *[f"{impl} \nus (speedup)" for impl in args.impl[1:]],
+            ],
+            tablefmt="pipe",
+            floatfmt=".2f",
+            colalign=["right"] * len(data_rows[0]),
+        )
+    )
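A note on sizes, assuming `_bool_mask_to_bitmask` packs the mask the same way as the ceil-division formula that appeared in the old script: each int32 word covers 32 tokens, so the default 128000-token vocabulary needs 4000 words per batch row.

```python
# Bitmask width per batch row: ceil(vocab_size / 32) int32 words.
vocab_size = 128000
bitmask_words = (vocab_size + 32 - 1) // 32
assert bitmask_words == 4000
```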
