
Commit 90ec006

Authored by zyongye, LiuXiaoxuanPKU, simon-mo, heheda12345, and WoosukKwon
[gpt-oss] flashinfer attention sink init (vllm-project#22330)
Signed-off-by: simon-mo <[email protected]>
Co-authored-by: LiuXiaoxuanPKU <[email protected]>
Co-authored-by: simon-mo <[email protected]>
Co-authored-by: Chen Zhang <[email protected]>
Co-authored-by: Woosuk Kwon <[email protected]>
Co-authored-by: Hongxia Yang <[email protected]>
Co-authored-by: Minseok Lee <[email protected]>
1 parent a47e6ff commit 90ec006

File tree

1 file changed: +10 -0 lines changed

vllm/v1/attention/backends/flashinfer.py

Lines changed: 10 additions & 0 deletions
@@ -611,6 +611,7 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[str] = None,
+        sinks: Optional[torch.Tensor] = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -635,6 +636,15 @@ def __init__(
                 "are not implemented for "
                 "FlashInferImpl")

+        self.sinks: Optional[torch.Tensor] = None
+        if sinks is not None:
+            assert sinks.shape[0] == num_heads, (
+                "Sinks must have the same number of heads "
+                "as the number of heads in the layer"
+            )
+            assert sinks.dtype == torch.float32, "Sinks must be of type float32"
+            self.sinks = sinks
+
     def forward(
         self,
         layer: torch.nn.Module,

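Note that this change only threads the attention-sink tensor through FlashInferImpl's constructor and validates it; nothing in this diff consumes the sinks during the attention computation itself. As a minimal standalone sketch of what the new checks accept (the num_heads value and the zero initialization below are assumptions for illustration, not part of the commit):

    import torch

    num_heads = 64  # assumed example value; in vLLM this comes from the model config

    # One float32 sink value per attention head, matching the shape and dtype
    # that the new __init__ checks require.
    sinks = torch.zeros(num_heads, dtype=torch.float32)

    # Same validation pattern as the code added in this commit:
    assert sinks.shape[0] == num_heads, (
        "Sinks must have the same number of heads "
        "as the number of heads in the layer"
    )
    assert sinks.dtype == torch.float32, "Sinks must be of type float32"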