vllm/v1/attention/backends: 1 file changed, +10 -0 lines

@@ -611,6 +611,7 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         attn_type: AttentionType = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[int] = None,
+        sinks: Optional[torch.Tensor] = None,
     ) -> None:
         self.num_heads = num_heads
         self.head_size = head_size
@@ -635,6 +636,15 @@ def __init__(
             "are not implemented for "
             "FlashInferImpl")
 
+        self.sinks: Optional[torch.Tensor] = None
+        if sinks is not None:
+            assert sinks.shape[0] == num_heads, (
+                "Sinks must have the same number of heads "
+                "as the number of heads in the layer"
+            )
+            assert sinks.dtype == torch.float32, "Sinks must be of type float32"
+            self.sinks = sinks
+
     def forward(
         self,
         layer: torch.nn.Module,
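
The change threads an optional `sinks` tensor into `FlashInferImpl.__init__` and asserts up front that it carries one entry per attention head and uses float32. As a minimal, self-contained sketch of that validation, assuming a hypothetical `validate_sinks` helper and an 8-head layer (neither is part of this diff):

```python
import torch
from typing import Optional

def validate_sinks(sinks: Optional[torch.Tensor],
                   num_heads: int) -> Optional[torch.Tensor]:
    # Mirrors the asserts added in __init__: one sink value per
    # attention head, stored as float32.
    if sinks is None:
        return None
    assert sinks.shape[0] == num_heads, (
        "Sinks must have the same number of heads "
        "as the number of heads in the layer")
    assert sinks.dtype == torch.float32, "Sinks must be of type float32"
    return sinks

# Hypothetical usage: a layer with 8 heads accepts an 8-element
# float32 tensor; a wrong length or dtype would trip an assert.
sinks = validate_sinks(torch.zeros(8, dtype=torch.float32), num_heads=8)
```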