Feature(MInference): add xAttention (#149)

iofu728 · Guangxuan-Xiao · web-flow · commit 91a0506f2bf7 · 2025-05-12T20:26:39.000+08:00
Co-authored-by: Guangxuan Xiao <xiaoguangxuan@gmail.com>
diff --git a/README.md b/README.md
@@ -87,7 +87,7 @@ supported_kv_types = MInferenceConfig.get_available_kv_types()
 
 Currently, we support the following long-context methods:
 
-- **[① KV Cache Generation]:** [MInference](https://arxiv.org/abs/2407.02490), [FlexPrefill](https://openreview.net/forum?id=OfjIlbelrT), [A-shape](https://arxiv.org/abs/2309.17453), [Tri-shape](https://arxiv.org/abs/2412.10319), [MInference w/ static](https://arxiv.org/abs/2407.02490), [Dilated](https://arxiv.org/abs/2004.05150), [Strided](https://arxiv.org/abs/1904.10509)
+- **[① KV Cache Generation]:** [MInference](https://arxiv.org/abs/2407.02490), [xAttention](https://arxiv.org/abs/2503.16428), [FlexPrefill](https://arxiv.org/abs/2502.20766), [A-shape](https://arxiv.org/abs/2309.17453), [Tri-shape](https://arxiv.org/abs/2412.10319), [MInference w/ static](https://arxiv.org/abs/2407.02490), [Dilated](https://arxiv.org/abs/2004.05150), [Strided](https://arxiv.org/abs/1904.10509)
 - **[② KV Cache Compression]:** [StreamingLLM](https://arxiv.org/abs/2309.17453), [SnapKV](https://arxiv.org/abs/2404.14469), [PyramidKV](https://arxiv.org/abs/2406.02069), [KIVI](https://arxiv.org/abs/2402.02750)
 - **[③ KV Cache Retrieval]:** [CacheBlend](https://arxiv.org/abs/2405.16444)
 - **[④ KV Cache Loading]:** [Quest](https://arxiv.org/abs/2406.10774), [RetrievalAttention](https://arxiv.org/abs/2409.10516)
diff --git a/minference/minference_configuration.py b/minference/minference_configuration.py
@@ -1,8 +1,6 @@
-# Copyright (c) 2024 Microsoft
+# Copyright (c) 2024-2025 Microsoft
 # Licensed under The MIT License [see LICENSE for details]
 
-import os
-
 from .configs.model2path import MODEL2PATH
 
 
@@ -27,6 +25,7 @@ class MInferenceConfig:
         "inf_llm",
         "flexprefill",
         "vllm_flexprefill",
+        "xattention",
     ]
     KV_TYPES = [
         "dense",
@@ -72,7 +71,7 @@ def __init__(
         self.kv_type = kv_type
         self.attn_kwargs = attn_kwargs
 
-    def update_config_path(self, config_path: str, model_name: str):
+    def update_config_path(self, config_path: str = None, model_name: str = None):
         if self.attn_type in self.OTHER_ATTENTION_TYPES:
             return ""
         if config_path is not None:
diff --git a/minference/models_patch.py b/minference/models_patch.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024 Microsoft
+# Copyright (c) 2024-2025 Microsoft
 # Licensed under The MIT License [see LICENSE for details]
 
 import json
@@ -97,7 +97,7 @@ def patch_model(self, model):
             self.config.attn_kwargs.setdefault("n_last", 100)
             model = new_patch(model, self.config)
 
-        elif self.config.attn_type in ["flexprefill", "dense"]:
+        elif self.config.attn_type in ["flexprefill", "dense", "xattention"]:
             model = new_patch(model, self.config)
 
         elif self.config.attn_type == "dilated1":
diff --git a/minference/modules/forward.py b/minference/modules/forward.py
@@ -13,6 +13,7 @@
 from ..modules.minference_forward import minference_prefill_forward
 from ..modules.quest import quest_decode_kernel
 from ..modules.retr_attn import retr_attn
+from ..modules.xattention import xattention_forward
 from ..ops.streaming_kernel import a_shape_kernel, tri_shape_kernel
 
 
@@ -187,6 +188,7 @@ def attn_forward(
     "tri_shape": tri_shape_kernel,
     "minference": minference_prefill_forward,
     "flexprefill": flexprefill_forward,
+    "xattention": xattention_forward,
 }
 
 decoding_forwards = {
diff --git a/minference/modules/xattention.py b/minference/modules/xattention.py
diff --git a/minference/ops/xattention_fa.py b/minference/ops/xattention_fa.py