Commit 73a3db9

[DCU] fix DCU w8a8c8 GEMM shape (#9115)
1 parent: ba9c345

File tree: 5 files changed, +20 -17 lines


llm/predict/export_model.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 
 import paddle
 from paddle.distributed import fleet
-from predict.predictor import ModelArgument, PredictorArgument, create_predictor
+from predictor import ModelArgument, PredictorArgument, create_predictor
 
 from paddlenlp.trainer import PdArgumentParser
 from paddlenlp.utils import llm_utils

paddlenlp/experimental/transformers/fused_transformer_layers.py

Lines changed: 13 additions & 10 deletions
@@ -50,16 +50,19 @@
     from paddlenlp_ops import cutlass_fp8_fp8_half_gemm_fused as fp8_gemm_fused
 else:
     from paddle.linalg import fp8_fp8_half_gemm_fused as fp8_gemm_fused
-from paddlenlp_ops import (
-    dequant_int8,
-    encode_rotary_qk,
-    gemm_dequant,
-    qkv_transpose_split,
-    quant_int8,
-    rebuild_padding,
-    transpose_remove_padding,
-    write_cache_kv,
-)
+try:
+    from paddlenlp_ops import (
+        dequant_int8,
+        encode_rotary_qk,
+        gemm_dequant,
+        qkv_transpose_split,
+        quant_int8,
+        rebuild_padding,
+        transpose_remove_padding,
+        write_cache_kv,
+    )
+except:
+    pass
 
 __all__ = [
     "MoeConfig",

paddlenlp/experimental/transformers/llama/modeling.py

Lines changed: 2 additions & 2 deletions
@@ -674,7 +674,7 @@ def __init__(self, config: LlamaConfig):
             use_neox_rotary_style=self.use_neox,
             cachekv_int8_type=config.cachekv_int8_type,
             rank_id=config.tensor_parallel_rank,
-            trans_qkvw=(False if paddle.is_compiled_with_rocm() and self.quant_type == "a8w8" else True),
+            trans_qkvw=(False if paddle.is_compiled_with_rocm() and "a8w8" in self.quant_type else True),
         )
 
         self.set_transformer_block(transformer_config)
@@ -861,7 +861,7 @@ def set_state_dict(self, state_dict):
                 unfused_state_dict["self_attn.v_proj.weight"] = state_dict[
                     "llama.layers.{}.self_attn.v_proj.weight".format(idx)
                 ]
-                if paddle.is_compiled_with_rocm() and self.quant_type == "a8w8":
+                if paddle.is_compiled_with_rocm() and "a8w8" in self.quant_type:
                     concated_qkv_weight = np.concatenate(
                         [
                             unfused_state_dict["self_attn.q_proj.weight"],
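
The condition change above is the core of the fix: the old equality test matched only the plain "a8w8" quantization type, so the "a8w8c8" variant (the w8a8c8 of the commit title, which additionally quantizes the cache KV to int8) fell through to the non-DCU path and produced a mismatched GEMM weight shape. The substring test covers both. A quick sketch of the two predicates over a few illustrative quant_type values:

    # Old vs. new DCU quantization check ("weight_only_int8" is just an
    # illustrative non-matching value).
    for quant_type in ["a8w8", "a8w8c8", "weight_only_int8"]:
        old = quant_type == "a8w8"
        new = "a8w8" in quant_type
        print(f"{quant_type:<18} old={old!s:<5} new={new}")
    # a8w8               old=True  new=True
    # a8w8c8             old=False new=True   <- previously missed on DCU
    # weight_only_int8   old=False new=False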

paddlenlp/experimental/transformers/mixtral/modeling.py

Lines changed: 2 additions & 2 deletions
@@ -338,7 +338,7 @@ def __init__(self, config: MixtralConfig):
             use_neox_rotary_style=self.use_neox,
             cachekv_int8_type=config.cachekv_int8_type,
             rank_id=config.tensor_parallel_rank,
-            trans_qkvw=(False if paddle.is_compiled_with_rocm() and self.quant_type == "a8w8" else True),
+            trans_qkvw=(False if paddle.is_compiled_with_rocm() and "a8w8" in self.quant_type else True),
             moe_config=moe_config,
         )
 
@@ -527,7 +527,7 @@ def set_state_dict(self, state_dict):
                 unfused_state_dict["self_attn.v_proj.weight"] = state_dict[
                     "mixtral.layers.{}.self_attn.v_proj.weight".format(idx)
                 ]
-                if paddle.is_compiled_with_rocm() and self.quant_type == "a8w8":
+                if paddle.is_compiled_with_rocm() and "a8w8" in self.quant_type:
                     concated_qkv_weight = np.concatenate(
                         [
                             unfused_state_dict["self_attn.q_proj.weight"],

paddlenlp/experimental/transformers/qwen2/modeling.py

Lines changed: 2 additions & 2 deletions
@@ -372,7 +372,7 @@ def __init__(self, config: Qwen2Config):
             use_neox_rotary_style=self.use_neox,
             cachekv_int8_type=config.cachekv_int8_type,
             rank_id=config.tensor_parallel_rank,
-            trans_qkvw=(False if paddle.is_compiled_with_rocm() and self.quant_type == "a8w8" else True),
+            trans_qkvw=(False if paddle.is_compiled_with_rocm() and "a8w8" in self.quant_type else True),
         )
 
         self.set_transformer_block(transformer_config)
@@ -433,7 +433,7 @@ def set_state_dict(self, state_dict):
                 unfused_state_dict["qwen2.self_attn.v_proj.weight"] = state_dict[
                     "qwen2.layers.{}.self_attn.v_proj.weight".format(idx)
                 ]
-                if paddle.is_compiled_with_rocm() and (self.quant_type == "a8w8" or self.quant_type == "a8w8c8"):
+                if paddle.is_compiled_with_rocm() and "a8w8" in self.quant_type:
                     concated_qkv_weight = np.concatenate(
                         [
                             unfused_state_dict["self_attn.q_proj.weight"],
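
Across llama, mixtral, and qwen2 the same pair of changes appears: trans_qkvw is forced to False only on ROCm with an a8w8-style quant_type, and the matching set_state_dict branch concatenates the Q/K/V projection weights rather than transposing them. Judging by the flag name (the fused layer's internal layout is not shown in this diff, so this is an inference), trans_qkvw controls whether the fused attention op stores the QKV weight transposed; keeping it untransposed is what gives the DCU int8 GEMM the shape it expects. A rough numpy illustration of the two layouts, with toy sizes and an illustrative axis choice:

    import numpy as np

    hidden = 4  # toy size; real models use e.g. 4096
    q = np.ones((hidden, hidden))
    k = np.ones_like(q)
    v = np.ones_like(q)

    qkv = np.concatenate([q, k, v], axis=-1)  # [hidden, 3*hidden]
    print(qkv.shape)    # (4, 12) -- the trans_qkvw=False layout
    print(qkv.T.shape)  # (12, 4) -- the trans_qkvw=True layout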
