
Commit 934a757

update register_sequence_parallel_allreduce_hooks
1 parent 37b1a42 commit 934a757

File tree

6 files changed: +12 −30 lines changed


llm/alignment/dpo/run_dpo.py

Lines changed: 0 additions & 5 deletions
@@ -43,7 +43,6 @@
     LlamaForCausalLMPipe,
     Qwen2ForCausalLM,
     Qwen2ForCausalLMPipe,
-    register_sequence_parallel_allreduce_hooks,
 )
 from paddlenlp.transformers.configuration_utils import LlmMetaConfig
 from paddlenlp.trl import (
@@ -154,10 +153,6 @@ def main():
     if model_args.flash_mask and not any(isinstance(model, cls) for cls in flash_mask_support_list):
         raise NotImplementedError(f"{model.__class__} not support flash mask.")

-    if training_args.sequence_parallel:
-        register_sequence_parallel_allreduce_hooks(
-            model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
-        )
     if model_args.tokenizer_name_or_path is not None:
         tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
     else:

llm/alignment/kto/run_kto.py

Lines changed: 0 additions & 5 deletions
@@ -38,7 +38,6 @@
     LlamaForCausalLM,
     LlamaForCausalLMPipe,
     Qwen2ForCausalLM,
-    register_sequence_parallel_allreduce_hooks,
 )
 from paddlenlp.transformers.configuration_utils import LlmMetaConfig
 from paddlenlp.trl import (
@@ -140,10 +139,6 @@ def main():
     if model_args.flash_mask and not any(isinstance(model, cls) for cls in flash_mask_support_list):
         raise NotImplementedError(f"{model.__class__} not support flash mask.")

-    if training_args.sequence_parallel:
-        register_sequence_parallel_allreduce_hooks(
-            model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
-        )
     if model_args.tokenizer_name_or_path is not None:
         tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
     else:

llm/alignment/rm/flashmask/run_reward.py

Lines changed: 1 addition & 9 deletions
@@ -35,11 +35,7 @@
     get_last_checkpoint,
     set_seed,
 )
-from paddlenlp.transformers import (
-    AutoConfig,
-    AutoTokenizer,
-    register_sequence_parallel_allreduce_hooks,
-)
+from paddlenlp.transformers import AutoConfig, AutoTokenizer
 from paddlenlp.utils.log import logger


@@ -126,10 +122,6 @@ def main():
         logger.warning("`flash_mask` must use with zero padding and flash attention.")
         model.config.use_flash_attention = True

-    if model_args.sequence_parallel:
-        register_sequence_parallel_allreduce_hooks(
-            model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
-        )
     if model_args.tokenizer_name_or_path is not None:
         tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
     else:

llm/run_finetune.py

Lines changed: 0 additions & 5 deletions
@@ -58,7 +58,6 @@
     LlamaTokenizer,
     Qwen2ForCausalLM,
     Qwen2ForCausalLMPipe,
-    register_sequence_parallel_allreduce_hooks,
 )
 from paddlenlp.transformers.configuration_utils import LlmMetaConfig
 from paddlenlp.trl import DataConfig, ModelConfig, SFTConfig, SFTTrainer
@@ -231,10 +230,6 @@ def neft_post_hook(module, input, output):
     else:
         raise NotImplementedError("Only support neftune for model with get_input_embeddings")

-    if training_args.sequence_parallel:
-        register_sequence_parallel_allreduce_hooks(
-            model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
-        )
     # Load tokenizer & dataset
     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, from_aistudio=model_args.from_aistudio)
     reft_layers = None

llm/run_pretrain.py

Lines changed: 0 additions & 6 deletions
@@ -41,7 +41,6 @@
     AutoTokenizer,
     CosineAnnealingWithWarmupDecay,
     LinearAnnealingWithWarmupDecay,
-    register_sequence_parallel_allreduce_hooks,
 )
 from paddlenlp.transformers.configuration_utils import LlmMetaConfig, llmmetaclass
 from paddlenlp.utils.batch_sampler import DistributedBatchSampler
@@ -492,11 +491,6 @@ def main():
     else:
         model = model_class.from_config(config, dtype=dtype)

-    if training_args.sequence_parallel:
-        register_sequence_parallel_allreduce_hooks(
-            model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
-        )
-
     if training_args.recompute:
         model.recompute_enable()

paddlenlp/trainer/trainer.py

Lines changed: 11 additions & 0 deletions
@@ -87,6 +87,12 @@
     from ..quantization.quantization_linear import QuantizationLinear
 except:
     QuantizationLinear = None
+try:
+    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+        register_sequence_parallel_allreduce_hooks,
+    )
+except:
+    pass
 from ..transformers.context_parallel_utils import split_inputs_sequence_dim_load_balance
 from ..transformers.model_utils import (
     PretrainedModel,
@@ -428,6 +434,11 @@ def _save_ckpt_func(state_dict, path, signal_path=None):
                     "We do not support skip_save_model_weight in peft model when using unified checkpoint, remove this config."
                 )

+        if args.sequence_parallel:
+            register_sequence_parallel_allreduce_hooks(
+                self.model, args.gradient_accumulation_steps, args.fuse_sequence_parallel_allreduce
+            )
+
         self.do_grad_scaling = False
         self.enable_autocast_context_manager = False
         if args.fp16 or args.bf16:
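
Usage note: after this commit, enabling sequence parallelism through the training arguments is enough; Trainer.__init__ registers the allreduce hooks itself, and the run scripts no longer call register_sequence_parallel_allreduce_hooks directly. The snippet below is a minimal sketch of that flow, not code from the commit: the checkpoint path and the way the arguments are constructed here are illustrative, while sequence_parallel, gradient_accumulation_steps and fuse_sequence_parallel_allreduce are the fields the Trainer actually reads in the hunk above.

# Minimal sketch (illustrative, not part of the commit).
from paddlenlp.trainer import Trainer, TrainingArguments
from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer

training_args = TrainingArguments(
    output_dir="./checkpoints",
    gradient_accumulation_steps=4,
    tensor_parallel_degree=2,                # sequence parallel is normally used together with tensor parallelism
    sequence_parallel=True,                  # Trainer.__init__ registers the hooks when this is set
    fuse_sequence_parallel_allreduce=False,  # forwarded unchanged to register_sequence_parallel_allreduce_hooks
)

model = AutoModelForCausalLM.from_pretrained("path/to/model")  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained("path/to/model")     # placeholder checkpoint

# No explicit register_sequence_parallel_allreduce_hooks(...) call in the script any more:
# Trainer(...) performs it internally when training_args.sequence_parallel is True.
trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer)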

0 commit comments
