2 changes: 1 addition & 1 deletion llm/README.md
@@ -367,7 +367,7 @@ python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" ./predict/flask_ser

#### 7.2 Large model serving deployment tool

(The one-line change in this hunk only inserts half-width spaces around the Latin-script terms "Triton", "gRPC", "HTTP", and "Token" in the Chinese source; translated, the old and new lines both read:)

This deployment tool is built on NVIDIA's Triton framework and is designed for serving large models in server scenarios. It provides gRPC and HTTP service interfaces as well as streaming token output. The underlying inference engine supports acceleration strategies such as continuous batching, weight-only INT8, and post-training quantization (PTQ), giving users an easy-to-use, high-performance deployment experience.

Deployment is based on a pre-built image. This section uses Meta-Llama-3-8B-Instruct-A8W8C8 as an example; for more models, see [LLaMA](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/docs/predict/llama.md), [Qwen](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/docs/predict/qwen.md), and [Mixtral](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/docs/predict/mixtral.md). More detailed model inference and quantization tutorials are available in the [LLM inference tutorial](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/docs/predict/inference.md):
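The deployment commands that follow are collapsed in this hunk. As an illustration of the streaming HTTP interface described above, a hypothetical client might look like the sketch below; the endpoint URL, port, and payload schema are assumptions for illustration, not the tool's documented API.

```python
# Hypothetical client sketch: SERVER_URL and the payload fields are assumptions,
# not the documented API of the PaddleNLP serving image.
import requests  # third-party HTTP client

SERVER_URL = "http://127.0.0.1:9965/v1/chat/completions"  # assumed address and path

payload = {
    "model": "Meta-Llama-3-8B-Instruct-A8W8C8",
    "messages": [{"role": "user", "content": "Introduce PaddleNLP in one sentence."}],
    "stream": True,  # exercise the streaming token output mentioned above
}

# Stream the response line by line as tokens arrive.
with requests.post(SERVER_URL, json=payload, stream=True, timeout=600) as resp:
    resp.raise_for_status()
    for line in resp.iter_lines():
        if line:
            print(line.decode("utf-8"))
```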

5 changes: 0 additions & 5 deletions llm/alignment/dpo/run_dpo.py
@@ -43,7 +43,6 @@
LlamaForCausalLMPipe,
Qwen2ForCausalLM,
Qwen2ForCausalLMPipe,
register_sequence_parallel_allreduce_hooks,
)
from paddlenlp.transformers.configuration_utils import LlmMetaConfig
from paddlenlp.trl import (
@@ -154,10 +153,6 @@ def main():
if model_args.flash_mask and not any(isinstance(model, cls) for cls in flash_mask_support_list):
raise NotImplementedError(f"{model.__class__} not support flash mask.")

if training_args.sequence_parallel:
register_sequence_parallel_allreduce_hooks(
model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
)
if model_args.tokenizer_name_or_path is not None:
tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
else:
5 changes: 0 additions & 5 deletions llm/alignment/kto/run_kto.py
@@ -38,7 +38,6 @@
LlamaForCausalLM,
LlamaForCausalLMPipe,
Qwen2ForCausalLM,
register_sequence_parallel_allreduce_hooks,
)
from paddlenlp.transformers.configuration_utils import LlmMetaConfig
from paddlenlp.trl import (
@@ -140,10 +139,6 @@ def main():
if model_args.flash_mask and not any(isinstance(model, cls) for cls in flash_mask_support_list):
raise NotImplementedError(f"{model.__class__} not support flash mask.")

if training_args.sequence_parallel:
register_sequence_parallel_allreduce_hooks(
model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
)
if model_args.tokenizer_name_or_path is not None:
tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
else:
4 changes: 0 additions & 4 deletions llm/alignment/rm/flashmask/reward_argument.py
@@ -86,7 +86,3 @@ class ModelArgument:
default=1,
metadata={"help": "virtual_pp_degree"},
)
sequence_parallel: bool = field(
default=False,
metadata={"help": "whether to use sequence parallel"},
)
10 changes: 1 addition & 9 deletions llm/alignment/rm/flashmask/run_reward.py
@@ -35,11 +35,7 @@
get_last_checkpoint,
set_seed,
)
from paddlenlp.transformers import (
AutoConfig,
AutoTokenizer,
register_sequence_parallel_allreduce_hooks,
)
from paddlenlp.transformers import AutoConfig, AutoTokenizer
from paddlenlp.utils.log import logger


@@ -126,10 +122,6 @@ def main():
logger.warning("`flash_mask` must use with zero padding and flash attention.")
model.config.use_flash_attention = True

if model_args.sequence_parallel:
register_sequence_parallel_allreduce_hooks(
model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
)
if model_args.tokenizer_name_or_path is not None:
tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
else:
12 changes: 2 additions & 10 deletions llm/auto_parallel/gpt-3/run_pretrain_auto.py
@@ -225,14 +225,6 @@ class ModelArguments:
hidden_dropout_prob: float = field(default=0.1, metadata={"help": "The hidden dropout prob."})
attention_probs_dropout_prob: float = field(default=0.1, metadata={"help": "The attention hidden dropout prob."})

sequence_parallel: bool = field(
default=False,
metadata={"help": "whether to use sequence parallel"},
)
fuse_sequence_parallel_allreduce: bool = field(
default=False,
metadata={"help": "whether to use fuse sequence parallel allreduce"},
)
use_fused_rope: Optional[bool] = field(
default=False,
metadata={"help": "Enable rope fusion or not."},
@@ -502,8 +494,8 @@ def main():
config.fuse_attention_ffn = model_args.fuse_attention_ffn
config.recompute_granularity = model_args.recompute_granularity
config.virtual_pp_degree = model_args.virtual_pp_degree
config.sequence_parallel = model_args.sequence_parallel
config.fuse_sequence_parallel_allreduce = model_args.fuse_sequence_parallel_allreduce
config.sequence_parallel = training_args.sequence_parallel
config.fuse_sequence_parallel_allreduce = training_args.fuse_sequence_parallel_allreduce
config.use_fused_rope = model_args.use_fused_rope
config.no_recompute_layers = model_args.no_recompute_layers
config.pp_recompute_interval = model_args.pp_recompute_interval
12 changes: 2 additions & 10 deletions llm/auto_parallel/llama/run_pretrain_auto.py
@@ -221,14 +221,6 @@ class ModelArguments:
"help": "Pre-training from existing paddlenlp model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models."
},
)
sequence_parallel: bool = field(
default=False,
metadata={"help": "whether to use sequence parallel"},
)
fuse_sequence_parallel_allreduce: bool = field(
default=False,
metadata={"help": "whether to use fuse sequence parallel allreduce"},
)
use_fused_rope: Optional[bool] = field(
default=False,
metadata={"help": "Enable rope fusion or not."},
@@ -534,8 +526,8 @@ def main():
config.fuse_attention_ffn = model_args.fuse_attention_ffn
config.recompute_granularity = model_args.recompute_granularity
config.virtual_pp_degree = model_args.virtual_pp_degree
config.sequence_parallel = model_args.sequence_parallel
config.fuse_sequence_parallel_allreduce = model_args.fuse_sequence_parallel_allreduce
config.sequence_parallel = training_args.sequence_parallel
config.fuse_sequence_parallel_allreduce = training_args.fuse_sequence_parallel_allreduce
config.use_fused_rope = model_args.use_fused_rope
config.no_recompute_layers = model_args.no_recompute_layers
config.pp_recompute_interval = model_args.pp_recompute_interval
12 changes: 2 additions & 10 deletions llm/auto_parallel/qwen/run_pretrain_3D_auto.py
@@ -225,14 +225,6 @@ class ModelArguments:
"help": "Pre-training from existing paddlenlp model weights. Default False and model will train from scratch. If set True, the model_name_or_path argument must exist in the paddlenlp models."
},
)
sequence_parallel: bool = field(
default=False,
metadata={"help": "whether to use sequence parallel"},
)
fuse_sequence_parallel_allreduce: bool = field(
default=False,
metadata={"help": "whether to use fuse sequence parallel allreduce"},
)
use_fused_rope: Optional[bool] = field(
default=False,
metadata={"help": "Enable rope fusion or not."},
@@ -513,8 +505,8 @@ def main():
config.fuse_attention_ffn = model_args.fuse_attention_ffn
config.recompute_granularity = model_args.recompute_granularity
config.virtual_pp_degree = model_args.virtual_pp_degree
config.sequence_parallel = model_args.sequence_parallel
config.fuse_sequence_parallel_allreduce = model_args.fuse_sequence_parallel_allreduce
config.sequence_parallel = training_args.sequence_parallel
config.fuse_sequence_parallel_allreduce = training_args.fuse_sequence_parallel_allreduce
config.use_fused_rope = model_args.use_fused_rope
config.no_recompute_layers = model_args.no_recompute_layers
config.pp_recompute_interval = model_args.pp_recompute_interval
5 changes: 0 additions & 5 deletions llm/run_finetune.py
@@ -58,7 +58,6 @@
LlamaTokenizer,
Qwen2ForCausalLM,
Qwen2ForCausalLMPipe,
register_sequence_parallel_allreduce_hooks,
)
from paddlenlp.transformers.configuration_utils import LlmMetaConfig
from paddlenlp.trl import DataConfig, ModelConfig, SFTConfig, SFTTrainer
@@ -231,10 +230,6 @@ def neft_post_hook(module, input, output):
else:
raise NotImplementedError("Only support neftune for model with get_input_embeddings")

if training_args.sequence_parallel:
register_sequence_parallel_allreduce_hooks(
model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
)
# Load tokenizer & dataset
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, from_aistudio=model_args.from_aistudio)
reft_layers = None
6 changes: 0 additions & 6 deletions llm/run_pretrain.py
@@ -41,7 +41,6 @@
AutoTokenizer,
CosineAnnealingWithWarmupDecay,
LinearAnnealingWithWarmupDecay,
register_sequence_parallel_allreduce_hooks,
)
from paddlenlp.transformers.configuration_utils import LlmMetaConfig, llmmetaclass
from paddlenlp.utils.batch_sampler import DistributedBatchSampler
@@ -492,11 +491,6 @@ def main():
else:
model = model_class.from_config(config, dtype=dtype)

if training_args.sequence_parallel:
register_sequence_parallel_allreduce_hooks(
model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
)

if training_args.recompute:
model.recompute_enable()

5 changes: 0 additions & 5 deletions llm/run_quantization.py
@@ -41,7 +41,6 @@
LlamaTokenizer,
Qwen2ForCausalLM,
Qwen2ForCausalLMPipe,
register_sequence_parallel_allreduce_hooks,
)
from paddlenlp.transformers.configuration_utils import LlmMetaConfig
from paddlenlp.trl import DataConfig, ModelConfig, QuantConfig, SFTConfig, SFTTrainer
@@ -162,10 +161,6 @@ def main():
if model_args.flash_mask and not any(isinstance(model, cls) for cls in flash_mask_support_list):
raise NotImplementedError(f"{model.__class__} not support flash mask.")

if training_args.sequence_parallel:
register_sequence_parallel_allreduce_hooks(
model, training_args.gradient_accumulation_steps, training_args.fuse_sequence_parallel_allreduce
)
# Load tokenizer & dataset
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, from_aistudio=model_args.from_aistudio)
# init chat_template for tokenizer
11 changes: 11 additions & 0 deletions paddlenlp/trainer/trainer.py
@@ -87,6 +87,12 @@
from ..quantization.quantization_linear import QuantizationLinear
except:
QuantizationLinear = None
try:
from paddle.distributed.fleet.utils.sequence_parallel_utils import (
register_sequence_parallel_allreduce_hooks,
)
except:
pass

(Codecov / codecov/patch warning on line 95: added lines #L94-L95 in paddlenlp/trainer/trainer.py are not covered by tests.)
from ..transformers.context_parallel_utils import split_inputs_sequence_dim_load_balance
from ..transformers.model_utils import (
PretrainedModel,
@@ -428,6 +434,11 @@
"We do not support skip_save_model_weight in peft model when using unified checkpoint, remove this config."
)

if args.sequence_parallel:
register_sequence_parallel_allreduce_hooks(
self.model, args.gradient_accumulation_steps, args.fuse_sequence_parallel_allreduce
)

(Codecov / codecov/patch warning on line 438: added line #L438 in paddlenlp/trainer/trainer.py is not covered by tests.)

self.do_grad_scaling = False
self.enable_autocast_context_manager = False
if args.fp16 or args.bf16:
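The net effect of this trainer.py hunk, together with the deletions in the training scripts above, is that hook registration now happens once inside the Trainer. Below is a minimal sketch of the centralized logic, assuming the guarded Paddle import shown in the diff; the helper name is illustrative, not PaddleNLP API.

```python
# Minimal sketch of the centralized registration, mirroring the added hunk.
# Guarded import: older Paddle builds may not ship sequence_parallel_utils.
try:
    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
        register_sequence_parallel_allreduce_hooks,
    )
except ImportError:
    register_sequence_parallel_allreduce_hooks = None


def maybe_register_sp_hooks(model, args):
    """Register sequence-parallel allreduce hooks once, in Trainer.__init__,
    instead of in every script (run_dpo.py, run_kto.py, run_finetune.py, ...)."""
    if args.sequence_parallel and register_sequence_parallel_allreduce_hooks is not None:
        register_sequence_parallel_allreduce_hooks(
            model, args.gradient_accumulation_steps, args.fuse_sequence_parallel_allreduce
        )
```

Callers therefore only set the new TrainingArguments flags (passed the same way as other boolean training arguments, e.g. `--sequence_parallel true --fuse_sequence_parallel_allreduce true`); no script-level registration is needed.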
16 changes: 15 additions & 1 deletion paddlenlp/trainer/training_args.py
@@ -669,6 +669,13 @@
)
},
)
sequence_parallel: bool = field(
default=False,
metadata={"help": "Whether to enable sequence parallel."},
)
fuse_sequence_parallel_allreduce: bool = field(
default=False, metadata={"help": "Whether to use fuse sequence parallel allreduce."}
)
sequence_parallel_config: str = field(
default="",
metadata={
@@ -1209,10 +1216,17 @@
f"Found unknown pipeline mode config {x}, accpet config is disable_p2p_cache_shape, disable_partial_send_recv."
)

enable_partial_send_recv = "disable_partial_send_recv" not in pipeline_parallel_config
if self.sequence_parallel and enable_partial_send_recv:
logger.warning(
"When use pipeline parallel and sequence parallel simultaneously, we should turn off partial send recv."
)
enable_partial_send_recv = False

(Codecov / codecov/patch warning on lines 1219-1224: added lines #L1219-L1221 and #L1224 in paddlenlp/trainer/training_args.py are not covered by tests.)

strategy.pipeline_configs = {
"accumulate_steps": self.gradient_accumulation_steps,
"micro_batch_size": self.per_device_train_batch_size,
"enable_partial_send_recv": "disable_partial_send_recv" not in pipeline_parallel_config,
"enable_partial_send_recv": enable_partial_send_recv,
"p2p_cache_shape": False if "disable_p2p_cache_shape" in pipeline_parallel_config else True,
# "delay_scale_loss": True, Fix ME
}
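The interaction between the new `sequence_parallel` flag and `pipeline_parallel_config` can be summarized as a small pure function. The sketch below is an illustrative restatement of the logic added above; the function name is ours, not PaddleNLP's.

```python
# Self-contained restatement of the decision added to TrainingArguments.__post_init__.
def resolve_partial_send_recv(pipeline_parallel_config: str, sequence_parallel: bool) -> bool:
    """Return the value used for strategy.pipeline_configs["enable_partial_send_recv"]."""
    enable_partial_send_recv = "disable_partial_send_recv" not in pipeline_parallel_config
    if sequence_parallel and enable_partial_send_recv:
        # Pipeline parallel plus sequence parallel cannot use partial send/recv,
        # so the real code logs a warning and forces it off.
        enable_partial_send_recv = False
    return enable_partial_send_recv


# Quick checks of the three relevant cases.
assert resolve_partial_send_recv("", sequence_parallel=False) is True
assert resolve_partial_send_recv("", sequence_parallel=True) is False
assert resolve_partial_send_recv("disable_partial_send_recv", sequence_parallel=False) is False
```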