
Commit 8fcedfd

Update PEFT benchmarks (#6187)
* update
* updates
* update
* update
* commit
* wip
* updates
1 parent 94892fe commit 8fcedfd

File tree

- examples/benchmark/peft/README.md
- examples/benchmark/peft/paddle/benchmark.py (renamed from examples/benchmark/peft/paddle/train_nl2sql.py)
- examples/benchmark/peft/paddle/utils.py (new)
- examples/benchmark/peft/torch/benchmark.py (renamed from examples/benchmark/peft/torch/train_nl2sql.py)
- examples/benchmark/peft/torch/utils.py (new)

5 files changed: +116 −76 lines changed


examples/benchmark/peft/README.md

Lines changed: 8 additions & 11 deletions
@@ -1,18 +1,15 @@
 # Benchmark Results
 
-### Hardware and Environment
+### Configuration
 
 - Hardware: A100-80G with NVLink; the exact number of GPUs per run is listed in the table
-- Torch environment:see torch/requirements.txt
+- Torch environment: see torch/requirements.txt
+- Data: 10k examples from [Chinese-Vicuna/guanaco_belle_merge_v1.0](https://huggingface.co/datasets/Chinese-Vicuna/guanaco_belle_merge_v1.0)
 
 ### Bloom
 
-| Model | Method | Num GPUs | Batch Size | Paddle Setup | Paddle (s/epoch) | Torch Setup | Torch (s/epoch) | Delta |
-|---------------|----------|----------|------------|--------------|------------------|-------------|-----------------|-------|
-| Bloomz-7b1-mt | LoRA | 1 | 4 | fp16 O2 | 179 | fp16 | 219 | -18% |
-| Bloomz-7b1-mt | LoRA | 1 | 8 | FP16 O2 | 171 | fp16 | 197 | -13% |
-| Bloomz-7b1-mt | Finetune | 4 | 8 | fp16 O2 MP 4 | 133 | fp16 ZeRO 3 | 288 | -54% |
-| Bloomz-7b1-mt | Finetune | 4 | 16 | fp16 O2 MP 4 | 106 | fp16 ZeRO 3 | 150 | -29% |
-| Bloomz-7b1-mt | Finetune | 4 | 32 | fp16 O2 MP 4 | 85 | fp16 ZeRO 3 | 94 | -10% |
-
-* transformers' default half_precision_backend is `torch.cuda.amp`; the `fp16_opt_level` argument is not used
+| Model | Method | Num GPUs | Batch Size | Paddle Setup | Paddle Effective Tokens/s | Torch Setup | Torch Effective Tokens/s | Speedup |
+|---------------|----------|----------|------------|--------------|---------------------------|-------------|--------------------------|---------|
+| Bloomz-7b1-mt | LoRA | 1 | 4 | fp16 O2 | 2293.46 | fp16 | 1736.92 | +32% |
+| Bloomz-7b1-mt | Finetune | 4 | 8 | fp16 O2 MP 4 | 2873.13 | fp16 ZeRO 3 | 1634.58 | +76% |
+| Bloomz-7b1-mt | Finetune | 4 | 16 | fp16 O2 MP 4 | 2853.83 | fp16 ZeRO 3 | 2694.64 | +6% |

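Note: the new Speedup column is consistent with the ratio of the two effective-token throughputs. A quick sanity check of the three rows (illustrative snippet, not part of the commit):

```python
# Speedup = Paddle effective tokens/s divided by Torch effective tokens/s, minus one.
rows = [
    ("LoRA, 1 GPU, bs 4", 2293.46, 1736.92),       # table says +32%
    ("Finetune, 4 GPUs, bs 8", 2873.13, 1634.58),  # table says +76%
    ("Finetune, 4 GPUs, bs 16", 2853.83, 2694.64), # table says +6%
]
for name, paddle_tps, torch_tps in rows:
    print(f"{name}: {100 * (paddle_tps / torch_tps - 1):+.0f}%")
```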
examples/benchmark/peft/paddle/train_nl2sql.py renamed to examples/benchmark/peft/paddle/benchmark.py

Lines changed: 35 additions & 36 deletions
@@ -16,34 +16,32 @@
 from typing import Optional
 
 from datasets import load_dataset
+from utils import CustomTrainer
 
 from paddlenlp.data import DataCollatorForSeq2Seq
 from paddlenlp.peft import LoRAConfig, LoRAModel
-from paddlenlp.trainer import PdArgumentParser, Trainer, TrainingArguments
+from paddlenlp.trainer import PdArgumentParser, TrainingArguments
 from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer
 
 """
 Single GPU
-python train_nl2sql.py --model_name_or_path bigscience/bloomz-7b1-mt \
-    --train_file nl2sql/dev.jsonl --validation_file nl2sql/dev.jsonl \
+python benchmark.py --model_name_or_path bigscience/bloomz-7b1-mt \
     --num_train_epochs 1 --per_device_train_batch_size 4 \
-    --evaluation_strategy epoch --save_strategy epoch \
-    --fp16 --fp16_opt_level O2 \
+    --evaluation_strategy no --save_strategy no \
+    --fp16 --fp16_opt_level O2 --lora \
     --logging_steps 50 --output_dir outputs
 
-Multi-GPU MP
-python train_nl2sql.py --model_name_or_path bigscience/bloomz-7b1-mt \
-    --train_file nl2sql/dev.jsonl --validation_file nl2sql/dev.jsonl \
-    --num_train_epochs 1 --per_device_train_batch_size 16 \
-    --evaluation_strategy epoch --save_strategy epoch \
-    --fp16 --fp16_opt_level O2 \
+Multi-GPU MP
+python -m paddle.distributed.launch --gpus "0,1,2,3" benchmark.py --model_name_or_path bigscience/bloomz-7b1-mt \
+    --num_train_epochs 1 --per_device_train_batch_size 8 \
+    --evaluation_strategy no --save_strategy no \
+    --fp16 --fp16_opt_level O2 --tensor_parallel_degree 4 \
     --logging_steps 50 --output_dir outputs
 
-Multi-GPU sharding stage 3
-python -m paddle.distributed.launch --gpus "0,1,2,3" train_nl2sql.py --model_name_or_path bigscience/bloomz-7b1-mt \
-    --train_file nl2sql/dev.jsonl --validation_file nl2sql/dev.jsonl \
+Multi-GPU sharding stage 3
+python -m paddle.distributed.launch --gpus "0,1,2,3" benchmark.py --model_name_or_path bigscience/bloomz-7b1-mt \
     --num_train_epochs 1 --per_device_train_batch_size 4 \
-    --evaluation_strategy epoch --save_strategy epoch \
+    --evaluation_strategy no --save_strategy no \
     --fp16 --fp16_opt_level O2 \
     --sharding "stage3" --sharding_parallel_degree 4 \
     --logging_steps 50 --output_dir outputs
@@ -60,19 +58,9 @@ class ModelArguments:
     lora: Optional[bool] = field(default=False, metadata={"help": "whether to use LoRA"})
 
 
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-    """
-
-    train_file: str = field(default=None, metadata={"help": "The input training data file (a text file)."})
-    validation_file: str = field(default=None, metadata={"help": "The input evaluation data file (a text file)."})
-
-
 def main():
-    parser = PdArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
-    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    parser = PdArgumentParser((ModelArguments, TrainingArguments))
+    model_args, training_args = parser.parse_args_into_dataclasses()
 
     # Set the dtype for loading model
     dtype = None
@@ -83,10 +71,13 @@ def main():
         dtype = "bfloat16"
 
     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+    if "llama" in model_args.model_name_or_path:
+        tokenizer.pad_token = tokenizer.unk_token
     model = AutoModelForCausalLM.from_pretrained(
         model_args.model_name_or_path,
         load_state_as_np=True,
         low_cpu_mem_usage=True,
+        # use_flash_attention=True,
         dtype=dtype,
         tensor_parallel_degree=training_args.tensor_parallel_degree,
         tensor_parallel_rank=training_args.tensor_parallel_rank,
@@ -105,9 +96,9 @@ def main():
         model.mark_only_lora_as_trainable()
         model.print_trainable_parameters()
 
-    def preprocess_function(example, max_src_length=512, max_tgt_length=256):
-        inputs = example["src"][0]
-        targets = example["tgt"][0]
+    def preprocess_function(example, max_src_length=512, max_tgt_length=512):
+        inputs = example["instruction"]
+        targets = example["output"]
         model_inputs = tokenizer(inputs, max_length=max_src_length, truncation=True, return_attention_mask=False)
         labels = tokenizer(targets, max_length=max_tgt_length, truncation=True, return_attention_mask=False)
         labels_input_ids = labels["input_ids"] + [tokenizer.eos_token_id]
@@ -116,17 +107,25 @@ def preprocess_function(example, max_src_length=512, max_tgt_length=256):
 
         return model_inputs
 
-    dataset = load_dataset("json", data_files={"train": data_args.train_file, "dev": data_args.validation_file})
-    dataset = dataset.map(lambda example: preprocess_function(example))
+    dataset = load_dataset("Chinese-Vicuna/guanaco_belle_merge_v1.0")
+    # select first 10k examples for benchmarking
+    dataset = dataset["train"].select(range(10000))
+    dataset = dataset.map(
+        lambda example: preprocess_function(example), remove_columns=["instruction", "input", "output"]
+    )
+    total_effective_tokens = sum([len(i["input_ids"]) for i in dataset]) * training_args.num_train_epochs
 
-    trainer = Trainer(
+    trainer = CustomTrainer(
         model=model,
-        train_dataset=dataset["train"],
-        eval_dataset=dataset["dev"],
+        train_dataset=dataset,
         args=training_args,
         data_collator=DataCollatorForSeq2Seq(return_tensors="pd", tokenizer=tokenizer),
     )
-    trainer.train()
+    train_metrics = trainer.train()
+    tokens_per_second = trainer.total_observed_tokens / train_metrics.metrics["train_runtime"]
+    effective_tokens_per_second = total_effective_tokens / train_metrics.metrics["train_runtime"]
+    print(f"Tokens per second: {tokens_per_second:.2f}")
+    print(f"Effective Tokens per second: {effective_tokens_per_second:.2f}")
 
 
 if __name__ == "__main__":
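Note: both benchmark.py scripts report the same headline metric. As a minimal restatement (the helper name below is ours, not from the commit), the "effective" throughput divides the non-padding tokens produced by preprocessing by the trainer's reported train_runtime:

```python
# Hypothetical helper isolating the metric computed inline in benchmark.py:
# "effective" tokens are the tokenized lengths after preprocessing (no padding),
# so the number is independent of how the data collator pads each batch.
def effective_tokens_per_second(dataset, num_train_epochs, train_runtime):
    total_effective_tokens = sum(len(ex["input_ids"]) for ex in dataset) * num_train_epochs
    return total_effective_tokens / train_runtime
```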
examples/benchmark/peft/paddle/utils.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlenlp.trainer import Trainer
+
+
+class CustomTrainer(Trainer):
+    total_observed_tokens = 0.0
+
+    def training_step(self, model, inputs):
+        input_ids = inputs["input_ids"]
+        self.total_observed_tokens += float(input_ids.shape[0] * input_ids.shape[1])
+        return super().training_step(model, inputs)
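Note: CustomTrainer counts every position of each collated batch (batch size × padded sequence length), so the printed "Tokens per second" includes padding, while "Effective Tokens per second" does not. A toy example of the gap, with made-up sequence lengths:

```python
# Illustrative numbers only: one batch of 4 sequences with real lengths 10, 20, 30, 40,
# padded by the collator to the batch maximum (40).
lengths = [10, 20, 30, 40]
effective = sum(lengths)                 # 100 tokens that carry data
observed = len(lengths) * max(lengths)   # 160 positions seen by the model, padding included
print(observed - effective)              # 60 padding tokens counted only by CustomTrainer
```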

examples/benchmark/peft/torch/train_nl2sql.py renamed to examples/benchmark/peft/torch/benchmark.py

Lines changed: 25 additions & 29 deletions
@@ -22,25 +22,23 @@
     AutoTokenizer,
     DataCollatorForSeq2Seq,
     HfArgumentParser,
-    Trainer,
     TrainingArguments,
 )
+from utils import CustomTrainer
 
 """
 Single GPU
-python train_nl2sql.py --model_name_or_path bigscience/bloomz-7b1-mt \
-    --train_file nl2sql/dev.jsonl --validation_file nl2sql/dev.jsonl \
+python benchmark.py --model_name_or_path bigscience/bloomz-7b1-mt \
     --num_train_epochs 1 --per_device_train_batch_size 4 \
-    --evaluation_strategy epoch --save_strategy epoch \
-    --fp16 \
+    --evaluation_strategy no --save_strategy no \
+    --fp16 --lora \
     --logging_steps 50 --output_dir outputs
 
 Multi-GPU deepspeed zero3
-python -m torch.distributed.run --nproc_per_node=4 train_nl2sql.py --deepspeed ds_config.json \
+python -m torch.distributed.run --nproc_per_node=4 benchmark.py --deepspeed ds_config.json \
     --model_name_or_path bigscience/bloomz-7b1-mt \
-    --train_file nl2sql/dev.jsonl --validation_file nl2sql/dev.jsonl \
     --num_train_epochs 1 --per_device_train_batch_size 2 \
-    --evaluation_strategy epoch --save_strategy epoch \
+    --evaluation_strategy no --save_strategy no \
     --fp16 \
     --logging_steps 50 --output_dir outputs
 """
@@ -56,19 +54,9 @@ class ModelArguments:
     lora: Optional[bool] = field(default=False, metadata={"help": "whether to use LoRA"})
 
 
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-    """
-
-    train_file: str = field(default=None, metadata={"help": "The input training data file (a text file)."})
-    validation_file: str = field(default=None, metadata={"help": "The input evaluation data file (a text file)."})
-
-
 def main():
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
-    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    parser = HfArgumentParser((ModelArguments, TrainingArguments))
+    model_args, training_args = parser.parse_args_into_dataclasses()
     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
     model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path)
 
@@ -80,9 +68,9 @@ def main():
         model = get_peft_model(model, peft_config)
         model.print_trainable_parameters()
 
-    def preprocess_function(example, max_src_length=512, max_tgt_length=256):
-        inputs = example["src"][0]
-        targets = example["tgt"][0]
+    def preprocess_function(example, max_src_length=512, max_tgt_length=512):
+        inputs = example["instruction"]
+        targets = example["output"]
         model_inputs = tokenizer(inputs, max_length=max_src_length, truncation=True, return_attention_mask=False)
         labels = tokenizer(targets, max_length=max_tgt_length, truncation=True, return_attention_mask=False)
         labels_input_ids = labels["input_ids"] + [tokenizer.eos_token_id]
@@ -91,18 +79,26 @@ def preprocess_function(example, max_src_length=512, max_tgt_length=256):
 
         return model_inputs
 
-    dataset = load_dataset("json", data_files={"train": data_args.train_file, "dev": data_args.validation_file})
-    dataset = dataset.map(lambda example: preprocess_function(example))
+    dataset = load_dataset("Chinese-Vicuna/guanaco_belle_merge_v1.0")
+    # select first 10k examples for benchmarking
+    dataset = dataset["train"].select(range(10000))
+    dataset = dataset.map(
+        lambda example: preprocess_function(example), remove_columns=["instruction", "input", "output"]
+    )
+    total_effective_tokens = sum([len(i["input_ids"]) for i in dataset]) * training_args.num_train_epochs
 
-    trainer = Trainer(
+    trainer = CustomTrainer(
         model=model,
-        train_dataset=dataset["train"],
-        eval_dataset=dataset["dev"],
+        train_dataset=dataset,
         args=training_args,
         data_collator=DataCollatorForSeq2Seq(return_tensors="pt", tokenizer=tokenizer),
     )
     model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
-    trainer.train()
+    train_metrics = trainer.train()
+    tokens_per_second = trainer.total_observed_tokens / train_metrics.metrics["train_runtime"]
+    effective_tokens_per_second = total_effective_tokens / train_metrics.metrics["train_runtime"]
+    print(f"Tokens per second: {tokens_per_second:.2f}")
+    print(f"Effective Tokens per second: {effective_tokens_per_second:.2f}")
 
 
 if __name__ == "__main__":
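Note: the ZeRO-3 launch command above reads ds_config.json, which is not part of this diff. A minimal stage-3 configuration of the kind commonly paired with the Hugging Face Trainer might look like the sketch below (an illustrative assumption only; the file actually used for the benchmark may set additional options):

```python
# Illustrative only: write a minimal DeepSpeed ZeRO stage-3 config; "auto" lets the
# HF Trainer fill in batch-size and fp16 settings from its own TrainingArguments.
import json

ds_config = {
    "fp16": {"enabled": "auto"},
    "zero_optimization": {"stage": 3},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "train_batch_size": "auto",
}

with open("ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)
```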
examples/benchmark/peft/torch/utils.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers import Trainer
+
+
+class CustomTrainer(Trainer):
+    total_observed_tokens = 0.0
+
+    def training_step(self, model, inputs):
+        input_ids = inputs["input_ids"]
+        self.total_observed_tokens += float(input_ids.shape[0] * input_ids.shape[1])
+        return super().training_step(model, inputs)
