
Commit 8fcedfd

Update PEFT benchmarks (#6187)
* update
* updates
* update
* update
* commit
* wip
* updates
1 parent 94892fe commit 8fcedfd

File tree

- examples/benchmark/peft/README.md
- examples/benchmark/peft/paddle/benchmark.py (renamed from examples/benchmark/peft/paddle/train_nl2sql.py)
- examples/benchmark/peft/paddle/utils.py (new)
- examples/benchmark/peft/torch/benchmark.py (renamed from examples/benchmark/peft/torch/train_nl2sql.py)
- examples/benchmark/peft/torch/utils.py (new)

5 files changed: +116 −76 lines changed


examples/benchmark/peft/README.md

Lines changed: 8 additions & 11 deletions
@@ -1,18 +1,15 @@
 # Benchmark Results
 
-### Hardware and Environment
+### Configuration
 
 - Hardware: A100-80G with NVLink; the exact number of GPUs per run is listed in the table
-- Torch environment:see torch/requirements.txt
+- Torch environment: see torch/requirements.txt
+- Data: 10k examples from [Chinese-Vicuna/guanaco_belle_merge_v1.0](https://huggingface.co/datasets/Chinese-Vicuna/guanaco_belle_merge_v1.0)
 
 ### Bloom
 
-| Model | Method | Num GPUs | Batch Size | Paddle Setup | Paddle (s/epoch) | Torch Setup | Torch (s/epoch) | Delta |
-|---------------|----------|----------|------------|--------------|------------------|-------------|-----------------|-------|
-| Bloomz-7b1-mt | LoRA | 1 | 4 | fp16 O2 | 179 | fp16 | 219 | -18% |
-| Bloomz-7b1-mt | LoRA | 1 | 8 | FP16 O2 | 171 | fp16 | 197 | -13% |
-| Bloomz-7b1-mt | Finetune | 4 | 8 | fp16 O2 MP 4 | 133 | fp16 ZeRO 3 | 288 | -54% |
-| Bloomz-7b1-mt | Finetune | 4 | 16 | fp16 O2 MP 4 | 106 | fp16 ZeRO 3 | 150 | -29% |
-| Bloomz-7b1-mt | Finetune | 4 | 32 | fp16 O2 MP 4 | 85 | fp16 ZeRO 3 | 94 | -10% |
-
-* transformers' default half_precision_backend is `torch.cuda.amp`; the `fp16_opt_level` argument is not used
+| Model | Method | Num GPUs | Batch Size | Paddle Setup | Paddle Effective Tokens/s | Torch Setup | Torch Effective Tokens/s | Speedup |
+|---------------|----------|----------|------------|--------------|---------------------------|-------------|--------------------------|---------|
+| Bloomz-7b1-mt | LoRA | 1 | 4 | fp16 O2 | 2293.46 | fp16 | 1736.92 | +32% |
+| Bloomz-7b1-mt | Finetune | 4 | 8 | fp16 O2 MP 4 | 2873.13 | fp16 ZeRO 3 | 1634.58 | +76% |
+| Bloomz-7b1-mt | Finetune | 4 | 16 | fp16 O2 MP 4 | 2853.83 | fp16 ZeRO 3 | 2694.64 | +6% |

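Note: the new Speedup column is consistent with the ratio of the two effective-token throughputs. A quick sanity check of the three rows (illustrative snippet, not part of the commit):

```python
# Speedup = Paddle effective tokens/s divided by Torch effective tokens/s, minus one.
rows = [
    ("LoRA, 1 GPU, bs 4", 2293.46, 1736.92),       # table says +32%
    ("Finetune, 4 GPUs, bs 8", 2873.13, 1634.58),  # table says +76%
    ("Finetune, 4 GPUs, bs 16", 2853.83, 2694.64), # table says +6%
]
for name, paddle_tps, torch_tps in rows:
    print(f"{name}: {100 * (paddle_tps / torch_tps - 1):+.0f}%")
```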
examples/benchmark/peft/paddle/train_nl2sql.py renamed to examples/benchmark/peft/paddle/benchmark.py

Lines changed: 35 additions & 36 deletions
@@ -16,34 +16,32 @@
 from typing import Optional
 
 from datasets import load_dataset
+from utils import CustomTrainer
 
 from paddlenlp.data import DataCollatorForSeq2Seq
 from paddlenlp.peft import LoRAConfig, LoRAModel
-from paddlenlp.trainer import PdArgumentParser, Trainer, TrainingArguments
+from paddlenlp.trainer import PdArgumentParser, TrainingArguments
 from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer
 
 """
 Single GPU
-python train_nl2sql.py --model_name_or_path bigscience/bloomz-7b1-mt \
-    --train_file nl2sql/dev.jsonl --validation_file nl2sql/dev.jsonl \
+python benchmark.py --model_name_or_path bigscience/bloomz-7b1-mt \
     --num_train_epochs 1 --per_device_train_batch_size 4 \
-    --evaluation_strategy epoch --save_strategy epoch \
-    --fp16 --fp16_opt_level O2 \
+    --evaluation_strategy no --save_strategy no \
+    --fp16 --fp16_opt_level O2 --lora \
     --logging_steps 50 --output_dir outputs
 
-Multi-GPU MP
-python train_nl2sql.py --model_name_or_path bigscience/bloomz-7b1-mt \
-    --train_file nl2sql/dev.jsonl --validation_file nl2sql/dev.jsonl \
-    --num_train_epochs 1 --per_device_train_batch_size 16 \
-    --evaluation_strategy epoch --save_strategy epoch \
-    --fp16 --fp16_opt_level O2 \
+Multi-GPU MP
+python -m paddle.distributed.launch --gpus "0,1,2,3" benchmark.py --model_name_or_path bigscience/bloomz-7b1-mt \
+    --num_train_epochs 1 --per_device_train_batch_size 8 \
+    --evaluation_strategy no --save_strategy no \
+    --fp16 --fp16_opt_level O2 --tensor_parallel_degree 4 \
     --logging_steps 50 --output_dir outputs
 
-Multi-GPU sharding stage 3
-python -m paddle.distributed.launch --gpus "0,1,2,3" train_nl2sql.py --model_name_or_path bigscience/bloomz-7b1-mt \
-    --train_file nl2sql/dev.jsonl --validation_file nl2sql/dev.jsonl \
+Multi-GPU sharding stage 3
+python -m paddle.distributed.launch --gpus "0,1,2,3" benchmark.py --model_name_or_path bigscience/bloomz-7b1-mt \
     --num_train_epochs 1 --per_device_train_batch_size 4 \
-    --evaluation_strategy epoch --save_strategy epoch \
+    --evaluation_strategy no --save_strategy no \
     --fp16 --fp16_opt_level O2 \
     --sharding "stage3" --sharding_parallel_degree 4 \
     --logging_steps 50 --output_dir outputs
@@ -60,19 +58,9 @@ class ModelArguments:
     lora: Optional[bool] = field(default=False, metadata={"help": "whether to use LoRA"})
 
 
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-    """
-
-    train_file: str = field(default=None, metadata={"help": "The input training data file (a text file)."})
-    validation_file: str = field(default=None, metadata={"help": "The input evaluation data file (a text file)."})
-
-
 def main():
-    parser = PdArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
-    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    parser = PdArgumentParser((ModelArguments, TrainingArguments))
+    model_args, training_args = parser.parse_args_into_dataclasses()
 
     # Set the dtype for loading model
     dtype = None
@@ -83,10 +71,13 @@ def main():
         dtype = "bfloat16"
 
     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+    if "llama" in model_args.model_name_or_path:
+        tokenizer.pad_token = tokenizer.unk_token
     model = AutoModelForCausalLM.from_pretrained(
         model_args.model_name_or_path,
         load_state_as_np=True,
         low_cpu_mem_usage=True,
+        # use_flash_attention=True,
         dtype=dtype,
         tensor_parallel_degree=training_args.tensor_parallel_degree,
         tensor_parallel_rank=training_args.tensor_parallel_rank,
@@ -105,9 +96,9 @@ def main():
         model.mark_only_lora_as_trainable()
         model.print_trainable_parameters()
 
-    def preprocess_function(example, max_src_length=512, max_tgt_length=256):
-        inputs = example["src"][0]
-        targets = example["tgt"][0]
+    def preprocess_function(example, max_src_length=512, max_tgt_length=512):
+        inputs = example["instruction"]
+        targets = example["output"]
         model_inputs = tokenizer(inputs, max_length=max_src_length, truncation=True, return_attention_mask=False)
         labels = tokenizer(targets, max_length=max_tgt_length, truncation=True, return_attention_mask=False)
         labels_input_ids = labels["input_ids"] + [tokenizer.eos_token_id]
@@ -116,17 +107,25 @@ def preprocess_function(example, max_src_length=512, max_tgt_length=256):
 
         return model_inputs
 
-    dataset = load_dataset("json", data_files={"train": data_args.train_file, "dev": data_args.validation_file})
-    dataset = dataset.map(lambda example: preprocess_function(example))
+    dataset = load_dataset("Chinese-Vicuna/guanaco_belle_merge_v1.0")
+    # select first 10k examples for benchmarking
+    dataset = dataset["train"].select(range(10000))
+    dataset = dataset.map(
+        lambda example: preprocess_function(example), remove_columns=["instruction", "input", "output"]
+    )
+    total_effective_tokens = sum([len(i["input_ids"]) for i in dataset]) * training_args.num_train_epochs
 
-    trainer = Trainer(
+    trainer = CustomTrainer(
         model=model,
-        train_dataset=dataset["train"],
-        eval_dataset=dataset["dev"],
+        train_dataset=dataset,
         args=training_args,
         data_collator=DataCollatorForSeq2Seq(return_tensors="pd", tokenizer=tokenizer),
     )
-    trainer.train()
+    train_metrics = trainer.train()
+    tokens_per_second = trainer.total_observed_tokens / train_metrics.metrics["train_runtime"]
+    effective_tokens_per_second = total_effective_tokens / train_metrics.metrics["train_runtime"]
+    print(f"Tokens per second: {tokens_per_second:.2f}")
+    print(f"Effective Tokens per second: {effective_tokens_per_second:.2f}")
 
 
 if __name__ == "__main__":
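Note: both benchmark.py scripts report the same headline metric. As a minimal restatement (the helper name below is ours, not from the commit), the "effective" throughput divides the non-padding tokens produced by preprocessing by the trainer's reported train_runtime:

```python
# Hypothetical helper isolating the metric computed inline in benchmark.py:
# "effective" tokens are the tokenized lengths after preprocessing (no padding),
# so the number is independent of how the data collator pads each batch.
def effective_tokens_per_second(dataset, num_train_epochs, train_runtime):
    total_effective_tokens = sum(len(ex["input_ids"]) for ex in dataset) * num_train_epochs
    return total_effective_tokens / train_runtime
```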
examples/benchmark/peft/paddle/utils.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlenlp.trainer import Trainer
+
+
+class CustomTrainer(Trainer):
+    total_observed_tokens = 0.0
+
+    def training_step(self, model, inputs):
+        input_ids = inputs["input_ids"]
+        self.total_observed_tokens += float(input_ids.shape[0] * input_ids.shape[1])
+        return super().training_step(model, inputs)
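Note: CustomTrainer counts every position of each collated batch (batch size × padded sequence length), so the printed "Tokens per second" includes padding, while "Effective Tokens per second" does not. A toy example of the gap, with made-up sequence lengths:

```python
# Illustrative numbers only: one batch of 4 sequences with real lengths 10, 20, 30, 40,
# padded by the collator to the batch maximum (40).
lengths = [10, 20, 30, 40]
effective = sum(lengths)                 # 100 tokens that carry data
observed = len(lengths) * max(lengths)   # 160 positions seen by the model, padding included
print(observed - effective)              # 60 padding tokens counted only by CustomTrainer
```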

examples/benchmark/peft/torch/train_nl2sql.py renamed to examples/benchmark/peft/torch/benchmark.py

Lines changed: 25 additions & 29 deletions
@@ -22,25 +22,23 @@
     AutoTokenizer,
     DataCollatorForSeq2Seq,
     HfArgumentParser,
-    Trainer,
     TrainingArguments,
 )
+from utils import CustomTrainer
 
 """
 Single GPU
-python train_nl2sql.py --model_name_or_path bigscience/bloomz-7b1-mt \
-    --train_file nl2sql/dev.jsonl --validation_file nl2sql/dev.jsonl \
+python benchmark.py --model_name_or_path bigscience/bloomz-7b1-mt \
     --num_train_epochs 1 --per_device_train_batch_size 4 \
-    --evaluation_strategy epoch --save_strategy epoch \
-    --fp16 \
+    --evaluation_strategy no --save_strategy no \
+    --fp16 --lora \
     --logging_steps 50 --output_dir outputs
 
 Multi-GPU deepspeed zero3
-python -m torch.distributed.run --nproc_per_node=4 train_nl2sql.py --deepspeed ds_config.json \
+python -m torch.distributed.run --nproc_per_node=4 benchmark.py --deepspeed ds_config.json \
     --model_name_or_path bigscience/bloomz-7b1-mt \
-    --train_file nl2sql/dev.jsonl --validation_file nl2sql/dev.jsonl \
     --num_train_epochs 1 --per_device_train_batch_size 2 \
-    --evaluation_strategy epoch --save_strategy epoch \
+    --evaluation_strategy no --save_strategy no \
     --fp16 \
     --logging_steps 50 --output_dir outputs
 """
@@ -56,19 +54,9 @@ class ModelArguments:
     lora: Optional[bool] = field(default=False, metadata={"help": "whether to use LoRA"})
 
 
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-    """
-
-    train_file: str = field(default=None, metadata={"help": "The input training data file (a text file)."})
-    validation_file: str = field(default=None, metadata={"help": "The input evaluation data file (a text file)."})
-
-
 def main():
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
-    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    parser = HfArgumentParser((ModelArguments, TrainingArguments))
+    model_args, training_args = parser.parse_args_into_dataclasses()
     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
     model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path)
 
@@ -80,9 +68,9 @@ def main():
         model = get_peft_model(model, peft_config)
         model.print_trainable_parameters()
 
-    def preprocess_function(example, max_src_length=512, max_tgt_length=256):
-        inputs = example["src"][0]
-        targets = example["tgt"][0]
+    def preprocess_function(example, max_src_length=512, max_tgt_length=512):
+        inputs = example["instruction"]
+        targets = example["output"]
         model_inputs = tokenizer(inputs, max_length=max_src_length, truncation=True, return_attention_mask=False)
         labels = tokenizer(targets, max_length=max_tgt_length, truncation=True, return_attention_mask=False)
         labels_input_ids = labels["input_ids"] + [tokenizer.eos_token_id]
@@ -91,18 +79,26 @@ def preprocess_function(example, max_src_length=512, max_tgt_length=256):
 
         return model_inputs
 
-    dataset = load_dataset("json", data_files={"train": data_args.train_file, "dev": data_args.validation_file})
-    dataset = dataset.map(lambda example: preprocess_function(example))
+    dataset = load_dataset("Chinese-Vicuna/guanaco_belle_merge_v1.0")
+    # select first 10k examples for benchmarking
+    dataset = dataset["train"].select(range(10000))
+    dataset = dataset.map(
+        lambda example: preprocess_function(example), remove_columns=["instruction", "input", "output"]
+    )
+    total_effective_tokens = sum([len(i["input_ids"]) for i in dataset]) * training_args.num_train_epochs
 
-    trainer = Trainer(
+    trainer = CustomTrainer(
         model=model,
-        train_dataset=dataset["train"],
-        eval_dataset=dataset["dev"],
+        train_dataset=dataset,
         args=training_args,
         data_collator=DataCollatorForSeq2Seq(return_tensors="pt", tokenizer=tokenizer),
     )
     model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
-    trainer.train()
+    train_metrics = trainer.train()
+    tokens_per_second = trainer.total_observed_tokens / train_metrics.metrics["train_runtime"]
+    effective_tokens_per_second = total_effective_tokens / train_metrics.metrics["train_runtime"]
+    print(f"Tokens per second: {tokens_per_second:.2f}")
+    print(f"Effective Tokens per second: {effective_tokens_per_second:.2f}")
 
 
 if __name__ == "__main__":
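Note: the ZeRO-3 launch command above reads ds_config.json, which is not part of this diff. A minimal stage-3 configuration of the kind commonly paired with the Hugging Face Trainer might look like the sketch below (an illustrative assumption only; the file actually used for the benchmark may set additional options):

```python
# Illustrative only: write a minimal DeepSpeed ZeRO stage-3 config; "auto" lets the
# HF Trainer fill in batch-size and fp16 settings from its own TrainingArguments.
import json

ds_config = {
    "fp16": {"enabled": "auto"},
    "zero_optimization": {"stage": 3},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "train_batch_size": "auto",
}

with open("ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)
```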
examples/benchmark/peft/torch/utils.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers import Trainer
+
+
+class CustomTrainer(Trainer):
+    total_observed_tokens = 0.0
+
+    def training_step(self, model, inputs):
+        input_ids = inputs["input_ids"]
+        self.total_observed_tokens += float(input_ids.shape[0] * input_ids.shape[1])
+        return super().training_step(model, inputs)
