
Commit 7e1834e

support train_dataset_mix_ds using custom_local_path (#582)
1 parent: fc804dc
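In short, after this commit `train_dataset_mix_ds` accepts local file paths alongside registered dataset names; local entries are registered on the fly as a custom mixture dataset. A minimal sketch of the intended usage, assuming the dataset name and the local JSONL path shown here are illustrative placeholders:

```python
from swift.llm import SftArguments, sft_main

# Mix 2x the training-set size of general-knowledge data into training.
# 'ms-bench' is a registered dataset name; the .jsonl entry is a local
# file, which this commit now registers on the fly ('_custom_mixture').
sft_args = SftArguments(
    model_type='qwen-7b-chat',
    dataset=['blossom-math-zh'],  # illustrative dataset choice
    train_dataset_mix_ds=['ms-bench', '/path/to/general_knowledge.jsonl'],
    train_dataset_mix_ratio=2.0)
output = sft_main(sft_args)
```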

File tree

4 files changed: +80 −53 lines

docs/source/LLM/命令行参数.md
swift/llm/sft.py
swift/llm/utils/argument.py
tests/llm/test_run.py

docs/source/LLM/命令行参数.md

Lines changed: 4 additions & 4 deletions

@@ -58,11 +58,11 @@
 - `--deepspeed`: Specifies the path to a deepspeed configuration file, or the configuration passed directly in JSON format. Defaults to `None`, i.e. deepspeed is not enabled. deepspeed can reduce GPU memory usage. We provide a default [ZeRO-2 configuration file](https://github.com/modelscope/swift/blob/main/swift/llm/ds_config/zero2.json) and [ZeRO-3 configuration file](https://github.com/modelscope/swift/blob/main/swift/llm/ds_config/zero3.json): simply pass 'default-zero2' to use the default ZeRO-2 configuration, or 'default-zero3' to use the default ZeRO-3 configuration.
 - `--batch_size`: Batch size during training. Defaults to `1`. Increasing the batch size improves GPU utilization but does not necessarily speed up training, because within a batch shorter sentences are padded to the length of the longest sentence in that batch, introducing wasted computation.
 - `--eval_batch_size`: Batch size during evaluation. Defaults to `None`, i.e. set to 1 when `predict_with_generate` is True, and to `batch_size` when it is False.
-- `--num_train_epochs`: Number of training epochs. Defaults to `1`. If `max_steps >= 0`, it overrides `num_train_epochs`. Usually set to 3 ~ 5.
+- `--num_train_epochs`: Number of training epochs. Defaults to `1`. If `max_steps >= 0`, it overrides `num_train_epochs`. You can set it to 3, 5, 10, etc.
 - `--max_steps`: Maximum number of training steps. Defaults to `-1`. If `max_steps >= 0`, it overrides `num_train_epochs`.
 - `--optim`: Defaults to `'adamw_torch'`.
 - `--learning_rate`: Defaults to `None`, i.e. set to 1e-4 if `sft_type` is lora, and to 1e-5 if `sft_type` is full.
-- `--weight_decay`: Defaults to `0.01`. `0.1` or `0.01` is recommended.
+- `--weight_decay`: Defaults to `0.1`.
 - `--gradient_accumulation_steps`: Gradient accumulation. Defaults to `None`, i.e. set to `math.ceil(16 / self.batch_size / world_size)`. `total_batch_size = batch_size * gradient_accumulation_steps * world_size`.
 - `--max_grad_norm`: Gradient clipping. Defaults to `0.5`.
 - `--predict_with_generate`: Whether to evaluate with generation. Defaults to `False`. If False, evaluation uses `loss`; if True, it uses metrics such as `ROUGE-L`. Generative evaluation is very time-consuming, so choose carefully.
@@ -100,8 +100,8 @@
 - `--repetition_penalty`: Defaults to `1.`. Only takes effect when `predict_with_generate` is True.
 - `--num_beams`: Defaults to `1`. Only takes effect when `predict_with_generate` is True.
 - `--gpu_memory_fraction`: Defaults to `None`. Runs training with the maximum usable fraction of GPU memory capped at the given value; intended for stress testing.
-- `--train_dataset_mix_ratio`: Defaults to `0`. Defines how datasets are mixed for training. When this argument is specified, the training set is mixed with the general-knowledge dataset specified by `train_dataset_mix_ds` at a multiple of `train_dataset_mix_ratio`, so that the overall dataset length reaches `train_dataset_sample`.
-- `--train_dataset_mix_ds`: Defaults to `ms-bench`. A general-knowledge dataset used to prevent knowledge forgetting.
+- `--train_dataset_mix_ratio`: Defaults to `0.`. Defines how datasets are mixed for training. When this argument is specified, `train_dataset_mix_ratio` times the size of the training set is sampled from the general-knowledge dataset(s) specified by `train_dataset_mix_ds` and mixed in.
+- `--train_dataset_mix_ds`: Defaults to `['ms-bench']`. General-knowledge dataset(s) used to prevent knowledge forgetting.
 - `--use_loss_scale`: Defaults to `False`. When enabled, the loss weight of some Agent fields (the Action/Action Input parts) is increased to strengthen CoT; it has no effect in ordinary SFT scenarios.

 ### LoRA+ Fine-tuning Parameters
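The documentation change above reflects the new mixing semantics: the mixed-in sample count is now derived from the training set's size rather than carved out of `train_dataset_sample`. A small worked example of the arithmetic, with illustrative sizes:

```python
# New semantics: mix in ratio * len(train_dataset) general-knowledge samples.
train_dataset_len = 1000
train_dataset_mix_ratio = 2.0

mix_dataset_sample = train_dataset_len * train_dataset_mix_ratio  # 2000.0
total_len = train_dataset_len + mix_dataset_sample                # 3000.0

# Old semantics (before this commit): the mixture was carved out of the
# total sample budget, so the overall length stayed at that cap:
total_dataset_sample = 3000  # min(train_dataset_sample, len(train_dataset))
old_train_part = round(
    1. / (1 + train_dataset_mix_ratio) * total_dataset_sample)  # 1000
old_mix_part = total_dataset_sample - old_train_part             # 2000
```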

swift/llm/sft.py

Lines changed: 3 additions & 11 deletions

@@ -40,7 +40,7 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
                 f'world_size: {world_size}, local_world_size: {local_world_size}')
     seed_everything(args.seed)

-    if args.gpu_memory_fraction:
+    if args.gpu_memory_fraction is not None:
         for device_id in range(torch.cuda.device_count()):
             torch.cuda.set_per_process_memory_fraction(
                 max(min(args.gpu_memory_fraction, 1.0), 0.01),
@@ -116,16 +116,9 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
         random_state,
         check_dataset_strategy=args.check_dataset_strategy)
     val_dataset_sample = args.val_dataset_sample
-    mix_dataset_sample = 0 if not args.train_dataset_mix_ratio else round(
-        len(train_dataset) * args.train_dataset_mix_ratio)
     if train_dataset is not None and args.train_dataset_sample >= 0:
-        total_dataset_sample = min(args.train_dataset_sample,
+        train_dataset_sample = min(args.train_dataset_sample,
                                    train_dataset.shape[0])
-        train_dataset_sample = total_dataset_sample
-        if args.train_dataset_mix_ratio:
-            train_dataset_sample = round(
-                1. / (1 + args.train_dataset_mix_ratio) * total_dataset_sample)
-            mix_dataset_sample = total_dataset_sample - train_dataset_sample
         if train_dataset.shape[0] > train_dataset_sample:
             logger.info(f'train_dataset_sample: {train_dataset_sample}')
             train_idxs = random_state.permutation(train_dataset_sample)
@@ -139,8 +132,7 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
         val_idxs = random_state.permutation(val_dataset_sample)
         val_dataset = val_dataset.select(val_idxs)

-    train_dataset = handle_dataset_mixture(args, train_dataset,
-                                           mix_dataset_sample)
+    train_dataset = handle_dataset_mixture(args, train_dataset)

     # add self-cognition dataset
     if args.self_cognition_sample > 0:
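Besides moving the mixture sizing into `handle_dataset_mixture`, the first hunk replaces a truthiness test with an explicit `None` check: under the old guard, a configured fraction that happened to be falsy (e.g. `0.0`) would silently skip `set_per_process_memory_fraction`. A minimal standalone illustration of the difference (the function names here are hypothetical, not swift code):

```python
def should_cap(gpu_memory_fraction):
    # Old guard: falsy values such as 0.0 are skipped along with None.
    return bool(gpu_memory_fraction)

def should_cap_fixed(gpu_memory_fraction):
    # New guard: only an unset (None) value is skipped; a 0.0 fraction
    # still reaches the clamp max(min(fraction, 1.0), 0.01).
    return gpu_memory_fraction is not None

assert should_cap(0.0) is False       # bug: a cap was requested, none applied
assert should_cap_fixed(0.0) is True  # fixed: the cap path runs
```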

swift/llm/utils/argument.py

Lines changed: 68 additions & 38 deletions

@@ -10,6 +10,7 @@
 import torch
 import torch.distributed as dist
 import transformers
+from datasets import Dataset as HfDataset
 from datasets import concatenate_datasets
 from packaging import version
 from torch import dtype as Dtype
@@ -76,7 +77,7 @@ class SftArguments:
     dataset_seed: int = 42
     dataset_test_ratio: float = 0.01
     train_dataset_sample: int = 20000  # -1: all dataset
-    train_dataset_mix_ratio: Optional[float] = None
+    train_dataset_mix_ratio: float = 0.
     train_dataset_mix_ds: List[str] = field(
         default_factory=lambda: ['ms-bench'])
     val_dataset_sample: Optional[int] = None  # -1: all dataset
@@ -165,7 +166,7 @@ class SftArguments:
     adam_beta1: float = 0.9
     adam_beta2: float = 0.999
     learning_rate: Optional[float] = None
-    weight_decay: float = 0.01
+    weight_decay: float = 0.1
     gradient_accumulation_steps: Optional[int] = None
     max_grad_norm: float = 0.5
     predict_with_generate: bool = False
@@ -286,6 +287,8 @@ def __post_init__(self) -> None:
         set_model_type(self)
         if isinstance(self.dataset, str):
             self.dataset = [self.dataset]
+        if isinstance(self.train_dataset_mix_ds, str):
+            self.train_dataset_mix_ds = [self.train_dataset_mix_ds]
         register_custom_dataset(self)
         check_flash_attn(self)
         handle_generation_config(self)
@@ -653,6 +656,8 @@ def __post_init__(self) -> None:
         else:
             assert self.load_dataset_config is False, 'You need to first set `--load_args_from_ckpt_dir true`.'
         set_model_type(self)
+        if isinstance(self.dataset, str):
+            self.dataset = [self.dataset]
         register_custom_dataset(self)
         check_flash_attn(self)
         handle_generation_config(self)
@@ -661,8 +666,6 @@ def __post_init__(self) -> None:
         if self.template_type == 'AUTO':
             self.template_type = get_default_template_type(self.model_type)
             logger.info(f'Setting template_type: {self.template_type}')
-        if isinstance(self.dataset, str):
-            self.dataset = [self.dataset]
         has_dataset = (
             len(self.dataset) > 0 or len(self.custom_train_dataset_path) > 0
             or len(self.custom_val_dataset_path) > 0)
@@ -1078,25 +1081,38 @@ def handle_path(args: Union[SftArguments, InferArguments]) -> None:
         setattr(args, k, value)


+def _register_local_dataset(dataset_name: str, train_dataset_path: List[str],
+                            val_dataset_path: List[str]) -> None:
+    register_dataset(
+        dataset_name,
+        '_',
+        train_dataset_path,
+        val_dataset_path,
+        get_function=get_custom_dataset,
+        exists_ok=True)
+
+
 def register_custom_dataset(args: Union[SftArguments, InferArguments]) -> None:
+    dataset = []
+    for d in args.dataset:
+        if os.path.exists(d):
+            args.custom_train_dataset_path.append(d)
+        else:
+            dataset.append(d)
+    args.dataset = dataset
+
     for key in ['custom_train_dataset_path', 'custom_val_dataset_path']:
         value = getattr(args, key)
         if isinstance(value, str):
             setattr(args, key, [value])
     if len(args.custom_train_dataset_path) == 0 and len(
             args.custom_val_dataset_path) == 0:
         return
-    register_dataset(
-        '_custom_dataset',
-        '_custom_dataset',
-        args.custom_train_dataset_path,
-        args.custom_val_dataset_path,
-        get_function=get_custom_dataset,
-        exists_ok=True)
-    if args.dataset is None:
-        args.dataset = ['_custom_dataset']
-    elif '_custom_dataset' not in args.dataset:
-        args.dataset.append('_custom_dataset')
+
+    dataset_name = '_custom_dataset'
+    _register_local_dataset(dataset_name, args.custom_train_dataset_path,
+                            args.custom_val_dataset_path)
+    args.dataset.append(dataset_name)


 def load_from_ckpt_dir(args: InferArguments) -> None:
@@ -1147,34 +1163,48 @@ def handle_generation_config(
     )


-def handle_dataset_mixture(args: SftArguments, train_dataset,
-                           mix_dataset_sample) -> None:
+def handle_dataset_mixture(args: SftArguments,
+                           train_dataset: HfDataset) -> None:
     if train_dataset is None:
         return train_dataset
-    train_length = len(train_dataset)
+    if args.train_dataset_mix_ratio <= 0 or len(
+            args.train_dataset_mix_ds) == 0:
+        return train_dataset
+
     random_state = np.random.RandomState(args.dataset_seed)
-    if mix_dataset_sample:
-        assert args.train_dataset_mix_ds is not None
-        train_dataset_mix_ds = [args.train_dataset_mix_ds] if isinstance(
-            args.train_dataset_mix_ds, str) else args.train_dataset_mix_ds
-        mixed_dataset = get_dataset(
-            train_dataset_mix_ds,
-            0.0,
-            random_state,
-            check_dataset_strategy=args.check_dataset_strategy)[0]
-        if len(mixed_dataset) < mix_dataset_sample:
-            logger.warn(
-                f'The length of dataset used for mixin: {train_dataset_mix_ds} are '
-                'lesser than the ratio required by the `train_dataset_mix_ratio` '
-                f'argument:{args.train_dataset_mix_ratio}'
-                f'the actual ratio is : {len(mixed_dataset)/float(train_length)}'
-            )
+    train_dataset_mix_ds = []
+    custom_mix_ds = []
+    for mix_ds in args.train_dataset_mix_ds:
+        if os.path.exists(mix_ds):
+            custom_mix_ds.append(mix_ds)
         else:
-            train_idxs = random_state.permutation(mix_dataset_sample)
-            mixed_dataset = mixed_dataset.select(train_idxs)
-        return concatenate_datasets([train_dataset, mixed_dataset])
+            train_dataset_mix_ds.append(mix_ds)
+
+    if len(custom_mix_ds) > 0:
+        dataset_name = '_custom_mixture'
+        _register_local_dataset(dataset_name, custom_mix_ds, [])
+        train_dataset_mix_ds.append(dataset_name)
+    mix_dataset_sample = len(train_dataset) * args.train_dataset_mix_ratio
+    logger.info(f'train_dataset_mix_ds: {train_dataset_mix_ds}')
+    logger.info(
+        f'len(train_dataset): {len(train_dataset)}, mix_dataset_sample: {mix_dataset_sample}'
+    )
+    mixed_dataset = get_dataset(
+        train_dataset_mix_ds,
+        0.0,
+        random_state,
+        check_dataset_strategy=args.check_dataset_strategy)[0]
+    if len(mixed_dataset) < mix_dataset_sample:
+        logger.warn(
+            f'The length of dataset used for mixin: {train_dataset_mix_ds} are '
+            'lesser than the ratio required by the `train_dataset_mix_ratio` '
+            f'argument: {args.train_dataset_mix_ratio}. '
+            f'the actual ratio is: {len(mixed_dataset)/len(train_dataset):.6}.'
        )
     else:
-        return train_dataset
+        train_idxs = random_state.permutation(mix_dataset_sample)
+        mixed_dataset = mixed_dataset.select(train_idxs)
+    return concatenate_datasets([train_dataset, mixed_dataset])


 def swift_to_peft_format(lora_checkpoint_path: str) -> str:
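The `register_custom_dataset` rewrite partitions `--dataset` entries into local paths and registered names, and `handle_dataset_mixture` applies the same test to `--train_dataset_mix_ds`. A simplified standalone sketch of that partition logic (the helper name is illustrative, not swift's API):

```python
import os
from typing import List, Tuple


def split_local_and_registered(
        entries: List[str]) -> Tuple[List[str], List[str]]:
    """Split dataset entries into local file paths and registered names."""
    local_paths, registered = [], []
    for entry in entries:
        # Anything that exists on disk is treated as a custom local dataset;
        # everything else is assumed to be a name in the dataset registry.
        if os.path.exists(entry):
            local_paths.append(entry)
        else:
            registered.append(entry)
    return local_paths, registered


local, named = split_local_and_registered(['ms-bench', 'data/alpaca.jsonl'])
# -> local == ['data/alpaca.jsonl'] (if that file exists), named == ['ms-bench']
```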

tests/llm/test_run.py

Lines changed: 5 additions & 0 deletions

@@ -182,12 +182,17 @@ def test_custom_dataset(self):
             'alpaca.jsonl', 'alpaca2.csv', 'conversations.jsonl',
             'swift_pre.csv', 'swift_single.jsonl'
         ]
+        mixture_dataset = val_dataset_fnames
         folder = os.path.join(os.path.dirname(__file__), 'data')
         sft_args = SftArguments(
             model_type='qwen-7b-chat',
             custom_train_dataset_path=[
                 os.path.join(folder, fname) for fname in train_dataset_fnames
             ],
+            train_dataset_mix_ds=[
+                os.path.join(folder, fname) for fname in mixture_dataset
+            ],
+            train_dataset_mix_ratio=2,
             check_dataset_strategy='warning')
         torch.cuda.empty_cache()
         best_model_checkpoint = sft_main(sft_args)['best_model_checkpoint']
