
Commit 75ac11e

Support bin packing and rope scaling (#1079)

Parent: ca4d2f6

11 files changed, 63 insertions(+), 22 deletions(-)

docs/source/LLM/命令行参数.md

Lines changed: 3 additions & 0 deletions
@@ -126,7 +126,9 @@
 - `--custom_dataset_info`: Default is `None`. Pass in the path to an external dataset_info.json, a JSON string, or a dict. Used to extend datasets. Format reference: https://github.com/modelscope/swift/blob/main/swift/llm/data/dataset_info.json
 - `--device_map_config_path`: Manually configure the model's device_map from a local file. Default is None.

+### Long Context

+- `--rope_scaling`: Default is `None`. Supports two scaling methods, `linear` and `dynamic`. Used when `max_length` exceeds `max_position_embeddings`.

 ### FSDP Parameters

@@ -281,6 +283,7 @@ dpo parameters inherit from the sft parameters; in addition, the following parameters are added:
 - `--lora_modules`: Default is `[]`. The input format is `'{lora_name}={lora_path}'`, e.g. `--lora_modules lora_name1=lora_path1 lora_name2=lora_path2`. `ckpt_dir` is added to args.lora_modules as `f'default-lora={args.ckpt_dir}'`.
 - `--custom_register_path`: Default is `None`. Pass in a `.py` file used to register templates, models, and datasets.
 - `--custom_dataset_info`: Default is `None`. Pass in the path to an external dataset_info.json, a JSON string, or a dict. Used to extend datasets.
+- `--rope_scaling`: Default is `None`. Supports two scaling methods, `linear` and `dynamic`. Used when `max_length` exceeds `max_position_embeddings`.


 ## export Parameters

docs/source_en/LLM/Command-line-parameters.md

Lines changed: 5 additions & 0 deletions
@@ -126,6 +126,10 @@
 - `--custom_dataset_info`: Default is `None`. Pass in the path to an external `dataset_info.json`, a JSON string, or a dictionary. Used to register custom datasets. Format example: https://github.com/modelscope/swift/blob/main/swift/llm/data/dataset_info.json
 - `--device_map_config_path`: Manually configure the model's device map from a local file. Defaults to None.

+### Long Context
+
+- `--rope_scaling`: Default is `None`. Supports `linear` and `dynamic` scaling of positional embeddings. Use when `max_length` exceeds `max_position_embeddings`.
+
 ### FSDP Parameters

 - `--fsdp`: Default value `''`, the FSDP type; please check [this documentation](https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/trainer#transformers.TrainingArguments.fsdp) for details.

@@ -280,6 +284,7 @@ dpo parameters inherit from sft parameters, with the following added parameters:
 - `--lora_modules`: Default is `[]`. The input format is `'{lora_name}={lora_path}'`, e.g. `--lora_modules lora_name1=lora_path1 lora_name2=lora_path2`. `ckpt_dir` will be added as `f'default-lora={args.ckpt_dir}'` by default.
 - `--custom_register_path`: Default is `None`. Pass in a `.py` file used to register templates, models, and datasets.
 - `--custom_dataset_info`: Default is `None`. Pass in the path to an external `dataset_info.json`, a JSON string, or a dictionary. Used for expanding datasets.
+- `--rope_scaling`: Default is `None`. Supports `linear` and `dynamic` scaling of positional embeddings. Use when `max_length` exceeds `max_position_embeddings`.


 ## export Parameters
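
Note: with the flag documented above, rope scaling can also be switched on from the Python API. The sketch below is illustrative only; it assumes the usual `SftArguments`/`sft_main` entry points from `swift.llm`, and the model_type and dataset values are placeholders rather than anything prescribed by this commit.

```python
# Hedged sketch: enabling linear rope scaling for a fine-tuning run whose
# max_length exceeds the base model's max_position_embeddings.
# model_type and dataset values are illustrative placeholders.
from swift.llm import SftArguments, sft_main

args = SftArguments(
    model_type='qwen-7b-chat',   # placeholder model_type
    dataset=['alpaca-en'],       # placeholder dataset
    max_length=16384,            # longer than the base context window
    rope_scaling='linear',       # or 'dynamic'
)
sft_main(args)
```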

requirements/framework.txt

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 accelerate
+binpacking
 dacite
 jieba
 matplotlib
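
The new `binpacking` dependency corresponds to the bin-packing half of the commit title. The packing code itself is not in the hunks shown here, so the following is only a hedged sketch of how the library groups samples by token length; the sample lengths and packing length are made up for illustration.

```python
# Hedged sketch: grouping tokenized samples into bins whose total token
# count stays around a packing length, using the binpacking dependency.
import binpacking

lengths = {0: 1200, 1: 800, 2: 3000, 3: 512, 4: 2048, 5: 96}  # index -> token count
packing_length = 4096

# Greedy constant-volume packing: each returned bin is a dict of index -> length.
bins = binpacking.to_constant_volume(lengths, packing_length)
for i, b in enumerate(bins):
    print(f'bin {i}: samples {sorted(b)} -> {sum(b.values())} tokens')
```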

swift/llm/dpo.py

Lines changed: 3 additions & 0 deletions
@@ -82,6 +82,9 @@ def llm_dpo(args: DPOArguments) -> str:
     }
     if args.use_flash_attn is not None:
         kwargs['use_flash_attn'] = args.use_flash_attn
+    if args.rope_scaling:
+        kwargs['rope_scaling'] = args.rope_scaling
+        kwargs['max_length'] = args.max_length
     model, tokenizer = get_model_tokenizer(
         args.model_type,
         args.torch_dtype,

swift/llm/infer.py

Lines changed: 3 additions & 0 deletions
@@ -163,6 +163,9 @@ def prepare_model_template(args: InferArguments,
         kwargs['automodel_class'] = automodel_class
     if args.local_repo_path:
         kwargs['local_repo_path'] = args.local_repo_path
+    if args.rope_scaling:
+        kwargs['rope_scaling'] = args.rope_scaling
+        kwargs['max_length'] = args.max_length
     model, tokenizer = get_model_tokenizer(
         args.model_type,
         args.torch_dtype,

swift/llm/orpo.py

Lines changed: 3 additions & 0 deletions
@@ -83,6 +83,9 @@ def llm_orpo(args: ORPOArguments) -> str:
     }
     if args.use_flash_attn is not None:
         kwargs['use_flash_attn'] = args.use_flash_attn
+    if args.rope_scaling:
+        kwargs['rope_scaling'] = args.rope_scaling
+        kwargs['max_length'] = args.max_length
     model, tokenizer = get_model_tokenizer(
         args.model_type,
         args.torch_dtype,

swift/llm/sft.py

Lines changed: 4 additions & 0 deletions
@@ -108,6 +108,10 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
     elif args.quant_method == 'gptq':
         kwargs['is_gptq'] = True

+    if args.rope_scaling:
+        kwargs['rope_scaling'] = args.rope_scaling
+        kwargs['max_length'] = args.max_length
+
     model, tokenizer = get_model_tokenizer(
         args.model_type,
         args.torch_dtype,
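
The same two keys are threaded through dpo, orpo, simpo, and infer in the hunks above. To show what ends up being forwarded, here is a hedged standalone call; it assumes `get_model_tokenizer` from `swift.llm` accepts and forwards these extra keyword arguments as the hunks indicate, and the model_type string is a placeholder.

```python
# Hedged sketch: the rope-scaling kwargs that llm_sft builds above, passed
# directly to get_model_tokenizer. model_type is a placeholder.
from swift.llm import get_model_tokenizer

model, tokenizer = get_model_tokenizer(
    'qwen-7b-chat',          # placeholder model_type
    rope_scaling='linear',   # popped later in get_model_tokenizer_from_repo
    max_length=16384,        # used there to derive the scaling factor
)
```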

swift/llm/simpo.py

Lines changed: 3 additions & 0 deletions
@@ -82,6 +82,9 @@ def llm_simpo(args: SimPOArguments) -> str:
     }
     if args.use_flash_attn is not None:
         kwargs['use_flash_attn'] = args.use_flash_attn
+    if args.rope_scaling:
+        kwargs['rope_scaling'] = args.rope_scaling
+        kwargs['max_length'] = args.max_length
     model, tokenizer = get_model_tokenizer(
         args.model_type,
         args.torch_dtype,

swift/llm/utils/argument.py

Lines changed: 6 additions & 0 deletions
@@ -487,6 +487,9 @@ class SftArguments(ArgumentsBase):
     # Literal['gaussian', 'pissa', 'pissa_niter_[number of iters]', 'loftq', 'true', 'false']
     init_lora_weights: str = 'true'

+    # rope-scaling
+    rope_scaling: Literal['linear', 'dynamic'] = None
+
     # BOFT
     boft_block_size: int = 4
     boft_block_num: int = 0

@@ -1036,6 +1039,9 @@ class InferArguments(ArgumentsBase):
     num_beams: int = 1
     stop_words: List[str] = None

+    # rope-scaling
+    rope_scaling: Literal['linear', 'dynamic'] = None
+
     # other
     use_flash_attn: Optional[bool] = None
     ignore_args_error: bool = False  # True: notebook compatibility

swift/llm/utils/model.py

Lines changed: 9 additions & 0 deletions
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import inspect
+import math
 import os
 import sys
 from contextlib import nullcontext

@@ -864,6 +865,14 @@ def get_model_tokenizer_from_repo(model_dir: str,
         tokenizer.placeholder_tokens = placeholder_tokens
         tokenizer.placeholder_tokens_id = [tokenizer.convert_tokens_to_ids(token) for token in placeholder_tokens]
     model = None
+
+    rope_scaling = kwargs.pop('rope_scaling', None)
+    max_position_embeddings = getattr(model_config, 'max_position_embeddings', None)
+    if rope_scaling and max_position_embeddings:
+        max_length = kwargs.get('max_length') or max_position_embeddings
+        rope_scaling_factor = max(float(math.ceil(max_length / max_position_embeddings)), 1.0)
+        setattr(model_config, 'rope_scaling', {'type': rope_scaling, 'factor': rope_scaling_factor})
+
     if load_model:
         if kwargs.get('use_unsloth', False):
             assert is_unsloth_available(), 'please install unsloth if using `use_unsloth=True`'
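
For concreteness, here is the factor computation from the hunk above run standalone on example numbers; only `math` from the standard library is used, and the concrete values are illustrative.

```python
# Worked example of the rope-scaling factor added above: a 4096-position
# base config with a requested max_length of 10000 yields factor 3.0,
# i.e. rope_scaling = {'type': 'linear', 'factor': 3.0} on the model config.
import math

max_position_embeddings = 4096   # read from the model config
max_length = 10000               # requested sequence length
rope_scaling = 'linear'          # value of --rope_scaling

factor = max(float(math.ceil(max_length / max_position_embeddings)), 1.0)
print({'type': rope_scaling, 'factor': factor})  # {'type': 'linear', 'factor': 3.0}
```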
