modelscope · tastelikefeet · May 13, 2025 · May 13, 2025
diff --git a/docs/source/Instruction/命令行参数.md b/docs/source/Instruction/命令行参数.md
@@ -31,7 +31,7 @@
   - 注意：这三种实现并不一定都支持，这取决于对应模型的支持情况。
 - num_labels: 分类模型（即`--task_type seq_cls`）需要指定该参数。代表标签数量，默认为None。
 - problem_type: 分类模型（即`--task_type seq_cls`）需要指定该参数。可选为'regression', 'single_label_classification', 'multi_label_classification'。默认为None，根据num_labels和数据集类型进行自动设置。
-- rope_scaling: rope类型，支持`linear`和`dynamic`，请配合`max_length`共同使用。默认为None。
+- rope_scaling: rope类型，支持`linear`、`dynamic`和`yarn`，请配合`max_length`共同使用。默认为None。
 - device_map: 模型使用的device_map配置，例如：'auto'、'cpu'、json字符串、json文件路径。默认为None，根据设备和分布式训练情况自动设置。
 - max_memory: device_map设置为'auto'或者'sequential'时，会根据max_memory进行模型权重的device分配，例如：`--max_memory '{0: "20GB", 1: "20GB"}'`。默认为None。
 - local_repo_path: 部分模型在加载时依赖于github repo。为了避免`git clone`时遇到网络问题，可以直接使用本地repo。该参数需要传入本地repo的路径, 默认为`None`。

diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
@@ -32,7 +32,7 @@ Hints:
   - Note: These three implementations may not all be supported, depending on the support of the corresponding model.
 - num_labels: This parameter is required for classification models (i.e., `--task_type seq_cls`). It represents the number of labels, with a default value of None.
 - problem_type: This parameter is required for classification models (i.e., `--task_type seq_cls`). The options are 'regression', 'single_label_classification', and 'multi_label_classification'. The default value is None, and it will be automatically set based on the number of labels and the dataset type.
-- rope_scaling: Type of rope, supports `linear` and `dynamic`, should be used in conjunction with `max_length`. Default is None.
+- rope_scaling: Type of rope, supports `linear`, `dynamic`, and `yarn`, and should be used in conjunction with `max_length`. Default is None.
 - device_map: Device map configuration used by the model, such as 'auto', 'cpu', JSON string, or the path of a JSON file. The default is None, automatically set based on the device and distributed training conditions.
 - max_memory: When device_map is set to 'auto' or 'sequential', the model weights will be allocated to devices based on max_memory, for example: `--max_memory '{0: "20GB", 1: "20GB"}'`. The default value is None.
 - local_repo_path: Some models depend on a GitHub repo when loading. To avoid network issues during `git clone`, a local repo can be used directly. This parameter needs to be passed with the path to the local repo, with the default being `None`.

diff --git a/examples/train/long_text/sequence_parallel.sh b/examples/train/long_text/sequence_parallel.sh
@@ -1,10 +1,10 @@
 # Env: 4 * A100
-# Max Length: 16K
-# GPU Memory: 4 * 43GiB, Training Speed 12s/it
+# Max Length: 65536
+# GPU Memory: 4 * 53GiB, Training Speed 50s/it
 NPROC_PER_NODE=4 \
 CUDA_VISIBLE_DEVICES=0,1,2,3 \
 swift sft \
-    --model Qwen/Qwen2.5-7B \
+    --model Qwen/Qwen2.5-7B-Instruct \
     --train_type full \
     --dataset 'AI-ModelScope/LongAlpaca-12k' \
     --torch_dtype bfloat16 \
@@ -13,16 +13,17 @@ swift sft \
     --learning_rate 1e-5 \
     --gradient_accumulation_steps 8 \
     --packing true \
+    --rope_scaling yarn \
+    --max_length 65536 \
     --eval_steps 200 \
     --save_steps 200 \
     --logging_steps 5 \
-    --max_length 16384 \
     --warmup_ratio 0.05 \
     --dataloader_num_workers 8 \
     --dataset_num_proc 8 \
     --save_total_limit 2 \
     --save_only_model true \
-    --output_dir output/Qwen2.5-7B \
+    --output_dir output/Qwen2.5-7B-Instruct \
     --deepspeed zero3 \
     --attn_impl flash_attn \
     --sequence_parallel_size 4
diff --git a/swift/llm/argument/base_args/model_args.py b/swift/llm/argument/base_args/model_args.py
@@ -45,7 +45,7 @@ class ModelArguments:
 
     num_labels: Optional[int] = None
     problem_type: Literal['regression', 'single_label_classification', 'multi_label_classification'] = None
-    rope_scaling: Literal['linear', 'dynamic'] = None
+    rope_scaling: Literal['linear', 'dynamic', 'yarn'] = None
     device_map: Optional[Union[dict, str]] = None
     max_memory: Optional[Union[dict, str]] = None
     # When some model code needs to be downloaded from GitHub,