
Commit 8812886

add missing files (#631)
1 parent 18d1503 · commit 8812886

File tree: 3 files changed (+112, -0 lines)

Lines changed: 12 additions & 0 deletions (grok-1 LoRA inference script)
# Experimental environment: 8 * A100
# Memory cost: 8 * 80G
PYTHONPATH=../../.. \
python llm_infer.py \
    --ckpt_dir output/grok-1/vxx-xxxx-xxxx/checkpoint-xxx \
    --dtype bf16 \
    --load_dataset_config true \
    --max_new_tokens 64 \
    --do_sample true \
    --eval_human false \
    --merge_lora false
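
The `--ckpt_dir` value is a placeholder: `vxx-xxxx-xxxx/checkpoint-xxx` stands for whatever run directory and checkpoint step the fine-tuning script below produces, and is left unfilled here on purpose. As a hedged convenience (not part of the commit, and the `v*` glob assumes the default run-directory naming), the newest checkpoint can be resolved automatically:

# Hypothetical helper: pick the most recently modified checkpoint under
# output/grok-1 and hand it to llm_infer.py. Assumes at least one finished run.
ckpt_dir=$(ls -td output/grok-1/v*/checkpoint-* | head -n 1)
PYTHONPATH=../../.. \
python llm_infer.py \
    --ckpt_dir "$ckpt_dir" \
    --dtype bf16 \
    --load_dataset_config true
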
Lines changed: 38 additions & 0 deletions (grok-1 LoRA fine-tuning script, DDP + DeepSpeed)
# Experimental environment: 8 * A100
# Memory cost: 8 * 21G
nproc_per_node=8

PYTHONPATH=../../.. \
torchrun \
    --nproc_per_node=$nproc_per_node \
    --master_port 29500 \
    llm_sft.py \
    --model_type grok-1 \
    --sft_type lora \
    --tuner_backend swift \
    --dtype bf16 \
    --output_dir output \
    --ddp_backend nccl \
    --dataset dureader-robust-zh \
    --train_dataset_sample -1 \
    --num_train_epochs 1 \
    --max_length 512 \
    --check_dataset_strategy warning \
    --lora_rank 8 \
    --lora_alpha 32 \
    --lora_dropout_p 0.05 \
    --lora_dtype bf16 \
    --lora_target_modules DEFAULT \
    --gradient_checkpointing true \
    --batch_size 2 \
    --weight_decay 0.1 \
    --learning_rate 1e-4 \
    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
    --max_grad_norm 0.5 \
    --warmup_ratio 0.03 \
    --eval_steps 100 \
    --save_steps 100 \
    --save_total_limit 2 \
    --logging_steps 10 \
    --deepspeed_config_path scripts/grok-1/lora_ddp_ds/zero3.json \
    --save_only_model true
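
The `$(expr 16 / $nproc_per_node)` expression keeps the effective global batch size constant regardless of GPU count. A sketch of the arithmetic with the values above (comments only, not part of the script):

# global batch = batch_size * nproc_per_node * gradient_accumulation_steps
#              = 2          * 8               * (16 / 8)
#              = 32
# Dropping to 4 GPUs would raise the accumulation steps to 16 / 4 = 4,
# leaving the global batch at 2 * 4 * 4 = 32.
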
Lines changed: 62 additions & 0 deletions (scripts/grok-1/lora_ddp_ds/zero3.json, the DeepSpeed ZeRO-3 config referenced by the fine-tuning script above)
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "bf16": {
        "enabled": "auto"
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "total_num_steps": "auto",
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "offload_param": {
            "device": "cpu",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
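
Each "auto" entry is a placeholder that the HuggingFace Trainer's DeepSpeed integration fills in at launch from the training arguments (learning rate, warmup steps, batch sizes, gradient clipping), so the file needs no per-run editing. The ZeRO-3 stage together with "offload_param": {"device": "cpu"} is also what makes the modest 8 * 21G footprint quoted in the training script plausible for a model of Grok-1's size. A small sanity check before launching a long run, assuming jq is installed:

# Not part of the commit: confirm the config is valid JSON before training.
jq . scripts/grok-1/lora_ddp_ds/zero3.json > /dev/null && echo "zero3.json parses"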
