Skip to content

Commit 9c18c38

Browse files
committed
test config
1 parent 71df4d5 commit 9c18c38

File tree

2 files changed

+223
-0
lines changed

2 files changed

+223
-0
lines changed

configs/poly_pythia/2-8B_Seed_1.yml

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
{
2+
"pipe_parallel_size": 1,
3+
"model_parallel_size": 1,
4+
5+
"num_layers": 32,
6+
"hidden_size": 2560,
7+
"num_attention_heads": 32,
8+
"seq_length": 2048,
9+
"max_position_embeddings": 2048,
10+
"pos_emb": "rotary",
11+
"rotary_pct": 0.25,
12+
"no_weight_tying": true,
13+
"gpt_j_residual": true,
14+
"output_layer_parallelism": "column",
15+
16+
"attention_config": [[["flash"], 32]],
17+
18+
"scaled_upper_triang_masked_softmax_fusion": true,
19+
"bias_gelu_fusion": true,
20+
21+
"init_method": "small_init",
22+
"output_layer_init_method": "wang_init",
23+
24+
"optimizer": {
25+
"type": "Adam",
26+
"params": {
27+
"lr": 0.00016,
28+
"betas": [0.9, 0.95],
29+
"eps": 1.0e-8
30+
}
31+
},
32+
"min_lr": 0.000016,
33+
34+
"zero_optimization": {
35+
"stage": 1,
36+
"allgather_partitions": true,
37+
"allgather_bucket_size": 500000000,
38+
"overlap_comm": true,
39+
"reduce_scatter": true,
40+
"reduce_bucket_size": 500000000,
41+
"contiguous_gradients": true,
42+
"cpu_offload": false
43+
},
44+
45+
"train_micro_batch_size_per_gpu": 8,
46+
"gradient_accumulation_steps": 2,
47+
"data_impl": "mmap",
48+
"num_workers": 1,
49+
50+
"checkpoint_activations": true,
51+
"checkpoint_num_layers": 1,
52+
"partition_activations": true,
53+
"synchronize_each_layer": true,
54+
55+
"gradient_clipping": 1.0,
56+
"weight_decay": 0.1,
57+
"hidden_dropout": 0,
58+
"attention_dropout": 0,
59+
60+
"fp16": {
61+
"fp16": true,
62+
"enabled": true,
63+
"loss_scale": 0,
64+
"loss_scale_window": 1000,
65+
"initial_scale_power": 12,
66+
"hysteresis": 2,
67+
"min_loss_scale": 1
68+
},
69+
70+
"train_iters": 143000,
71+
"lr_decay_iters": 143000,
72+
"distributed_backend": "nccl",
73+
"lr_decay_style": "cosine",
74+
"warmup": 0.01,
75+
"checkpoint_factor": 1000,
76+
"extra_save_iters": [0,1,2,4,8,16,32,64,128,256,512],
77+
"eval_interval": 40000,
78+
"eval_iters": 10,
79+
80+
"log_grad_norm": true,
81+
82+
"log_interval": 10,
83+
"steps_per_print": 10,
84+
"wall_clock_breakdown": true,
85+
86+
"tokenizer_type": "HFTokenizer",
87+
88+
# Run-Specific
89+
"seed": 1,
90+
91+
# Test Dataset
92+
"data_path": "/data/enwik8/enwik8_text_document",
93+
"vocab_file": "/data/neox_tokenizer/tokenizer.json",
94+
95+
# Checkpoints
96+
"save": "/checkpoints/2-8b-seed-1",
97+
"load": "/checkpoints/2-8b-seed-1",
98+
"checkpoint_validation_with_forward_pass": false,
99+
100+
# Wandb
101+
"use_wandb": true,
102+
"wandb_host": "https://api.wandb.ai",
103+
"wandb_project": "pythia-extra-seeds",
104+
"wandb_team": "eleutherai",
105+
"wandb_run_name": "2-8b-seed-1-enwik8",
106+
107+
# Distributed Training
108+
"hostfile": "/workspace/hostfile",
109+
"deepspeed_mpi": true,
110+
"launcher": "openmpi",
111+
"deepspeed_extra_args": { "ssh_port": 2222 },
112+
}
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
{
2+
"pipe_parallel_size": 1,
3+
"model_parallel_size": 1,
4+
5+
"num_layers": 32,
6+
"hidden_size": 2560,
7+
"num_attention_heads": 32,
8+
"seq_length": 2048,
9+
"max_position_embeddings": 2048,
10+
"pos_emb": "rotary",
11+
"rotary_pct": 0.25,
12+
"no_weight_tying": true,
13+
"gpt_j_residual": true,
14+
"output_layer_parallelism": "column",
15+
16+
"attention_config": [[["flash"], 32]],
17+
18+
"scaled_upper_triang_masked_softmax_fusion": true,
19+
"bias_gelu_fusion": true,
20+
21+
"init_method": "small_init",
22+
"output_layer_init_method": "wang_init",
23+
24+
"optimizer": {
25+
"type": "Adam",
26+
"params": {
27+
"lr": 0.00016,
28+
"betas": [0.9, 0.95],
29+
"eps": 1.0e-8
30+
}
31+
},
32+
"min_lr": 0.000016,
33+
34+
"zero_optimization": {
35+
"stage": 1,
36+
"allgather_partitions": true,
37+
"allgather_bucket_size": 500000000,
38+
"overlap_comm": true,
39+
"reduce_scatter": true,
40+
"reduce_bucket_size": 500000000,
41+
"contiguous_gradients": true,
42+
"cpu_offload": false
43+
},
44+
45+
"train_micro_batch_size_per_gpu": 48,
46+
"gradient_accumulation_steps": 1,
47+
"data_impl": "mmap",
48+
"num_workers": 1,
49+
50+
"checkpoint_activations": true,
51+
"checkpoint_num_layers": 1,
52+
"partition_activations": true,
53+
"synchronize_each_layer": true,
54+
55+
"gradient_clipping": 1.0,
56+
"weight_decay": 0.1,
57+
"hidden_dropout": 0,
58+
"attention_dropout": 0,
59+
60+
"fp16": {
61+
"fp16": true,
62+
"enabled": true,
63+
"loss_scale": 0,
64+
"loss_scale_window": 1000,
65+
"initial_scale_power": 12,
66+
"hysteresis": 2,
67+
"min_loss_scale": 1
68+
},
69+
70+
"train_iters": 1000,
71+
"lr_decay_iters": 1000,
72+
"distributed_backend": "nccl",
73+
"lr_decay_style": "cosine",
74+
"warmup": 0.01,
75+
"checkpoint_factor": null,
76+
"eval_interval": 40000,
77+
"eval_iters": 10,
78+
79+
"log_grad_norm": true,
80+
81+
"log_interval": 10,
82+
"steps_per_print": 10,
83+
"wall_clock_breakdown": true,
84+
85+
"tokenizer_type": "HFTokenizer",
86+
87+
# Run-Specific
88+
"seed": 1,
89+
90+
# Test Dataset
91+
"data_path": "/data/enwik8/enwik8_text_document",
92+
"vocab_file": "/data/neox_tokenizer/tokenizer.json",
93+
94+
# Checkpoints
95+
"save": null,
96+
"load": null,
97+
"checkpoint_validation_with_forward_pass": false,
98+
99+
# Wandb
100+
"use_wandb": true,
101+
"wandb_host": "https://api.wandb.ai",
102+
"wandb_project": "pythia-extra-seeds",
103+
"wandb_team": "eleutherai",
104+
"wandb_run_name": "2-8b-seed-1-enwik8-single-node-test",
105+
106+
# Distributed Training
107+
"hostfile": "/workspace/hostfile",
108+
"deepspeed_mpi": true,
109+
"launcher": "openmpi",
110+
"deepspeed_extra_args": { "ssh_port": 2222 },
111+
}

0 commit comments

Comments
 (0)