Commit f1e2191

tjruwase and minjiaz authored
PLD Client (#61) (#62)
* test commits in DSE
* Support for progressive layer dropping
* Minor changes on PLD
* update the finetune script
* PLD client
* Remove theta option

Co-authored-by: Minjia Zhang <[email protected]>
1 parent b9bb3ac commit f1e2191

12 files changed: +4639 −6 lines

BingBertSquad/turing/modelingpreln_layerdrop.py

Lines changed: 1652 additions & 0 deletions
Large diffs are not rendered by default.
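
Judging by its name, this new file adds a pre-LayerNorm ("preln") BERT modeling variant with layer-drop support for SQuAD fine-tuning; its 1652-line diff is not rendered above. Purely as a hypothetical illustration of the stochastic layer-dropping idea — not the contents of modelingpreln_layerdrop.py — an encoder forward pass that randomly skips layers during training might look like:

import random

# Hypothetical sketch of stochastic layer dropping in an encoder forward pass.
# Function and argument names are assumptions for illustration, not code from this commit.
def encoder_forward(hidden_states, layers, keep_prob, training=True):
    for layer in layers:
        if training and random.random() > keep_prob:
            continue  # drop this layer for this step; the activations pass through unchanged
        hidden_states = layer(hidden_states)
    return hidden_states

At evaluation time (training=False) every layer runs, so inference always uses the full network.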

bing_bert/bert_base.json

File mode changed: 100644 → 100755
Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 {
-    "name": "bing_bert_base_lamb_seq",
+    "name": "bing_bert_base_seq",
     "bert_token_file": "bert-base-uncased",
     "bert_model_file": "bert-base-uncased",
     "bert_model_config": {

bing_bert/bert_base_large_lr.json

Lines changed: 49 additions & 0 deletions
New file:

{
  "name": "bing_bert_base_seq",
  "bert_token_file": "bert-base-uncased",
  "bert_model_file": "bert-base-uncased",
  "bert_model_config": {
    "vocab_size_or_config_json_file": 119547,
    "hidden_size": 768,
    "num_hidden_layers": 12,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "attention_probs_dropout_prob": 0.1,
    "max_position_embeddings": 512,
    "type_vocab_size": 2,
    "initializer_range": 0.02
  },
  "data": {
    "flags": {
      "pretrain_dataset": true,
      "pretrain_type": "wiki_bc"
    },
    "mixed_seq_datasets": {
      "128": {
        "wiki_pretrain_dataset": "bnorick_format/128/wiki_pretrain",
        "bc_pretrain_dataset": "bnorick_format/128/bookcorpus_pretrain"
      },
      "512": {
        "wiki_pretrain_dataset": "bnorick_format/512/wiki_pretrain",
        "bc_pretrain_dataset": "bnorick_format/512/bookcorpus_pretrain"
      }
    }
  },
  "mixed_seq_training": {
    "128": {
      "num_epochs": 200,
      "warmup_proportion": 0.02,
      "learning_rate": 1e-3,
      "num_workers": 0,
      "async_worker": true,
      "decay_rate": 0.99,
      "decay_step": 1000,
      "total_training_steps": 200000
    }
  },
  "validation": {
    "path": "validation_set/"
  }
}
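
The mixed_seq_training block pairs a 2% warmup with an exponential decay controlled by decay_rate and decay_step; the actual schedule is implemented by update_learning_rate in deepspeed_train.py and selected with --lr_schedule in the launch script below. Purely as a hedged illustration of what these numbers imply — the function below is hypothetical, including its linear-warmup assumption — the per-step learning rate could be computed as:

# Hypothetical sketch: linear warmup followed by exponential decay, using the
# hyperparameters from the config above. Not the repository's update_learning_rate.
def lr_at_step(step,
               base_lr=1e-3,
               warmup_proportion=0.02,
               decay_rate=0.99,
               decay_step=1000,
               total_training_steps=200000):
    warmup_steps = int(warmup_proportion * total_training_steps)  # 4000 steps here
    if step < warmup_steps:
        return base_lr * step / max(1, warmup_steps)  # ramp linearly up to base_lr
    # Multiply by decay_rate once every decay_step steps after warmup.
    return base_lr * decay_rate ** ((step - warmup_steps) / decay_step)

With these values the learning rate ends near 1e-3 * 0.99**196 ≈ 1.4e-4 at step 200000.
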
bing_bert/deepspeed_bsz4k_progressive_layer_drop_config_seq128.json

Lines changed: 26 additions & 0 deletions

New file (the DeepSpeed config passed to --deepspeed_config in the launch script below):

{
  "train_batch_size": 4096,
  "train_micro_batch_size_per_gpu": 16,
  "steps_per_print": 1000,
  "prescale_gradients": true,
  "gradient_predivide_factor": 8,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 1e-3,
      "weight_decay": 0.01,
      "bias_correction": false
    }
  },
  "gradient_clipping": 1.0,
  "wall_clock_breakdown": false,
  "fp16": {
    "enabled": true,
    "loss_scale": 0
  },
  "progressive_layer_drop": {
    "enabled": true,
    "theta": 0.5,
    "gamma": 0.001
  }
}
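
The progressive_layer_drop section turns on DeepSpeed's Progressive Layer Dropping for this run. In the PLD paper (Zhang and He, 2020), theta is the floor of the global keep probability and gamma controls how quickly that probability decays from 1.0 toward theta as training progresses; whether the client added in this commit computes exactly this schedule is an assumption, but a minimal sketch of the published formula with the values above would be:

import math

# Sketch of the PLD keep-probability schedule theta(t) = (1 - theta_bar) * exp(-gamma * t) + theta_bar,
# using theta_bar and gamma from the config above. Illustration only, not the commit's code.
def keep_probability(step, theta_bar=0.5, gamma=0.001):
    return (1.0 - theta_bar) * math.exp(-gamma * step) + theta_bar

for step in (0, 1000, 5000, 20000):
    print(step, round(keep_probability(step), 3))
# prints: 0 -> 1.0, 1000 -> 0.684, 5000 -> 0.503, 20000 -> 0.5

Early in training almost no layers are dropped; once the schedule saturates, each step trains roughly half of the layers in expectation.
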

bing_bert/deepspeed_train.py

Lines changed: 2 additions & 2 deletions
The change is whitespace-only: the two-line report_step_metrics(...) call is re-indented, with its arguments unchanged.

@@ -182,8 +182,8 @@ def train(args,
                lr_this_step = update_learning_rate(
                    args, config, global_step, optimizer)

-        report_step_metrics(args, lr_this_step, unscaled_loss,
-                            global_step, current_data_sample_count)
+            report_step_metrics(args, lr_this_step, unscaled_loss,
+                                global_step, current_data_sample_count)

            model.network.step()

New training launch script (Bash)

Lines changed: 25 additions & 0 deletions

#!/bin/bash

base_dir=`pwd`

# Where should we save checkpoints and tensorboard events?
JOB_NAME=adam_4k_seq128_progressive_layer_drop
OUTPUT_DIR=${base_dir}/bert_model_outputs

mkdir -p $OUTPUT_DIR

config="--progressive_layer_drop"

NCCL_TREE_THRESHOLD=0 deepspeed \
  ${base_dir}/deepspeed_train.py \
  --cf ${base_dir}/bert_base_large_lr.json \
  --max_seq_length 128 \
  --output_dir $OUTPUT_DIR \
  --deepspeed \
  --print_steps 100 \
  --lr_schedule "LE" \
  --job_name $JOB_NAME \
  --deepspeed_config ${base_dir}/deepspeed_bsz4k_progressive_layer_drop_config_seq128.json \
  --data_path_prefix /data/bert \
  ${config} \
  &> ${JOB_NAME}.log
