Skip to content
This repository was archived by the owner on Jan 15, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions scripts/bert/finetune_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,11 @@
type=int,
default=128,
help='Maximum length of the sentence pairs')
parser.add_argument(
'--round_to', type=int, default=None,
help='The length of padded sequences will be rounded up to be multiple of this argument.'
'When round to is set to 8, training throughput may increase for mixed precision'
'training on GPUs with tensorcores.')
parser.add_argument(
'--seed', type=int, default=2, help='Random seed')
parser.add_argument(
Expand Down Expand Up @@ -379,8 +384,8 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab)
# bucket sampler for training
pad_val = vocabulary[vocabulary.padding_token]
batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(axis=0, pad_val=pad_val), # input
nlp.data.batchify.Pad(axis=0, pad_val=0), # segment
nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to), # input
nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to), # segment
nlp.data.batchify.Stack(), # length
nlp.data.batchify.Stack(label_dtype)) # label
batch_sampler = nlp.data.sampler.FixedBucketSampler(data_train_len, batch_size=batch_size,
Expand All @@ -400,9 +405,10 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab)
loader_dev_list.append((segment, loader_dev))

# batchify for data test
test_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0, pad_val=pad_val),
nlp.data.batchify.Pad(axis=0, pad_val=0),
nlp.data.batchify.Stack())
test_batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),
nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to),
nlp.data.batchify.Stack())
# transform for data test
test_trans = partial(convert_examples_to_features, tokenizer=tokenizer, truncate_length=max_len,
cls_token=vocab.cls_token if not use_roberta else vocab.bos_token,
Expand Down
16 changes: 8 additions & 8 deletions scripts/bert/finetune_squad.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,10 @@
'than this will be padded. default is 384')

parser.add_argument(
'--pad',
action='store_true',
help='Whether to pad to maximum length when preparing data batches. Default is False.')
'--round_to', type=int, default=None,
help='The length of padded sequences will be rounded up to be multiple of this argument.'
'When round to is set to 8, training throughput may increase for mixed precision'
'training on GPUs with tensorcores.')

parser.add_argument('--doc_stride',
type=int,
Expand Down Expand Up @@ -304,7 +305,6 @@
null_score_diff_threshold = args.null_score_diff_threshold

max_seq_length = args.max_seq_length
pad = args.pad
doc_stride = args.doc_stride
max_query_length = args.max_query_length
n_best_size = args.n_best_size
Expand Down Expand Up @@ -343,8 +343,8 @@

batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Stack(),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
nlp.data.batchify.Stack('float32'),
nlp.data.batchify.Stack('float32'),
nlp.data.batchify.Stack('float32'))
Expand Down Expand Up @@ -559,8 +559,8 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode):
log.info('Number of records in dev data:{}'.format(len(dev_data)))

batchify_fn_calib = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
nlp.data.batchify.Stack('float32'),
nlp.data.batchify.Stack('float32'))

Expand Down
19 changes: 10 additions & 9 deletions scripts/language_model/run_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,12 @@
type=int,
default=128,
help='Maximum length of the sentence pairs')

parser.add_argument(
'--pad',
default=True,
action='store_true',
help='Whether to pad to maximum length when preparing data batches. '
'Have to be true currently due to left padding')
'--round_to', type=int, default=None,
help='The length of padded sequences will be rounded up to be multiple of this argument.'
'When round to is set to 8, training throughput may increase for mixed precision'
'training on GPUs with tensorcores.')

parser.add_argument(
'--only_inference',
Expand Down Expand Up @@ -263,9 +263,9 @@ def preprocess_data(_tokenizer,
# bucket sampler for training
pad_val = _vocab[_vocab.padding_token]
batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(axis=0, pad_val=pad_val), # input
nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to), # input
nlp.data.batchify.Stack(), # length
nlp.data.batchify.Pad(axis=0, pad_val=4), # segment
nlp.data.batchify.Pad(axis=0, pad_val=4, round_to=args.round_to), # segment
nlp.data.batchify.Stack(label_dtype)) # label
batch_sampler = nlp.data.sampler.FixedBucketSampler(data_train_len,
batch_size=batch_size,
Expand Down Expand Up @@ -293,8 +293,9 @@ def preprocess_data(_tokenizer,

# batchify for data test
test_batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(axis=0, pad_val=pad_val),
nlp.data.batchify.Stack(), nlp.data.batchify.Pad(axis=0, pad_val=0))
nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),
nlp.data.batchify.Stack(),
nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to))

# transform for data test
test_trans = partial(convert_examples_to_features,
Expand Down
23 changes: 23 additions & 0 deletions scripts/tests/test_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,29 @@ def test_bert_icsl():
process = subprocess.check_call([sys.executable, script] + arguments)
time.sleep(5)

@pytest.mark.serial
@pytest.mark.gpu
@pytest.mark.remote_required
@pytest.mark.integration
@pytest.mark.parametrize('dataset', ['MRPC'])
def test_xlnet_finetune_glue_with_round_to(dataset):
    """Smoke-test XLNet GLUE fine-tuning with the new --round_to option.

    Launches run_glue.py for one epoch with --round_to 8 so the padded
    batch lengths are rounded up to multiples of 8 (the tensor-core
    friendly setting). check_call raises CalledProcessError on a
    non-zero exit, which fails the test; no further assertions needed.
    """
    arguments = ['--batch_size', '32', '--task_name', dataset,
                 '--gpu', '1', '--epochs', '1', '--max_len', '32', '--round_to', '8']
    # check_call returns 0 or raises, so there is nothing useful to bind.
    subprocess.check_call([sys.executable, './scripts/language_model/run_glue.py']
                          + arguments)
    # Brief pause to let the child process release GPU memory before the
    # next serial test starts (matches the convention of sibling tests).
    time.sleep(5)

@pytest.mark.serial
@pytest.mark.gpu
@pytest.mark.remote_required
@pytest.mark.integration
def test_finetune_squad_with_round_to():
    """Smoke-test BERT SQuAD fine-tuning with the new --round_to option.

    Launches finetune_squad.py in --debug mode for one epoch with
    --round_to 8 so padded sequence lengths are rounded up to multiples
    of 8. check_call raises CalledProcessError on a non-zero exit, which
    fails the test; no further assertions needed.
    """
    arguments = ['--optimizer', 'adam', '--batch_size', '32',
                 '--gpu', '--epochs', '1', '--debug', '--max_seq_length', '32',
                 '--max_query_length', '8', '--doc_stride', '384', '--round_to', '8']
    # check_call returns 0 or raises, so there is nothing useful to bind.
    subprocess.check_call([sys.executable, './scripts/bert/finetune_squad.py']
                          + arguments)
    # Brief pause to let the child process release GPU memory before the
    # next serial test starts (matches the convention of sibling tests).
    time.sleep(5)

@pytest.mark.serial
@pytest.mark.gpu
Expand Down