Skip to content
This repository was archived by the owner on Jan 15, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions scripts/bert/finetune_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,11 @@
type=int,
default=128,
help='Maximum length of the sentence pairs')
parser.add_argument(
'--round_to', type=int, default=None,
help='The length of padded sequences will be rounded up to be multiple of this argument.'
'When round to is set to 8, training throughput may increase for mixed precision'
'training on GPUs with tensorcores.')
parser.add_argument(
'--seed', type=int, default=2, help='Random seed')
parser.add_argument(
Expand Down Expand Up @@ -379,8 +384,8 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab)
# bucket sampler for training
pad_val = vocabulary[vocabulary.padding_token]
batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(axis=0, pad_val=pad_val), # input
nlp.data.batchify.Pad(axis=0, pad_val=0), # segment
nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to), # input
nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to), # segment
nlp.data.batchify.Stack(), # length
nlp.data.batchify.Stack(label_dtype)) # label
batch_sampler = nlp.data.sampler.FixedBucketSampler(data_train_len, batch_size=batch_size,
Expand All @@ -400,9 +405,10 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab)
loader_dev_list.append((segment, loader_dev))

# batchify for data test
test_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0, pad_val=pad_val),
nlp.data.batchify.Pad(axis=0, pad_val=0),
nlp.data.batchify.Stack())
test_batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),
nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to),
nlp.data.batchify.Stack())
# transform for data test
test_trans = partial(convert_examples_to_features, tokenizer=tokenizer, truncate_length=max_len,
cls_token=vocab.cls_token if not use_roberta else vocab.bos_token,
Expand Down
16 changes: 8 additions & 8 deletions scripts/bert/finetune_squad.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,10 @@
'than this will be padded. default is 384')

parser.add_argument(
'--pad',
action='store_true',
help='Whether to pad to maximum length when preparing data batches. Default is False.')
'--round_to', type=int, default=None,
help='The length of padded sequences will be rounded up to be multiple of this argument.'
'When round to is set to 8, training throughput may increase for mixed precision'
'training on GPUs with tensorcores.')

parser.add_argument('--doc_stride',
type=int,
Expand Down Expand Up @@ -304,7 +305,6 @@
null_score_diff_threshold = args.null_score_diff_threshold

max_seq_length = args.max_seq_length
pad = args.pad
doc_stride = args.doc_stride
max_query_length = args.max_query_length
n_best_size = args.n_best_size
Expand Down Expand Up @@ -343,8 +343,8 @@

batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Stack(),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
nlp.data.batchify.Stack('float32'),
nlp.data.batchify.Stack('float32'),
nlp.data.batchify.Stack('float32'))
Expand Down Expand Up @@ -559,8 +559,8 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode):
log.info('Number of records in dev data:{}'.format(len(dev_data)))

batchify_fn_calib = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
nlp.data.batchify.Stack('float32'),
nlp.data.batchify.Stack('float32'))

Expand Down
19 changes: 10 additions & 9 deletions scripts/language_model/run_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,12 @@
type=int,
default=128,
help='Maximum length of the sentence pairs')

parser.add_argument(
'--pad',
default=True,
action='store_true',
help='Whether to pad to maximum length when preparing data batches. '
'Have to be true currently due to left padding')
'--round_to', type=int, default=None,
help='The length of padded sequences will be rounded up to be multiple of this argument.'
'When round to is set to 8, training throughput may increase for mixed precision'
'training on GPUs with tensorcores.')

parser.add_argument(
'--only_inference',
Expand Down Expand Up @@ -263,9 +263,9 @@ def preprocess_data(_tokenizer,
# bucket sampler for training
pad_val = _vocab[_vocab.padding_token]
batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(axis=0, pad_val=pad_val), # input
nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to), # input
nlp.data.batchify.Stack(), # length
nlp.data.batchify.Pad(axis=0, pad_val=4), # segment
nlp.data.batchify.Pad(axis=0, pad_val=4, round_to=args.round_to), # segment
nlp.data.batchify.Stack(label_dtype)) # label
batch_sampler = nlp.data.sampler.FixedBucketSampler(data_train_len,
batch_size=batch_size,
Expand Down Expand Up @@ -293,8 +293,9 @@ def preprocess_data(_tokenizer,

# batchify for data test
test_batchify_fn = nlp.data.batchify.Tuple(
nlp.data.batchify.Pad(axis=0, pad_val=pad_val),
nlp.data.batchify.Stack(), nlp.data.batchify.Pad(axis=0, pad_val=0))
nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),
nlp.data.batchify.Stack(),
nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to))

# transform for data test
test_trans = partial(convert_examples_to_features,
Expand Down
23 changes: 23 additions & 0 deletions scripts/tests/test_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,29 @@ def test_bert_icsl():
process = subprocess.check_call([sys.executable, script] + arguments)
time.sleep(5)

@pytest.mark.serial
@pytest.mark.gpu
@pytest.mark.remote_required
@pytest.mark.integration
@pytest.mark.parametrize('dataset', ['MRPC'])
def test_xlnet_finetune_glue_with_round_to(dataset):
    """Smoke-test XLNet GLUE fine-tuning with the new --round_to option.

    Launches run_glue.py for one epoch with --round_to 8 so the padded
    batch lengths are rounded up to multiples of 8 (the tensor-core
    friendly setting). check_call raises CalledProcessError on a
    non-zero exit, which fails the test; no further assertions needed.
    """
    arguments = ['--batch_size', '32', '--task_name', dataset,
                 '--gpu', '1', '--epochs', '1', '--max_len', '32', '--round_to', '8']
    # check_call returns 0 or raises, so there is nothing useful to bind.
    subprocess.check_call([sys.executable, './scripts/language_model/run_glue.py']
                          + arguments)
    # Brief pause to let the child process release GPU memory before the
    # next serial test starts (matches the convention of sibling tests).
    time.sleep(5)

@pytest.mark.serial
@pytest.mark.gpu
@pytest.mark.remote_required
@pytest.mark.integration
def test_finetune_squad_with_round_to():
    """Smoke-test BERT SQuAD fine-tuning with the new --round_to option.

    Launches finetune_squad.py in --debug mode for one epoch with
    --round_to 8 so padded sequence lengths are rounded up to multiples
    of 8. check_call raises CalledProcessError on a non-zero exit, which
    fails the test; no further assertions needed.
    """
    arguments = ['--optimizer', 'adam', '--batch_size', '32',
                 '--gpu', '--epochs', '1', '--debug', '--max_seq_length', '32',
                 '--max_query_length', '8', '--doc_stride', '384', '--round_to', '8']
    # check_call returns 0 or raises, so there is nothing useful to bind.
    subprocess.check_call([sys.executable, './scripts/bert/finetune_squad.py']
                          + arguments)
    # Brief pause to let the child process release GPU memory before the
    # next serial test starts (matches the convention of sibling tests).
    time.sleep(5)

@pytest.mark.serial
@pytest.mark.gpu
Expand Down