6 changes: 5 additions & 1 deletion benchmark/fluid/README.md
@@ -24,10 +24,14 @@ Currently supported `--model` arguments include:

* Run the following command to start a benchmark job locally:
```bash
python fluid_benchmark.py --model mnist --device GPU
python fluid_benchmark.py --model mnist --device GPU
```
You can choose to use GPU/CPU training. With GPU training, you can specify
`--gpus <gpu_num>` to run multi GPU training.
You can enable gradient clipping. With gradient clipping, you can specify
`--gradient_clip_method GlobalNorm` to clip gradients by their global norm.
You can add a regularizer to the optimizer. With regularization, you can specify
`--weight_decay_regularizer_method L1` to apply L1 weight-decay regularization.
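For example, an illustrative command that combines gradient clipping with L1 regularization might look like this (the flag names are taken from the `parse_args` additions in `fluid_benchmark.py`; this command is not part of the original README):
```bash
python fluid_benchmark.py --model mnist --device GPU \
    --gradient_clip_method GlobalNorm --gradient_clip_norm 1.0 \
    --weight_decay_regularizer_method L1 --weight_decay_regularizer_coeff 0.1
```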
* Run distributed training with parameter servers:
* start parameter servers:
```bash
69 changes: 69 additions & 0 deletions benchmark/fluid/fluid_benchmark.py
@@ -39,8 +39,67 @@ def parse_args():
help='The model to run benchmark with.')
parser.add_argument(
'--batch_size', type=int, default=32, help='The minibatch size.')
# args related to learning rate
parser.add_argument(
'--learning_rate', type=float, default=0.001, help='The learning rate.')
parser.add_argument(
'--learning_rate_decay_method',
type=str,
default=None,
choices=['exponential', 'natural_exp', 'inverse_time'],
help='Learning rate decay method, can be exponential, natural_exp, inverse_time'
)
parser.add_argument(
'--learning_rate_decay_steps',
type=int,
default=100000,
help='Decay steps for learning rate decay method')
parser.add_argument(
'--learning_rate_decay_rate',
type=float,
default=0.5,
help='Decay rate for learning rate decay method')
# args related to regularization
parser.add_argument(
'--weight_decay_regularizer_method',
type=str,
default=None,
choices=['L1', 'L2'],
help='Weight decay regularizer method, can be L1, L2')
parser.add_argument(
'--weight_decay_regularizer_coeff',
type=float,
default=0.1,
help='Weight decay regularizer coefficient, 0.1 by default')
# args related to gradient clipping
parser.add_argument(
'--gradient_clip_method',
type=str,
default=None,
choices=['Norm', 'GlobalNorm'],
help='Gradient clipping method, can be Norm, GlobalNorm')
parser.add_argument(
'--gradient_clip_norm',
type=float,
default=1.,
help='Gradient clipping norm, 1.0 by default')
# args related to error clipping
parser.add_argument(
'--error_clip_method',
type=str,
default=None,
choices=['Value'],
help='Error clipping method, can be Value')
parser.add_argument(
'--error_clip_min',
type=float,
default=1e-6,
help='Error clipping min value, 1e-6 by default')
parser.add_argument(
'--error_clip_max',
type=float,
default=2e-6,
help='Error clipping max value, 2e-6 by default')
# TODO(wuyi): add "--use_fake_data" option back.
parser.add_argument(
'--skip_batch_num',
@@ -103,6 +162,16 @@ def parse_args():
default='local',
choices=['local', 'pserver', 'nccl2'],
help='Choose parameter update method, can be local, pserver, nccl2.')
parser.add_argument(
'--no_split_var',
action='store_true',
default=False,
help='Whether to split variables into blocks when update_method is pserver')
parser.add_argument(
'--async_mode',
action='store_true',
default=False,
help='Whether to start the pserver in async mode to support ASGD')
args = parser.parse_args()
return args

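A hypothetical invocation that exercises the new learning-rate decay and error-clipping arguments could look like this (flag names come from the `parse_args` additions above; the values shown simply make the defaults explicit):
```bash
python fluid_benchmark.py --model mnist --device GPU \
    --learning_rate 0.001 \
    --learning_rate_decay_method exponential \
    --learning_rate_decay_steps 100000 \
    --learning_rate_decay_rate 0.5 \
    --error_clip_method Value \
    --error_clip_min 1e-6 --error_clip_max 2e-6
```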
24 changes: 21 additions & 3 deletions benchmark/fluid/models/machine_translation.py
@@ -26,6 +26,10 @@
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.fluid.executor import Executor
from models.model_base import get_decay_learning_rate
Contributor: model_base is not uploaded?

Collaborator (Author): Thanks for the review; I added the benchmark/fluid/models/model_base.py file in the next commit.

from models.model_base import get_regularization
from models.model_base import set_error_clip
from models.model_base import set_gradient_clip
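The `models/model_base.py` helpers imported above are not part of this diff (per the thread above, the file was added in a later commit). Below is a minimal sketch of what they might look like, inferred from the call sites in this PR and from Fluid APIs of that era; treat every detail as an assumption rather than the PR's actual code.

```python
# Hypothetical reconstruction of benchmark/fluid/models/model_base.py,
# inferred from how its helpers are called in this PR. The real file was
# added in a later commit and may differ.
import paddle.fluid as fluid


def get_decay_learning_rate(decay_method, learning_rate, decay_steps,
                            decay_rate):
    # No decay method requested: fall back to the constant learning rate.
    if not decay_method:
        return learning_rate
    decay_fn = {
        'exponential': fluid.layers.exponential_decay,
        'natural_exp': fluid.layers.natural_exp_decay,
        'inverse_time': fluid.layers.inverse_time_decay,
    }[decay_method]
    return decay_fn(
        learning_rate=learning_rate,
        decay_steps=decay_steps,
        decay_rate=decay_rate)


def get_regularization(regularizer_method, regularizer_coeff):
    # Returning None leaves the optimizer's `regularization` argument at its
    # default, so regularization stays disabled.
    if not regularizer_method:
        return None
    if regularizer_method == 'L1':
        return fluid.regularizer.L1DecayRegularizer(
            regularization_coeff=regularizer_coeff)
    return fluid.regularizer.L2DecayRegularizer(
        regularization_coeff=regularizer_coeff)


def set_gradient_clip(clip_method, clip_norm=1.):
    # Matches the snippet quoted in the review thread: a falsy method name
    # disables clipping entirely.
    if not clip_method:
        return None
    if clip_method == 'Norm':
        clip = fluid.clip.GradientClipByNorm(clip_norm=clip_norm)
    else:  # 'GlobalNorm'
        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm)
    fluid.clip.set_gradient_clip(clip)


def set_error_clip(clip_method, var_name, clip_min, clip_max):
    # Only 'Value' clipping is exposed on the command line. Attach an
    # ErrorClipByValue to the named variable; note that how a clip is attached
    # to a Variable changed across Fluid releases, so the last line here is an
    # assumption, not necessarily the PR's actual code.
    if not clip_method:
        return None
    var = fluid.default_main_program().current_block().var(var_name)
    var.error_clip = fluid.clip.ErrorClipByValue(max=clip_max, min=clip_min)
```

In the model files below, each call site passes the corresponding command-line arguments, so leaving a flag at its default (None) disables that feature.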


def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
@@ -50,7 +54,7 @@ def linear(inputs):


def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
target_dict_dim, is_generating, beam_size, max_length):
target_dict_dim, is_generating, beam_size, max_length, args):
"""Construct a seq2seq network."""

def bi_lstm_encoder(input_seq, gate_size):
@@ -99,6 +103,8 @@ def bi_lstm_encoder(input_seq, gate_size):
size=decoder_size,
bias_attr=False,
act='tanh')
set_error_clip(args.error_clip_method, encoded_proj.name,
args.error_clip_min, args.error_clip_max)

def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
decoder_boot, decoder_size):
@@ -211,12 +217,24 @@ def get_model(args):
dict_size,
False,
beam_size=beam_size,
max_length=max_length)
max_length=max_length,
args=args)

# clone from default main program
inference_program = fluid.default_main_program().clone()

optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
# set gradient clip
set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm)
Contributor: Is there a way to disable these settings if the args are empty?

Collaborator (Author): If `clip_method` in args is None, these settings are disabled; when the user does not specify `--gradient_clip_method`, the arg defaults to None. The code looks like this:

```python
def set_gradient_clip(clip_method, clip_norm=1.):
    if not clip_method:
        return None
```


optimizer = fluid.optimizer.Adam(
learning_rate=get_decay_learning_rate(
decay_method=args.learning_rate_decay_method,
learning_rate=args.learning_rate,
decay_steps=args.learning_rate_decay_steps,
decay_rate=args.learning_rate_decay_rate),
regularization=get_regularization(
regularizer_method=args.weight_decay_regularizer_method,
regularizer_coeff=args.weight_decay_regularizer_coeff))

train_batch_generator = paddle.batch(
paddle.reader.shuffle(
26 changes: 23 additions & 3 deletions benchmark/fluid/models/mnist.py
@@ -24,6 +24,10 @@
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from models.model_base import get_decay_learning_rate
from models.model_base import get_regularization
from models.model_base import set_error_clip
from models.model_base import set_gradient_clip

SEED = 1
DTYPE = "float32"
@@ -32,7 +36,7 @@
# fluid.default_startup_program().random_seed = SEED


def cnn_model(data):
def cnn_model(data, args):
conv_pool_1 = fluid.nets.simple_img_conv_pool(
input=data,
filter_size=5,
@@ -48,6 +52,9 @@ def cnn_model(data):
pool_stride=2,
act="relu")

set_error_clip(args.error_clip_method, conv_pool_1.name,
args.error_clip_min, args.error_clip_max)

# TODO(dzhwinter) : refine the initializer and random seed setting
SIZE = 10
input_shape = conv_pool_2.shape
@@ -70,7 +77,8 @@ def get_model(args):
label = fluid.layers.data(name='label', shape=[1], dtype='int64')

# Train program
predict = cnn_model(images)
predict = cnn_model(images, args)

cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)

@@ -82,9 +90,21 @@ def get_model(args):
# inference program
inference_program = fluid.default_main_program().clone()

# set gradient clip
# set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm)

# Optimization
opt = fluid.optimizer.AdamOptimizer(
learning_rate=0.001, beta1=0.9, beta2=0.999)
learning_rate=get_decay_learning_rate(
decay_method=args.learning_rate_decay_method,
learning_rate=0.001,
decay_steps=args.learning_rate_decay_steps,
decay_rate=args.learning_rate_decay_rate),
regularization=get_regularization(
regularizer_method=args.weight_decay_regularizer_method,
regularizer_coeff=args.weight_decay_regularizer_coeff),
beta1=0.9,
beta2=0.999)

# Reader
train_reader = paddle.batch(
28 changes: 24 additions & 4 deletions benchmark/fluid/models/resnet.py
@@ -26,6 +26,10 @@
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.profiler as profiler
from models.model_base import get_decay_learning_rate
from models.model_base import get_regularization
from models.model_base import set_error_clip
from models.model_base import set_gradient_clip


def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
@@ -70,7 +74,7 @@ def layer_warp(block_func, input, ch_out, count, stride):
return res_out


def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
def resnet_imagenet(input, class_dim, args, depth=50, data_format='NCHW'):

cfg = {
18: ([2, 2, 2, 1], basicblock),
@@ -94,10 +98,12 @@ def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
pool_stride=1,
global_pooling=True)
out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
set_error_clip(args.error_clip_method, out.name, args.error_clip_min,
args.error_clip_max)
return out


def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
def resnet_cifar10(input, class_dim, args, depth=32, data_format='NCHW'):
assert (depth - 2) % 6 == 0

n = (depth - 2) // 6
@@ -110,6 +116,8 @@ def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
pool = fluid.layers.pool2d(
input=res3, pool_size=8, pool_type='avg', pool_stride=1)
out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
set_error_clip(args.error_clip_method, out.name, args.error_clip_min,
args.error_clip_max)
return out


@@ -132,7 +140,7 @@ def get_model(args):

input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
predict = model(input, class_dim)
predict = model(input, class_dim, args=args)
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)

@@ -145,7 +153,19 @@ def get_model(args):
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc, batch_size_tensor])

optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
# set gradient clip
set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm)

optimizer = fluid.optimizer.Momentum(
learning_rate=get_decay_learning_rate(
decay_method=args.learning_rate_decay_method,
learning_rate=0.01,
decay_steps=args.learning_rate_decay_steps,
decay_rate=args.learning_rate_decay_rate),
regularization=get_regularization(
regularizer_method=args.weight_decay_regularizer_method,
regularizer_coeff=args.weight_decay_regularizer_coeff),
momentum=0.9)

train_reader = paddle.batch(
paddle.reader.shuffle(
20 changes: 19 additions & 1 deletion benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -28,6 +28,10 @@
import paddle.fluid as fluid
import paddle.batch as batch
import paddle.fluid.profiler as profiler
from models.model_base import get_decay_learning_rate
from models.model_base import get_regularization
from models.model_base import set_error_clip
from models.model_base import set_gradient_clip

word_dict = imdb.word_dict()

@@ -55,6 +59,9 @@ def get_model(args):

sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')

set_error_clip(args.error_clip_method, sentence.name, args.error_clip_min,
args.error_clip_max)

rnn = fluid.layers.DynamicRNN()
with rnn.block():
word = rnn.step_input(sentence)
@@ -110,7 +117,18 @@ def gate_common(
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc, batch_size_tensor])

adam = fluid.optimizer.Adam()
# set gradient clip
set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm)

adam = fluid.optimizer.Adam(
learning_rate=get_decay_learning_rate(
decay_method=args.learning_rate_decay_method,
learning_rate=0.001,
decay_steps=args.learning_rate_decay_steps,
decay_rate=args.learning_rate_decay_rate),
regularization=get_regularization(
regularizer_method=args.weight_decay_regularizer_method,
regularizer_coeff=args.weight_decay_regularizer_coeff))

train_reader = batch(
paddle.reader.shuffle(
23 changes: 20 additions & 3 deletions benchmark/fluid/models/vgg.py
@@ -22,9 +22,13 @@
import paddle.fluid.core as core
import argparse
import functools
from models.model_base import get_decay_learning_rate
from models.model_base import get_regularization
from models.model_base import set_error_clip
from models.model_base import set_gradient_clip


def vgg16_bn_drop(input):
def vgg16_bn_drop(input, args):
def conv_block(input, num_filter, groups, dropouts):
return fluid.nets.img_conv_group(
input=input,
@@ -48,6 +52,8 @@ def conv_block(input, num_filter, groups, dropouts):
bn = fluid.layers.batch_norm(input=fc1, act='relu')
drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
set_error_clip(args.error_clip_method, fc1.name, args.error_clip_min,
args.error_clip_max)
return fc2


@@ -70,7 +76,7 @@ def get_model(args):
label = fluid.layers.data(name='label', shape=[1], dtype='int64')

# Train program
net = vgg16_bn_drop(images)
net = vgg16_bn_drop(images, args=args)
predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
@@ -86,8 +92,19 @@ def get_model(args):
inference_program = fluid.io.get_inference_program(
target_vars=[batch_acc, batch_size_tensor])

# set gradient clip
set_gradient_clip(args.gradient_clip_method, args.gradient_clip_norm)

# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=get_decay_learning_rate(
decay_method=args.learning_rate_decay_method,
learning_rate=args.learning_rate,
decay_steps=args.learning_rate_decay_steps,
decay_rate=args.learning_rate_decay_rate),
regularization=get_regularization(
regularizer_method=args.weight_decay_regularizer_method,
regularizer_coeff=args.weight_decay_regularizer_coeff))

# data reader
train_reader = paddle.batch(