Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
5c27c2c
[NPU]shard index op for npu (#35281)
sljlp Sep 1, 2021
5baccfd
add support ops for quantization (#35312)
juncaipeng Sep 1, 2021
7ca28bb
support KL label smooth (#35177)
QingshuChen Sep 1, 2021
3c21f26
Stablize depthwise conv (#35161)
jerrywgz Sep 1, 2021
b53887f
fix bug:When axes in paddle.slice is a tuple, an error occurs. (#35267)
hbwx24 Sep 1, 2021
b24f84c
[Dy2stat]modify dy2stat error message in compile time (#35320)
0x45f Sep 1, 2021
7f17f9a
bugfix for mp accuracy (#35326)
JZ-LIANG Sep 1, 2021
5eefc8c
[NPU] skip NPU UT if no npu files changed, test=develop (#35338)
qili93 Sep 1, 2021
4f54891
add input and output description docs for vision transform (#34926)
LielinJiang Sep 1, 2021
5fa7d9c
support setting linewidth when printing tensor (#35175)
zhiqiu Sep 1, 2021
7743cdf
add strided_slice_grad op for npu (#35204)
baoachun Sep 1, 2021
c56d697
modify fetch logic, use D2H Stream (#35191)
wanghuancoder Sep 1, 2021
bee511d
[NPU] set constant before copy data (#35335)
FeixLiu Sep 1, 2021
264ff9e
[HybridParallel]Support finetinue model for PipelineParallel (#35287)
ForFishes Sep 1, 2021
a647b80
[HeterPs] merge dense && data norm && g2sum (#35029)
Thunderbrook Sep 1, 2021
070cab1
Added slice BF16/FP32 FWD/BWD kernels (#34332)
jakpiase Sep 1, 2021
df57df9
add AsExtra for grid_sampler_op (#35339)
LielinJiang Sep 1, 2021
280d742
[npu] add update_loss_scaling npu min value (#35270)
Baibaifan Sep 2, 2021
a622b70
[Auto Parallel] Logical Partition & Dist Op (#35117)
JZ-LIANG Sep 2, 2021
25871e0
add axis check for elementwise op while the dimension of x is equal t…
wangxinxin08 Sep 2, 2021
b28cc73
fix static error in summary (#35303)
wangna11BD Sep 2, 2021
8525dd1
add npu code not exec linux/windows cases (#35363)
lelelelelez Sep 2, 2021
6e638d7
Refactor transpose cuda kernel impl. (#35308)
limin2021 Sep 2, 2021
67ed7e1
[hybrid] [npu] fit npu nan/inf check (#35171)
FeixLiu Sep 2, 2021
e57a88b
[NPU] Add label_smooth_op (#34828)
Ray2020BD Sep 2, 2021
7e5fb46
Add SVD Op and it's GPU and CPU kernel (#34953)
2742195759 Sep 2, 2021
bb63396
[NPU] Support npu kernel for gather_nd op (#34800)
JZZ-NOTE Sep 2, 2021
ba6a312
add log_softmax_op_npu (#35006)
juneweng Sep 3, 2021
648e377
add AsExtra() mark for layer_norm (#35415)
zoooo0820 Sep 3, 2021
e913796
[NPU] Add elementwise_pow_grad npu op (#35278)
wjj19950828 Sep 3, 2021
668bfb3
[NPU] add 32 extra bytes for npu memory slot (#35347)
zhiqiu Sep 3, 2021
4fe1bb4
fix mean/variance when is_test=True (#35328)
ceci3 Sep 3, 2021
fc8d46c
add lookup_table_v2_op AsExtra (#35265)
juneweng Sep 3, 2021
ef7bc36
Gather op (#35353)
juneweng Sep 3, 2021
a6cc567
[Dy2Stat]Modify dy2stat error message in runtime and format error mes…
0x45f Sep 3, 2021
ccd42db
fix flatten infershape (#35321)
danleifeng Sep 3, 2021
e77fd2e
tensor formatter (#35399)
Thunderbrook Sep 3, 2021
cec1abc
[NPU] update npu ut skil scirpts, test=develop (#35360)
qili93 Sep 3, 2021
0712611
fix a quantization bug (#35407)
XGZhang11 Sep 3, 2021
6cdc1dc
add AsExtra to the reshape op (#35358)
zmxdream Sep 3, 2021
42d3650
add AsExtra to concat op (#35380)
zmxdream Sep 3, 2021
8ba58eb
add AsExtra to partial_sum op (#35381)
zmxdream Sep 3, 2021
8305ba3
fix bn_infer and optimize momentum for kunlun (#35250)
tangzhiyi11 Sep 3, 2021
a9dfebb
[NPU]add conv2d_transpose npu op (#35232)
Sep 3, 2021
c171eca
Unify the implementation of AlignedVector and simplify the codes of d…
Xreki Sep 3, 2021
b333dac
[iscan] bugfix: DLTP-33615 / DLTP-33953 / DLTP-33968 / DLTP-34166 (#3…
WorgenZhang Sep 3, 2021
f13dcfb
Add AsExtra for transpose, lstm, gru (#35317)
joey12300 Sep 3, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ ELSE ()
ENDIF()

SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210818")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210830")
SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
Expand Down
1 change: 1 addition & 0 deletions cmake/operators.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ function(op_library TARGET)
list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc")
list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc")
list(REMOVE_ITEM hip_srcs "cholesky_op.cu")
list(REMOVE_ITEM hip_srcs "svd_op.cu")
list(REMOVE_ITEM hip_srcs "multinomial_op.cu")
list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu")
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS}
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/framework/distributed_strategy.proto
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ message PipelineConfig {
optional int32 micro_batch_size = 1 [ default = 1 ];
optional int32 accumulate_steps = 2 [ default = 1 ];
optional string schedule_mode = 3 [ default = '1F1B' ];
optional bool p2p_cache_shape = 4 [ default = true ];
}

message TensorParallelConfig {
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class Optimizer {
if (w < optimizer_config::min_bound) w = optimizer_config::min_bound;
if (w > optimizer_config::max_bound) w = optimizer_config::max_bound;

add_g2sum = scaled_grad * scaled_grad;
add_g2sum += scaled_grad * scaled_grad;

g2sum += add_g2sum;
}
Expand All @@ -64,7 +64,7 @@ class Optimizer {
w[i] = optimizer_config::mf_min_bound;
if (w[i] > optimizer_config::mf_max_bound)
w[i] = optimizer_config::mf_max_bound;
add_g2sum = scaled_grad * scaled_grad;
add_g2sum += scaled_grad * scaled_grad;
}

g2sum += add_g2sum / n;
Expand Down
17 changes: 4 additions & 13 deletions paddle/fluid/framework/new_executor/interpretercore.cc
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
main_program_(main_prog),
global_scope_(global_scope),
d2h_ctx_pool_({place}),
h2d_ctx_pool_({place}),
fetch_context_pool_({place}) {
h2d_ctx_pool_({place}) {
is_build_ = false;

garbages_.reset(new GarbageQueue());
Expand Down Expand Up @@ -339,9 +338,6 @@ void InterpreterCore::BuildInstructionCtx(Instruction* instr_node,
new RuntimeInferShapeContext(*op_base, *instr_node->runtime_ctx_.get()));

auto* dev_ctx = instr_node->dev_ctx_;
if (instr_node->kernel_func_.operator_base_->Type() == "fetch_v2") {
dev_ctx = fetch_context_pool_.Get(place);
}
Scope scope;

instr_node->execution_ctx_.reset(new ExecutionContext(
Expand All @@ -356,12 +352,6 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
instr_node.kernel_func_.operator_base_)
->InferShape(instr_node.infershape_ctx_.get());

if (instr_node.kernel_func_.operator_base_->Type() == "fetch_v2") {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place_);
dev_ctx->Wait(); // TODO(wanghuancoder)
}

instr_node.kernel_func_.compute_func_(*instr_node.execution_ctx_.get());
}

Expand Down Expand Up @@ -411,8 +401,6 @@ void InterpreterCore::ExecuteInstructionList(
working_var_ref);
}

fetch_context_pool_.Get(place)->Wait();

for (size_t i = 0; i < working_var_ref.size(); ++i) {
if (working_var_ref[i].var_ref_count_ != 0) {
std::cerr << " var ref is not zero " << i << std::endl;
Expand Down Expand Up @@ -671,6 +659,9 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
expected_kernel_key);
if (!platform::is_same_place(kernel_type_for_var.place_,
expected_kernel_key.place_)) {
if (op_base->Type() == "fetch_v2") {
op_base->SetAttr("deepcopy", false);
}
// need trans place
// 1. add var in scope
// 2. add copy op
Expand Down
2 changes: 0 additions & 2 deletions paddle/fluid/framework/new_executor/interpretercore.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,6 @@ class InterpreterCore {
size_t max_memory_size_;
size_t cur_memory_size_;
std::unique_ptr<WorkQueue> gc_queue_;

platform::DeviceContextPool fetch_context_pool_;
};
} // namespace framework
} // namespace paddle
8 changes: 4 additions & 4 deletions paddle/fluid/framework/operator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1254,10 +1254,10 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
}
#endif
#ifdef PADDLE_WITH_XPU
if ((kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_) &&
!paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) ||
paddle::platform::is_in_xpu_black_list(type_)) {
if (is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(type_))) {
VLOG(3) << "missing XPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
Expand Down
36 changes: 26 additions & 10 deletions paddle/fluid/framework/ps_gpu_trainer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
trainer_desc.downpour_param().stat_var_names(i));
}
VLOG(3) << "going to initialize pull dense worker";
pull_dense_worker_ = PullDenseWorker::GetInstance();
pull_dense_worker_->Initialize(trainer_desc);
SetDebug(trainer_desc.debug());
trainer_desc_ = trainer_desc;
workers_.resize(place_num);
Expand Down Expand Up @@ -112,15 +110,21 @@ void PSGPUTrainer::InitTrainerEnv(const ProgramDesc& main_program,
}
}
}
for (auto& var : main_program.Block(0).AllVars()) {
if (var->Persistable()) {
auto it = std::find(need_merge_var_names_.begin(),
need_merge_var_names_.end(), var->Name());
if (it == need_merge_var_names_.end()) {
VLOG(2) << "train param: " << var->Name();
trainable_param_.push_back(var->Name());
}
}
}
place_ = place;
return;
}

void PSGPUTrainer::InitOtherEnv(const ProgramDesc& main_program) {
pull_dense_worker_->SetRootScope(root_scope_);
for (size_t i = 0; i < places_.size(); ++i) {
pull_dense_worker_->AddThreadScope(workers_[i]->GetThreadScope());
}
VLOG(3) << "init other env done.";
}

Expand All @@ -141,15 +145,27 @@ Scope* PSGPUTrainer::GetWorkerScope(int thread_id) { return nullptr; }
template <typename T>
void PSGPUTrainer::MergeToRootScope(LoDTensor* root_tensor, LoDTensor* tensor) {
LoDTensor tmp_root;
TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root);
TensorCopySync(*root_tensor, platform::CPUPlace(), &tmp_root);
T* tmp_root_data = tmp_root.data<T>();
LoDTensor tmp_tensor;
TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor);
TensorCopySync(*tensor, platform::CPUPlace(), &tmp_tensor);
T* data = tmp_tensor.data<T>();
for (int i = 0; i < tmp_tensor.numel(); i++) {
tmp_root_data[i] += data[i];
}
TensorCopy(tmp_root, platform::CPUPlace(), root_tensor);
TensorCopySync(tmp_root, platform::CPUPlace(), root_tensor);
}

// Copies each parameter named in trainable_param_ from the first worker's
// thread scope into the same-named variable of the root scope, keeping the
// root tensor on its current place. Only workers_[0] is read; the other
// workers' scopes are not consulted here.
//
// Variables that cannot be found in either scope are skipped with a log
// message instead of being dereferenced.
void PSGPUTrainer::MergeDenseParam() {
  // Indexing workers_[0] on an empty container is undefined behavior, so bail
  // out early when there is no worker to read from.
  if (workers_.empty()) {
    VLOG(0) << "no worker available, skip merging dense params";
    return;
  }
  auto thread_scope = workers_[0]->GetThreadScope();
  if (thread_scope == nullptr) {
    VLOG(0) << "thread scope of worker 0 is null, skip merging dense params";
    return;
  }
  for (auto& name : trainable_param_) {
    VLOG(2) << "merge var " << name << " to root scope";
    Variable* root_var = root_scope_->FindVar(name);
    Variable* var = thread_scope->FindVar(name);
    // NOTE(review): FindVar is assumed to return nullptr for an unknown name
    // -- confirm against the Scope API. Guard before calling GetMutable.
    if (root_var == nullptr || var == nullptr) {
      VLOG(0) << "skip merging var " << name << ": not found in "
              << (root_var == nullptr ? "root scope" : "thread scope");
      continue;
    }
    LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
    LoDTensor* tensor = var->GetMutable<LoDTensor>();
    // Synchronous copy: the destination place is the root tensor's own place.
    TensorCopySync((*tensor), root_tensor->place(), root_tensor);
  }
}

void PSGPUTrainer::Finalize() {
Expand Down Expand Up @@ -187,7 +203,7 @@ void PSGPUTrainer::Finalize() {
_ForEachDataType_(MergeCallback);
}
}
pull_dense_worker_->MergeDenseParam();
MergeDenseParam();
root_scope_->DropKids();
}
} // namespace framework
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/framework/trainer.h
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ class PSGPUTrainer : public TrainerBase {
}
virtual std::string GetDumpPath(int tid) { return ""; }
virtual void InitDumpEnv() {}
virtual void MergeDenseParam();

template <typename T>
void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
Expand All @@ -274,6 +275,7 @@ class PSGPUTrainer : public TrainerBase {
DownpourWorkerParameter param_;
std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
std::vector<std::string> need_merge_var_names_;
std::vector<std::string> trainable_param_;
float scale_datanorm_;
paddle::platform::Place place_;
ProgramDesc program_;
Expand Down
8 changes: 4 additions & 4 deletions paddle/fluid/imperative/prepared_operator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,10 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if ((kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_) &&
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key)) ||
paddle::platform::is_in_xpu_black_list(op.Type())) {
if (is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(op.Type()))) {
VLOG(3) << "missing XPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
Expand Down
10 changes: 6 additions & 4 deletions paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {

// For Ascend NPU
#ifdef PADDLE_WITH_ASCEND_CL
constexpr int EXTRA_PADDING_SIZE = 32;
class NPUBuddyAllocatorList {
private:
NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) {
Expand Down Expand Up @@ -257,10 +258,11 @@ class NPUBuddyAllocatorList {

std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()));
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(),
platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
Expand Down
12 changes: 9 additions & 3 deletions paddle/fluid/memory/detail/buddy_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@ namespace detail {

BuddyAllocator::BuddyAllocator(
std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
size_t max_chunk_size)
size_t max_chunk_size, size_t extra_padding_size)
: min_chunk_size_(min_chunk_size),
max_chunk_size_(max_chunk_size),
extra_padding_size_(extra_padding_size),
cache_(system_allocator->UseGpu()),
system_allocator_(std::move(system_allocator)) {}

Expand All @@ -59,9 +60,14 @@ inline size_t align(size_t size, size_t alignment) {

void* BuddyAllocator::Alloc(size_t unaligned_size) {
// adjust allocation alignment
size_t size =
align(unaligned_size + sizeof(MemoryBlock::Desc), min_chunk_size_);

size_t size =
align(unaligned_size + sizeof(MemoryBlock::Desc) + extra_padding_size_,
min_chunk_size_);
VLOG(10) << "alloc: " << unaligned_size
<< ", padding for desc: " << sizeof(MemoryBlock::Desc)
<< ", extra padding: " << extra_padding_size_
<< ", alignment: " << min_chunk_size_;
// acquire the allocator lock
std::lock_guard<std::mutex> lock(mutex_);

Expand Down
7 changes: 5 additions & 2 deletions paddle/fluid/memory/detail/buddy_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ namespace detail {
class BuddyAllocator {
public:
BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
size_t min_chunk_size, size_t max_chunk_size);
size_t min_chunk_size, size_t max_chunk_size,
size_t extra_padding_size = 0);

~BuddyAllocator();

Expand Down Expand Up @@ -86,7 +87,9 @@ class BuddyAllocator {
size_t min_chunk_size_; // the minimum size of each chunk
size_t max_chunk_size_; // the maximum size of each chunk

size_t realloc_size_ = 0; // the size of re-allocated chunk
size_t realloc_size_ = 0; // the size of re-allocated chunk
size_t extra_padding_size_ = 0; // the size of padding to the size of memory
// to alloc, especially used in NPU

private:
/**
Expand Down
23 changes: 15 additions & 8 deletions paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/npu_op_runner.h"

DECLARE_int32(min_loss_scaling);

namespace paddle {
namespace operators {

Expand Down Expand Up @@ -49,7 +51,7 @@ void Update(const platform::NPUDeviceContext& ctx,

std::vector<int> bad_out_data;
TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
if (bad_out_data[0] == decr_every_n_nan_or_inf) {
if (bad_out_data[0] >= decr_every_n_nan_or_inf) {
const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
Expand All @@ -60,13 +62,18 @@ void Update(const platform::NPUDeviceContext& ctx,

std::vector<T> new_loss_scaling;
TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
if (new_loss_scaling[0] < static_cast<T>(1)) {
float min_value = 1.0;
if (FLAGS_min_loss_scaling > 1) {
min_value = static_cast<float>(FLAGS_min_loss_scaling);
}

if (new_loss_scaling[0] < min_value) {
// updated_loss_scaling_data = 1
const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
{"scale", static_cast<float>(0)},
{"shift", static_cast<float>(1)}});
const auto& runner_p4 = NpuOpRunner(
"Power", {*pre_loss_scaling_tensor}, {*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
{"scale", static_cast<float>(0)},
{"shift", static_cast<float>(min_value)}});

runner_p4.Run(stream);
}
Expand All @@ -93,7 +100,7 @@ void Update(const platform::NPUDeviceContext& ctx,
std::vector<int> good_out_data;
TensorToVector(*good_out_tensor, ctx, &good_out_data);

if (good_out_data[0] == incr_every_n_steps) {
if (good_out_data[0] >= incr_every_n_steps) {
const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)},
Expand Down
3 changes: 2 additions & 1 deletion paddle/fluid/operators/batch_norm_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -848,7 +848,8 @@ void BatchNormGradMaker<T>::Apply(GradOpPtr<T> op) const {
}

// used when setting use_global_stats True during training
if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) {
if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats")) ||
BOOST_GET_CONST(bool, this->GetAttr("is_test"))) {
op->SetInput("Mean", this->Output("MeanOut"));
op->SetInput("Variance", this->Output("VarianceOut"));
}
Expand Down
Loading