Skip to content

Commit 8878765

Browse files
Merge branch 'develop' into opt_fetch
2 parents c4faefd + 32c1ec4 commit 8878765

35 files changed

+777
-1026
lines changed

cmake/configure.cmake

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,6 @@ if(WITH_TESTING)
2020
add_definitions(-DPADDLE_WITH_TESTING)
2121
endif(WITH_TESTING)
2222

23-
if(WITH_INFERENCE_API_TEST)
24-
add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST)
25-
endif(WITH_INFERENCE_API_TEST)
26-
2723
if(NOT WITH_PROFILER)
2824
add_definitions(-DPADDLE_DISABLE_PROFILER)
2925
endif(NOT WITH_PROFILER)

paddle/fluid/framework/executor_gc_helper.cc

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -31,49 +31,6 @@
3131
namespace paddle {
3232
namespace framework {
3333

34-
struct OpInOutInfo {
35-
public:
36-
void Build(const OperatorBase *op) {
37-
is_built_ = true;
38-
auto &inferer = op->Info().NoNeedBufferVarsInferer();
39-
if (inferer) {
40-
no_need_buffer_ins_ = inferer(op->Inputs(), op->Outputs(), op->Attrs());
41-
42-
if (no_need_buffer_ins_.empty()) return;
43-
44-
for (auto &in_name_pair : op->Inputs()) {
45-
if (no_need_buffer_ins_.count(in_name_pair.first) != 0) {
46-
continue;
47-
}
48-
49-
for (auto &in_arg_name : in_name_pair.second) {
50-
other_args_set_.insert(in_arg_name);
51-
}
52-
}
53-
54-
for (auto &out_name_pair : op->Outputs()) {
55-
for (auto &out_arg_name : out_name_pair.second) {
56-
other_args_set_.insert(out_arg_name);
57-
}
58-
}
59-
}
60-
}
61-
62-
bool IsBuilt() const { return is_built_; }
63-
64-
bool IsInArgBufferNeeded(const std::string &in_arg_name) const {
65-
return no_need_buffer_ins_.empty() ||
66-
other_args_set_.count(in_arg_name) != 0;
67-
}
68-
69-
private:
70-
// A set to record unused buffer input vars of op
71-
std::unordered_set<std::string> no_need_buffer_ins_;
72-
// A set to record other args of op (including in, out)
73-
std::unordered_set<std::string> other_args_set_;
74-
bool is_built_{false};
75-
};
76-
7734
static bool VarCanBeDeleted(const std::string &name, const BlockDesc &block,
7835
const std::unordered_set<std::string> &skip_vars) {
7936
if (skip_vars.count(name) != 0) {

paddle/fluid/framework/executor_gc_helper.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,49 @@ class GarbageCollector;
3131
class OperatorBase;
3232
class Scope;
3333

34+
struct OpInOutInfo {
35+
public:
36+
void Build(const OperatorBase *op) {
37+
is_built_ = true;
38+
auto &inferer = op->Info().NoNeedBufferVarsInferer();
39+
if (inferer) {
40+
no_need_buffer_ins_ = inferer(op->Inputs(), op->Outputs(), op->Attrs());
41+
42+
if (no_need_buffer_ins_.empty()) return;
43+
44+
for (auto &in_name_pair : op->Inputs()) {
45+
if (no_need_buffer_ins_.count(in_name_pair.first) != 0) {
46+
continue;
47+
}
48+
49+
for (auto &in_arg_name : in_name_pair.second) {
50+
other_args_set_.insert(in_arg_name);
51+
}
52+
}
53+
54+
for (auto &out_name_pair : op->Outputs()) {
55+
for (auto &out_arg_name : out_name_pair.second) {
56+
other_args_set_.insert(out_arg_name);
57+
}
58+
}
59+
}
60+
}
61+
62+
bool IsBuilt() const { return is_built_; }
63+
64+
bool IsInArgBufferNeeded(const std::string &in_arg_name) const {
65+
return no_need_buffer_ins_.empty() ||
66+
other_args_set_.count(in_arg_name) != 0;
67+
}
68+
69+
private:
70+
// A set to record unused buffer input vars of op
71+
std::unordered_set<std::string> no_need_buffer_ins_;
72+
// A set to record other args of op (including in, out)
73+
std::unordered_set<std::string> other_args_set_;
74+
bool is_built_{false};
75+
};
76+
3477
std::unordered_map<const OperatorBase *, std::vector<std::string>>
3578
GetUnusedVars(const BlockDesc &block,
3679
const std::vector<std::unique_ptr<OperatorBase>> &ops,
Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1+
cc_library(workqueue SRCS workqueue.cc)
12
cc_library(interpretercore SRCS interpretercore.cc DEPS op_registry
23
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
34
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
4-
graph_to_program_pass variable_helper timer monitor)
5+
graph_to_program_pass variable_helper timer monitor workqueue device_event device_event_gpu)
56
cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
6-
cc_library(workqueue SRCS workqueue.cc)
7-
87
cc_test(workqueue_test SRCS workqueue_test.cc DEPS workqueue)
98
# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)

paddle/fluid/framework/new_executor/interpretercore.cc

Lines changed: 183 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,13 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414
#include "paddle/fluid/framework/new_executor/interpretercore.h"
15+
#include "paddle/fluid/framework/executor_gc_helper.h"
16+
#include "paddle/fluid/framework/new_executor/interpretercore_gc_helper.h"
17+
18+
#if defined(PADDLE_WITH_CUDA)
19+
using ::paddle::platform::kCUDA;
20+
USE_EVENT(kCUDA);
21+
#endif
1522

1623
#include <unordered_set>
1724

@@ -145,6 +152,12 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
145152
d2h_ctx_pool_({place}),
146153
h2d_ctx_pool_({place}) {
147154
is_build_ = false;
155+
156+
garbages_.reset(new GarbageQueue());
157+
max_memory_size_ = static_cast<size_t>(GetEagerDeletionThreshold());
158+
cur_memory_size_ = 0;
159+
gc_queue_ = CreateSingleThreadedWorkQueue();
160+
148161
feed_names_ = feed_names;
149162

150163
// Step1: add feedop and fetchop to main_program
@@ -215,11 +228,24 @@ void InterpreterCore::Convert() {
215228
temp_inst.input_index_ = vec_func_list_[i].input_index;
216229
temp_inst.output_index_ = vec_func_list_[i].output_index;
217230

231+
OpInOutInfo info;
232+
218233
std::vector<size_t> gc_check_input_list;
219234
for (auto& item : vec_func_list_[i].input_index) {
220235
for (auto id : item.second) {
221236
input_var2op_info_[id].push_back(i);
222-
gc_check_input_list.push_back(id);
237+
// var can be gc-ed
238+
if (!info.IsBuilt()) {
239+
info.Build(op_list_[i]);
240+
}
241+
if (global_scope_->vec_meta_info_[id].vardesc_) {
242+
if (info.IsInArgBufferNeeded(
243+
global_scope_->vec_meta_info_[id].vardesc_->Name())) {
244+
gc_check_input_list.push_back(id);
245+
}
246+
} else {
247+
gc_check_input_list.push_back(id);
248+
}
223249
}
224250
}
225251
std::sort(gc_check_input_list.begin(), gc_check_input_list.end());
@@ -236,6 +262,13 @@ void InterpreterCore::Convert() {
236262
}
237263

238264
for (size_t i = 0; i < vec_instruction_.size(); ++i) {
265+
#if defined(PADDLE_WITH_CUDA)
266+
int device_type = static_cast<int>(paddle::platform::DeviceType::CUDA);
267+
paddle::platform::DeviceOption dev_opt(
268+
device_type, BOOST_GET_CONST(platform::CUDAPlace, place_).device);
269+
gc_event_.emplace_back(dev_opt);
270+
#endif
271+
239272
std::vector<size_t> vec_temp;
240273
for (auto& item : vec_instruction_[i].output_index_) {
241274
for (auto id : item.second) {
@@ -365,11 +398,8 @@ void InterpreterCore::ExecuteInstructionList(
365398
}
366399

367400
// GC infomation
368-
369-
auto& gc_check_list = instr_node.gc_check_var_list;
370-
for (auto var_id : gc_check_list) {
371-
--working_var_ref[var_id].var_ref_count_;
372-
}
401+
CheckGC(instr_id, instr_node.gc_check_var_list, var_scope, place,
402+
working_var_ref);
373403
}
374404

375405
for (size_t i = 0; i < working_var_ref.size(); ++i) {
@@ -379,6 +409,87 @@ void InterpreterCore::ExecuteInstructionList(
379409
}
380410
}
381411

412+
void InterpreterCore::CheckGC(size_t instr_id,
413+
const std::vector<size_t>& gc_check_list,
414+
const VariableScope& var_scope,
415+
const platform::Place& place,
416+
std::vector<VariableMetaInfo>& working_var_ref) {
417+
for (auto var_id : gc_check_list) {
418+
--working_var_ref[var_id].var_ref_count_;
419+
if (var_scope.vec_meta_info_[var_id].vardesc_ &&
420+
!var_scope.vec_meta_info_[var_id].vardesc_->Persistable() &&
421+
working_var_ref[var_id].var_ref_count_ == 0) {
422+
Variable* var = var_scope.var_list[var_id];
423+
if (var->IsType<LoDTensor>()) {
424+
garbages_->emplace_back(
425+
var->GetMutable<LoDTensor>()->MoveMemoryHolder());
426+
if (garbages_->back()) {
427+
cur_memory_size_ += garbages_->back()->size();
428+
}
429+
} else if (var->IsType<SelectedRows>()) {
430+
garbages_->emplace_back(var->GetMutable<SelectedRows>()
431+
->mutable_value()
432+
->MoveMemoryHolder());
433+
if (garbages_->back()) {
434+
cur_memory_size_ += garbages_->back()->size();
435+
}
436+
} else if (var->IsType<LoDTensorArray>()) {
437+
auto* tensor_arr = var->GetMutable<LoDTensorArray>();
438+
for (auto& t : *tensor_arr) {
439+
garbages_->emplace_back(t.MoveMemoryHolder());
440+
if (garbages_->back()) {
441+
cur_memory_size_ += garbages_->back()->size();
442+
}
443+
}
444+
} else {
445+
PADDLE_THROW(platform::errors::Unimplemented(
446+
"The variable(%s) is not supported in eager deletion.",
447+
framework::ToTypeName(var->Type())));
448+
}
449+
}
450+
}
451+
452+
if (!garbages_->empty()) {
453+
if (max_memory_size_ <= 1) {
454+
#if defined(PADDLE_WITH_CUDA)
455+
auto* dev_ctx = reinterpret_cast<platform::CUDADeviceContext*>(
456+
platform::DeviceContextPool::Instance().Get(place));
457+
gc_event_[instr_id].Record(place, dev_ctx);
458+
gc_queue_->AddTask(
459+
[ container = garbages_.release(), event = &gc_event_[instr_id] ]() {
460+
while (!event->Query()) {
461+
continue;
462+
}
463+
delete container;
464+
});
465+
garbages_.reset(new GarbageQueue());
466+
#else
467+
delete garbages_.release();
468+
garbages_.reset(new GarbageQueue());
469+
#endif
470+
} else if (cur_memory_size_ >= max_memory_size_) {
471+
#if defined(PADDLE_WITH_CUDA)
472+
auto* dev_ctx = reinterpret_cast<platform::CUDADeviceContext*>(
473+
platform::DeviceContextPool::Instance().Get(place));
474+
gc_event_[instr_id].Record(place, dev_ctx);
475+
gc_queue_->AddTask(
476+
[ container = garbages_.release(), event = &gc_event_[instr_id] ]() {
477+
while (!event->Query()) {
478+
continue;
479+
}
480+
delete container;
481+
});
482+
garbages_.reset(new GarbageQueue());
483+
cur_memory_size_ = 0;
484+
#else
485+
delete garbages_.release();
486+
garbages_.reset(new GarbageQueue());
487+
cur_memory_size_ = 0;
488+
#endif
489+
}
490+
}
491+
}
492+
382493
std::vector<size_t> InterpreterCore::MergeVector(
383494
const std::vector<size_t>& first, const std::vector<size_t>& second) {
384495
std::vector<size_t> out(first.size() + second.size());
@@ -407,6 +518,11 @@ void InterpreterCore::BuildVariableScope(const framework::ProgramDesc& pdesc,
407518
auto v = new Variable();
408519
InitializeVariable(v, var->GetType());
409520
var_scope->var_list.push_back(v);
521+
522+
VariableMetaInfo info;
523+
info.var_ref_count_ = 0;
524+
info.vardesc_ = var;
525+
var_scope->vec_meta_info_.push_back(info);
410526
}
411527
}
412528
}
@@ -419,6 +535,7 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
419535
auto& global_block = pdesc.Block(0);
420536
auto& all_op_kernels = OperatorWithKernel::AllOpKernels();
421537

538+
std::vector<OperatorBase*> ops;
422539
for (auto& op : global_block.AllOps()) {
423540
VLOG(3) << "Build OpFuncNode from : " << op->Type();
424541

@@ -434,6 +551,20 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
434551
// step 1. Prepare VariableValueMap of input/output
435552
auto op_base =
436553
info.Creator()(op->Type(), inputs_names, outputs_names, op_attr_map);
554+
ops.push_back(op_base);
555+
}
556+
557+
auto unused_var_map = get_unused_vars(global_block, ops);
558+
559+
size_t ops_index = 0;
560+
for (auto& op : global_block.AllOps()) {
561+
VLOG(3) << op->Type();
562+
// << op->Type() << endl;
563+
564+
auto op_base = ops[ops_index++];
565+
566+
auto inputs_names = op->Inputs();
567+
auto outputs_names = op->Outputs();
437568

438569
VariableValueMap ins_map;
439570
std::map<std::string, std::vector<int>> ins_name2id;
@@ -542,6 +673,11 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
542673
var_scope->name2id[new_var_name] = var_scope->var_list.size();
543674
var_scope->var_list.push_back(v);
544675

676+
VariableMetaInfo info;
677+
info.var_ref_count_ = 0;
678+
info.vardesc_ = nullptr;
679+
var_scope->vec_meta_info_.push_back(info);
680+
545681
VariableNameMap copy_in_map;
546682
auto x_iter = inputs_names.find(var_name_item.first);
547683
copy_in_map["X"] = {x_iter->second[i]};
@@ -647,6 +783,47 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
647783
op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
648784
op_func_node.kernel_func_(exec_ctx);
649785
vec_func_list->push_back(op_func_node);
786+
787+
// gc---------------------------------------------------------------------------
788+
auto iter = unused_var_map.find(op_base);
789+
if (iter == unused_var_map.end()) {
790+
continue;
791+
}
792+
793+
auto& delete_vars = iter->second;
794+
std::deque<std::shared_ptr<memory::Allocation>>* garbages =
795+
new std::deque<std::shared_ptr<memory::Allocation>>();
796+
797+
for (auto& var_name : delete_vars) {
798+
auto it = var_scope->name2id.find(var_name);
799+
assert(it != var_scope->name2id.end());
800+
auto* var = var_scope->var_list[it->second];
801+
if (var == nullptr) {
802+
continue;
803+
}
804+
805+
VLOG(2) << "Erase variable " << var_name;
806+
if (var->IsType<LoDTensor>()) {
807+
garbages->emplace_back(
808+
var->GetMutable<LoDTensor>()->MoveMemoryHolder());
809+
} else if (var->IsType<SelectedRows>()) {
810+
garbages->emplace_back(var->GetMutable<SelectedRows>()
811+
->mutable_value()
812+
->MoveMemoryHolder());
813+
} else if (var->IsType<LoDTensorArray>()) {
814+
auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
815+
for (auto& t : *lod_tensor_arr) {
816+
garbages->emplace_back(t.MoveMemoryHolder());
817+
}
818+
} else {
819+
PADDLE_THROW(platform::errors::Unimplemented(
820+
"Type %s of variable %s is not supported eager deletion.",
821+
framework::ToTypeName(var->Type()), var_name));
822+
}
823+
}
824+
825+
delete garbages; // free mem
826+
650827
VLOG(3) << "run " << op_base->Type() << " done.";
651828
}
652829
}

0 commit comments

Comments
 (0)