@@ -71,8 +71,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
 
     AppendAddReaderDependencyPass();
     AppendMultiDevPass();
-    AppendMultiGraphOptPasses();
-
     AppendPassToSetMkldnnAttr("onednn_placement_pass");
     // runtime_context_cache pass should be the last pass to enable the attr of
     // all original and fused operators. But no operators can be enabled this
@@ -81,8 +79,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
8179 " runtime_context_cache_pass" );
8280 AppendPassWithCheck (strategy_.remove_unnecessary_lock_ ,
8381 " modify_op_lock_and_record_event_pass" );
84- // Note: This pass is used to check whether the multi_device_graph is right.
85- AppendPass (" multi_devices_check_pass" );
8682
8783 SetCollectiveContext ();
8884 }
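
The constructor above builds its pipeline through conditional-append helpers: AppendPassWithCheck(flag, name) registers a pass only when the corresponding BuildStrategy flag is set. A minimal sketch of that pattern follows; the class and member names are illustrative stand-ins, not Paddle's actual ir::PassBuilder API.

#include <string>
#include <vector>

// Hypothetical pass builder illustrating the conditional-append pattern
// seen in the diff; passes run in insertion order.
class PassBuilderSketch {
 public:
  // Unconditional append, as in AppendPass("multi_devices_check_pass").
  void AppendPass(const std::string &pass_name) {
    passes_.push_back(pass_name);
  }
  // Conditional append, as in
  // AppendPassWithCheck(strategy_.remove_unnecessary_lock_, "...").
  void AppendPassWithCheck(bool enabled, const std::string &pass_name) {
    if (enabled) passes_.push_back(pass_name);
  }

 private:
  std::vector<std::string> passes_;  // ordered pass pipeline
};

Because the pipeline order is insertion order, moving or deleting an Append* call, as this commit does, changes both which passes run and when they run.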
@@ -144,29 +140,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     }
   }
 
-  void AppendMultiGraphOptPasses() {
-    // NOTE: fuse_all_reduce_ops will count the number of all_reduce operators
-    // first; if the number is zero, fuse_all_reduce_ops will do nothing.
-    AppendPassWithCheck(strategy_.fuse_all_reduce_ops_,
-                        "fuse_all_reduce_op_pass");
-    AppendPrintGraphPass("multi_devices_print_pass", "_multi_devices_graph");
-
-    // Experiments show that the program will be faster if we append
-    // all_reduce_deps_pass here.
-    bool append_all_reduce_deps_pass =
-        !strategy_.enable_parallel_graph_ &&
-        (SeqOnlyAllReduceOps(strategy_) ||
-         strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce);
-    AppendPassWithCheck(append_all_reduce_deps_pass, "all_reduce_deps_pass");
-
-    bool append_backward_optimizer_op_deps_pass =
-        strategy_.num_trainers_ > 1 && !strategy_.async_mode_ &&
-        !strategy_.is_distribution_ &&
-        strategy_.enable_backward_optimizer_op_deps_;
-    AppendPassWithCheck(append_backward_optimizer_op_deps_pass,
-                        "backward_optimizer_op_deps_pass");
-  }
-
   void AppendOpFusePasses() {
     // 1. inference pass if enabled.
     AppendPassWithCheck(
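
The deleted AppendMultiGraphOptPasses() carried the gating logic for the multi-graph optimization passes. Restated as standalone predicates, with a hypothetical struct standing in for BuildStrategy (field names mirror the diff, but the struct itself is an illustration):

// Hypothetical stand-in for the BuildStrategy fields that the deleted
// AppendMultiGraphOptPasses() consulted.
struct StrategySketch {
  bool enable_parallel_graph_ = false;
  bool seq_only_all_reduce_ops = false;  // stand-in for SeqOnlyAllReduceOps()
  bool reduce_is_all_reduce = false;     // reduce_ == ReduceStrategy::kAllReduce
  bool async_mode_ = false;
  bool is_distribution_ = false;
  bool enable_backward_optimizer_op_deps_ = false;
  int num_trainers_ = 1;
};

// all_reduce_deps_pass was appended only for non-parallel-graph builds
// that were sequence-only all-reduce or in kAllReduce reduce mode.
bool WantAllReduceDeps(const StrategySketch &s) {
  return !s.enable_parallel_graph_ &&
         (s.seq_only_all_reduce_ops || s.reduce_is_all_reduce);
}

// backward_optimizer_op_deps_pass required multi-trainer, synchronous,
// non-distribution training with its feature flag enabled.
bool WantBackwardOptimizerOpDeps(const StrategySketch &s) {
  return s.num_trainers_ > 1 && !s.async_mode_ && !s.is_distribution_ &&
         s.enable_backward_optimizer_op_deps_;
}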
@@ -279,7 +252,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
                                                          &strategy_);
   }
-
   void AppendPrintGraphPass(const std::string &pass_name,
                             const std::string &debug_file_suffix) {
     if (!strategy_.debug_graphviz_path_.empty()) {
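
AppendPrintGraphPass, which survives this commit, registers a graph-dump pass only when strategy_.debug_graphviz_path_ is non-empty. A self-contained sketch of that guard, with a hypothetical logging helper in place of the real pass registration:

#include <iostream>
#include <string>

// Sketch of the debug-graphviz gating above: a no-op unless a dump path
// was configured. The real builder appends pass_name and attaches the
// path as a pass attribute; here we only illustrate the guard.
void AppendPrintGraphPassSketch(const std::string &debug_graphviz_path,
                                const std::string &pass_name,
                                const std::string &debug_file_suffix) {
  if (debug_graphviz_path.empty()) return;  // debugging disabled
  const std::string dump_path = debug_graphviz_path + debug_file_suffix;
  std::clog << "would append " << pass_name << " writing " << dump_path
            << "\n";
}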
@@ -391,66 +363,13 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
           (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
       pass->Erase(kBKCLCtxs);
       pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, bkcl_ctx);
-#endif
-    } else if (pass->Type() == "fuse_all_reduce_op_pass") {
-      pass->Erase(kNRanks);
-      pass->Set<size_t>(kNRanks, new size_t(nranks));
-      pass->Erase(kPlaces);
-      pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
-      pass->Erase(kLocalScopes);
-      pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
-                                                    &local_scopes);
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-      platform::NCCLCommunicator *nctx =
-          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
-      pass->Erase(kNCCLCtxs);
-      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
-      pass->Erase(kUseHierarchicalAllReduce);
-      pass->Set<bool>(kUseHierarchicalAllReduce,
-                      new bool(use_hierarchical_allreduce_));
-#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
-      platform::BKCLCommunicator *nctx =
-          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
-      pass->Erase(kBKCLCtxs);
-      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
-      pass->Erase(kUseHierarchicalAllReduce);
-      PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_,
-                        false,
-                        platform::errors::Unimplemented(
-                            "xpu doesn't support hierarchical_allreduce"));
-      pass->Set<bool>(kUseHierarchicalAllReduce,
-                      new bool(use_hierarchical_allreduce_));
 #endif
     } else if (pass->Type() == "coalesce_grad_tensor_pass") {
       pass->Erase(kNRanks);
       pass->Set<size_t>(kNRanks, new size_t(nranks));
     } else if (pass->Type() == "sequential_execution_pass") {
       LOG(INFO) << "set enable_sequential_execution:"
                 << enable_sequential_execution_;
-    } else if (pass->Type() == "all_reduce_deps_pass") {
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-      platform::NCCLCommunicator *nctx =
-          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
-      pass->Erase(kNCCLCtxs);
-      pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
-      pass->Erase(kUseHierarchicalAllReduce);
-      pass->Set<bool>(kUseHierarchicalAllReduce,
-                      new bool(use_hierarchical_allreduce_));
-#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
-      platform::BKCLCommunicator *nctx =
-          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
-      pass->Erase(kBKCLCtxs);
-      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
-      pass->Erase(kUseHierarchicalAllReduce);
-      PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_,
-                        false,
-                        platform::errors::Unimplemented(
-                            "xpu doesn't support hierarchical_allreduce"));
-      pass->Set<bool>(kUseHierarchicalAllReduce,
-                      new bool(use_hierarchical_allreduce_));
-#endif
-      VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
-              << ", num_trainers:" << num_trainers_;
     } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
       if (use_device != p::kCUDA) {
         VLOG(1) << "fuse_relu_depthwise_conv_pass is only supported on "
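
Every branch of BuildStrategy::Apply in this hunk follows the same attribute protocol: Erase() the attribute first, then re-install it with Set() (the pass takes ownership of a heap value) or SetNotOwned() (the pass borrows a pointer), so re-applying a strategy never leaks or double-registers an attribute. A minimal sketch of that protocol; PassSketch is hypothetical, and only the Erase/Set/SetNotOwned shape mirrors the diff:

#include <map>
#include <memory>
#include <string>

// Hypothetical pass object showing the Erase-then-Set protocol above.
class PassSketch {
 public:
  // Drop any prior binding so repeated Apply() calls start clean.
  void Erase(const std::string &name) {
    owned_.erase(name);
    borrowed_.erase(name);
  }
  // Pass takes ownership of `value`, mirroring
  // pass->Set<size_t>(kNRanks, new size_t(nranks)).
  template <typename T>
  void Set(const std::string &name, T *value) {
    owned_[name] = std::shared_ptr<void>(value);  // deleter erases as T*
  }
  // Pass only borrows `value`; the caller keeps ownership, mirroring
  // pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx).
  template <typename T>
  void SetNotOwned(const std::string &name, T *value) {
    borrowed_[name] = value;
  }

 private:
  std::map<std::string, std::shared_ptr<void>> owned_;
  std::map<std::string, void *> borrowed_;
};

Usage mirroring the hunk would be pass.Erase("nranks") followed by pass.Set<size_t>("nranks", new size_t(nranks)).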
@@ -478,12 +397,6 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
     } else if (pass->Type() == "onednn_placement_pass") {
       pass->Set("mkldnn_enabled_op_types",
                 new std::unordered_set<std::string>(mkldnn_enabled_op_types_));
-    } else if (pass->Type() == "backward_optimizer_op_deps_pass") {
-      if (use_device != p::kCUDA) {
-        VLOG(1) << "backward_optimizer_op_deps_pass is only supported on "
-                   "GPU, skipped.";
-        continue;
-      }
     }
     VLOG(1) << "Start Apply Pass " << pass->Type();
     if (FLAGS_convert_all_blocks) {
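
The deleted backward_optimizer_op_deps_pass branch used the same device-gating idiom that survives for fuse_relu_depthwise_conv_pass: log at VLOG(1) and `continue` past a pass the current device cannot run, rather than failing. A sketch of that idiom; the enum and apply loop are illustrative, not Paddle's types:

#include <iostream>
#include <string>
#include <vector>

enum class Device { kCUDA, kXPU, kCPU };  // stand-in for platform places

// Illustrative apply loop: GPU-only passes are skipped, not fatal, on
// other devices, matching the log-and-continue idiom above.
void ApplyPassesSketch(const std::vector<std::string> &passes, Device dev) {
  for (const auto &name : passes) {
    if (name == "fuse_relu_depthwise_conv_pass" && dev != Device::kCUDA) {
      std::clog << name << " is only supported on GPU, skipped.\n";
      continue;  // move on to the next pass instead of erroring out
    }
    std::clog << "Start Apply Pass " << name << "\n";
    // ... pass->Apply(graph) would run here ...
  }
}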
@@ -513,19 +426,14 @@ USE_PASS(no_reduce_multi_devices_pass);
 USE_PASS(reduce_mode_multi_devices_pass);
 USE_PASS(all_reduce_mode_multi_devices_pass);
 USE_PASS(dist_multi_devices_pass);
-USE_PASS(multi_devices_check_pass);
-USE_PASS(multi_devices_print_pass);
 USE_PASS(sequential_execution_pass);
-USE_PASS(all_reduce_deps_pass);
-USE_PASS(backward_optimizer_op_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);
 USE_PASS(lock_free_optimize_pass);
 USE_PASS(coalesce_grad_tensor_pass);
 USE_PASS(graph_to_program_pass);
 USE_PASS(fuse_adam_op_pass);
 USE_PASS(fuse_sgd_op_pass);
 USE_PASS(fuse_momentum_op_pass);
-USE_PASS(fuse_all_reduce_op_pass);
 USE_PASS(runtime_context_cache_pass);
 USE_PASS(add_reader_dependency_pass);
 USE_PASS(delete_dropout_op_x_pass);
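
Each removed USE_PASS line matches a pass deleted above. The macro exists to keep the linker from dead-stripping the static registration object in the pass's translation unit. A generic sketch of the idiom; this is not Paddle's exact macro, only the common shape of a registration "touch" symbol:

// Assumed shape of a pass-registration "touch" macro: referencing a
// function defined next to the pass's registrar forces that object
// file to be linked, so the static registration actually runs.
#define USE_PASS_SKETCH(name)                    \
  extern int TouchPassRegistrar_##name();        \
  [[maybe_unused]] static int use_pass_##name =  \
      TouchPassRegistrar_##name()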