PaddlePaddle · zhiqiu · Oct 31, 2022 · Sep 2, 2022 · Sep 6, 2022 · Sep 13, 2022
diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.h b/paddle/fluid/framework/new_executor/interpreter/execution_config.h
@@ -26,6 +26,7 @@ namespace interpreter {
 struct ExecutionConfig {
   bool used_for_jit{false};
   bool create_local_scope{true};
+  bool used_for_control_flow_op{false};
 
   size_t host_num_threads;
   size_t deivce_num_threads;

diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc
@@ -19,6 +19,7 @@
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/new_executor/interpreter/data_transfer.h"
+#include "paddle/fluid/framework/new_executor/interpreter/execution_config.h"
 #include "paddle/fluid/memory/stats.h"
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
 #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
@@ -227,7 +228,14 @@ void BuildVariableScope(const framework::BlockDesc& block,
     }
 
     if (var_desc->Persistable()) {
-      auto* ptr = inner_scope->Var(var_name);
+      // In principle, we should put all trainable parameters in global scope,
+      // which means the root of the scope tree. Some cases like quantization
+      // will look up these parameters in global scope.
+      const Scope* ancestor_scope = inner_scope;
+      while (ancestor_scope->parent()) {
+        ancestor_scope = ancestor_scope->parent();
+      }
+      auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var_name);
 
       VLOG(3) << "Initialize Variable " << var_name;
       // NOTE(zhiqiu): if var exists in scope and the type is right,
@@ -291,7 +299,7 @@ std::tuple<VariableValueMap, VariableIdMap> BuildVariableMap(
     const VariableNameMap& var_name_map,
     VariableScope* var_scope,
     Scope* local_scope,
-    bool allow_var_not_in_program = false,
+    bool find_var_recursively = false,
     bool allow_var_not_in_scope = false) {
   VariableValueMap name2var;
   VariableIdMap name2id;
@@ -301,16 +309,17 @@ std::tuple<VariableValueMap, VariableIdMap> BuildVariableMap(
     vars.reserve(item.second.size());
 
     for (auto& var_name : item.second) {
+      auto* var = local_scope->FindVar(var_name);
+
       if (!var_scope->HasVar(var_name)) {
-        if (allow_var_not_in_program && local_scope->FindVar(var_name)) {
+        if (find_var_recursively && var) {
           VLOG(3) << "Add " << var_name << " to var_scope";
           var_scope->AddVar(var_name, nullptr);
         } else if (allow_var_not_in_scope) {
           VLOG(4) << var_name << " don't exist in variable scope, skip it!";
           continue;
         }
       }
-      auto* var = local_scope->FindVar(var_name);
       auto var_id = var_scope->VarId(var_name);
       vars.push_back(var);
       ids.push_back(var_id);
@@ -419,16 +428,16 @@ void BuildOpFuncList(const platform::Place& place,
                      const std::set<std::string>& skip_gc_vars,
                      std::vector<OpFuncNode>* vec_func_list,
                      VariableScope* var_scope,
-                     bool use_local_scope,
-                     bool used_for_jit) {
+                     const ExecutionConfig& execution_config,
+                     bool use_local_scope) {
   Scope* local_scope = use_local_scope ? var_scope->GetMutableLocalScope()
                                        : var_scope->GetMutableScope();
   std::vector<std::unique_ptr<OperatorBase>>
       ops_unique;  // its elements will be moved to vec_func_list
   // Step 1: create all ops for current block.
   CreateAllOps(block, &ops_unique);
 
-  if (!used_for_jit) {
+  if (!execution_config.used_for_jit) {
     // If gc is enabled and block size > 1
     const ProgramDesc& main_program = *block.Program();
     operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
@@ -479,14 +488,18 @@ void BuildOpFuncList(const platform::Place& place,
     bool allow_var_not_in_program = ops_with_var_not_in_program.count(op_type);
     bool allow_var_not_in_scope = ops_with_var_not_in_scope.count(op_type);
 
+    // Ops in a control flow block may not find their inputs or outputs in
+    // the sub-block's VariableScope, so we need to search the parent scope.
+
     framework::VariableNameMap& input_name_map = op->Inputs();
     VariableValueMap ins_map;
     VariableIdMap ins_name2id;
-    std::tie(ins_map, ins_name2id) = BuildVariableMap(input_name_map,
-                                                      var_scope,
-                                                      local_scope,
-                                                      allow_var_not_in_program,
-                                                      allow_var_not_in_scope);
+    std::tie(ins_map, ins_name2id) = BuildVariableMap(
+        input_name_map,
+        var_scope,
+        local_scope,
+        execution_config.used_for_control_flow_op || allow_var_not_in_program,
+        allow_var_not_in_scope);
 
     framework::VariableNameMap& output_name_map = op->Outputs();
     VariableValueMap outs_map;
@@ -495,7 +508,7 @@ void BuildOpFuncList(const platform::Place& place,
         BuildVariableMap(output_name_map,
                          var_scope,
                          local_scope,
-                         /*allow_var_not_in_program=*/false,
+                         execution_config.used_for_control_flow_op,
                          allow_var_not_in_scope);
 
     // step 1: build OpFuncNode

diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h
@@ -24,6 +24,7 @@
 
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/garbage_collector.h"
+#include "paddle/fluid/framework/new_executor/interpreter/execution_config.h"
 #include "paddle/fluid/framework/new_executor/new_executor_defs.h"
 #include "paddle/fluid/framework/new_executor/workqueue/workqueue.h"
 #include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
@@ -75,8 +76,8 @@ void BuildOpFuncList(const platform::Place& place,
                      const std::set<std::string>& skip_gc_vars,
                      std::vector<OpFuncNode>* vec_func_list,
                      VariableScope* scope,
-                     bool use_local_scope = true,
-                     bool used_for_jit = false);
+                     const ExecutionConfig& execution_config,
+                     bool use_local_scope = true);
 
 void AddFetch(const std::vector<std::string>& fetch_names,
               framework::BlockDesc* block);

diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -16,6 +16,8 @@
 
 #include <unordered_set>
 
+#include "gflags/gflags.h"
+
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
@@ -47,6 +49,12 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                             true,
                             "Use local_scope in new executor(especially used "
                             "in UT), can turn off for better performance");
+PADDLE_DEFINE_EXPORTED_bool(control_flow_use_new_executor,
+                            true,
+                            "Use new executor in control flow op");
+PADDLE_DEFINE_EXPORTED_bool(control_flow_use_new_executor_cache,
+                            true,
+                            "Cache new executor in control flow op");
 
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(benchmark);
@@ -107,7 +115,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
                                  const BlockDesc& block,
                                  const std::set<std::string>& skip_gc_vars,
                                  framework::Scope* scope,
-                                 bool used_for_jit)
+                                 bool used_for_jit,
+                                 bool used_for_control_flow_op)
     : place_(place),
       block_(block),
       execution_config_(place, block.OpSize()),
@@ -119,8 +128,10 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
   completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion);
 
   execution_config_.used_for_jit = used_for_jit;
-  execution_config_.create_local_scope =
-      !used_for_jit && FLAGS_new_executor_use_local_scope;
+  execution_config_.used_for_control_flow_op = used_for_control_flow_op;
+  execution_config_.create_local_scope = !used_for_jit &&
+                                         FLAGS_new_executor_use_local_scope &&
+                                         !used_for_control_flow_op;
   execution_config_.skip_gc_vars = skip_gc_vars;
   execution_config_.Log(/*log_level=*/8);
 
@@ -224,7 +235,7 @@ paddle::framework::FetchList InterpreterCore::Run(
 }
 
 paddle::framework::FetchList InterpreterCore::Run(
-    const std::vector<std::string>& feed_names) {
+    const std::vector<std::string>& feed_names, bool need_fetch) {
   SetDeviceId(place_);
 
 #ifdef PADDLE_WITH_MKLDNN
@@ -243,12 +254,12 @@ paddle::framework::FetchList InterpreterCore::Run(
         execution_config_.skip_gc_vars,
         &op_func_nodes,
         &var_scope_,
-        HasLocalScope(),
-        execution_config_.used_for_jit);
-    is_build_ = true;
+        execution_config_,
+        HasLocalScope());
     SetFeedVarsInplaceSkip(feed_names);
     // convert vec func_list to graph
     Convert(&op_func_nodes);
+    is_build_ = true;
   } else {
     // For the program that only run once, it is no need to
     // create work_queue, so the async_work_queue_ is created
@@ -281,7 +292,7 @@ paddle::framework::FetchList InterpreterCore::Run(
   Scope* inner_scope =
       HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope();
   auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
-  if (fetch_var) {
+  if (fetch_var && need_fetch) {
     return std::move(*fetch_var->GetMutable<framework::FetchList>());
   } else {
     return {};
@@ -311,9 +322,18 @@ void InterpreterCore::reset_scope(Scope* new_scope) {
   var_scope_.SetScope(new_scope);
   auto& var_list = var_scope_.MutableVarList();
   for (size_t i = 0; i < var_list.size(); i++) {
-    var_list[i] = new_scope->FindVar(var_scope_.GetNameById(i));
+    const auto& var_name = var_scope_.GetNameById(i);
+    var_list[i] = new_scope->FindVar(var_name);
   }
-  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
+  // The index must be bounds-checked because the InterpreterCore may not be
+  // fully built but may still be cached and reused. For example, in unit test
+  // `test_assert.py` it may exit before `InterpreterCore::Convert`, yet still
+  // be cached and used by later tests.
+  for (size_t i = 0; i < std::min(refs_.size(), var_list.size()); i++) {
+    refs_[i]->ResetVariable(var_list[i]);
+  }
+
+  for (size_t i = 0; i < vec_instruction_.size(); i++) {
     BuildAndCacheInstructionCtx(&vec_instruction_[i]);
   }
 }
@@ -540,6 +560,10 @@ void InterpreterCore::Convert(
         if (var_desc && ins.count(item.first) &&
             !info.IsInArgBufferNeeded(var_desc->Name())) {
           continue;
+        } else if (!block_.HasVar(var_scope_.GetNameById(id))) {
+          VLOG(10) << "[gc_check_inputs] skip gc: "
+                   << var_scope_.GetNameById(id);
+          continue;
         }
         gc_check_vars.insert(id);
       }
@@ -661,9 +685,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
 
 #ifdef PADDLE_WITH_ASCEND_CL
   if (platform::is_npu_place(place)) {
-    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
-    // values, but only through special `float_status` to checks whether
-    // the operation is overflow. More about `float_status`, see:
+    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the
+    // variable values, but only by using the special `float_status` to check
+    // whether the operation overflowed. More about `float_status`, see:
     // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
     if (FLAGS_check_nan_inf) {
       framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
@@ -734,7 +758,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
     }
   }
 
-  VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope_);
+  VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope);
 
   if (!instr_node.InplaceBackMap().empty()) {
     platform::RecordEvent inplaceback_event(
@@ -965,9 +989,9 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
     if (platform::is_gpu_place(place)) {
       memory::RecordStream(allocation, stream);
     } else if (platform::is_cuda_pinned_place(place)) {
-      // TODO(Ruibiao): Here should do something to make sure that the tensor is
-      // not freed until the H2D copies done. However, simplely launch a CUDA
-      // runtime callback to the H2D stream may lead a high performance
+      // TODO(Ruibiao): Something should be done here to ensure the tensor is
+      // not freed until the H2D copies are done. However, simply launching a
+      // CUDA runtime callback to the H2D stream may lead to a high performance
       // overhead. As all the cases we meet in H2D are copies from CPUPlace at
       // present, we just log a WARNING here. A better design is required.
       LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous "
@@ -984,8 +1008,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
    * instr.GCCheckVars.
    * 2. The stream which initializes this tensor is different from the stream
    * which the instruction run in.
-   * 3. The tensor is the instruction's input, cause we assume that instruction
-   * will initialize all output tensors with its running stream.
+   * 3. The tensor is the instruction's input, because we assume that the
+   * instruction will initialize all output tensors with its running stream.
    * 4. In the OP function of this instruction, the tensor is an input of a
    * async CUDA kernel.
    *
@@ -995,8 +1019,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
    * initialized this tensor has less time overhead. Conversely, it may take
    * more time if we try to extract those cross-stream input vars from
    * instr.GCCheckVars.
-   * 2. Now the instruction has no idea of which vars involving async running in
-   * OP function, and thus we can not recognize condition 4. It should be
+   * 2. Now the instruction has no idea which vars are involved in async running
+   * in the OP function, so we cannot recognize condition 4. It should be
    * supported later.
    */
   for (int var_id : instr.GCCheckVars()) {
@@ -1099,12 +1123,12 @@ void InterpreterCore::Prepare(const std::vector<std::string>& feed_names,
         execution_config_.skip_gc_vars,
         &op_func_nodes,
         &var_scope_,
-        HasLocalScope(),
-        execution_config_.used_for_jit);
-    is_build_ = true;
+        execution_config_,
+        HasLocalScope());
     SetFeedVarsInplaceSkip(feed_names);
     // convert vec func_list to graph
     Convert(&op_func_nodes);
+    is_build_ = true;
   }
   // NOTE: Because feed_tensor will be GC after
   // paddle::framework::BuildOpFuncList, so we should

diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -34,6 +34,10 @@
 #include "paddle/fluid/memory/allocation/spin_lock.h"
 #include "paddle/fluid/platform/device_event.h"
 
+DECLARE_bool(new_executor_use_local_scope);
+DECLARE_bool(control_flow_use_new_executor);
+DECLARE_bool(control_flow_use_new_executor_cache);
+
 namespace paddle {
 namespace framework {
 
@@ -43,7 +47,8 @@ class InterpreterCore {
                   const BlockDesc& block,
                   const std::set<std::string>& skip_gc_vars,
                   Scope* scope,
-                  bool used_for_jit = false);
+                  bool used_for_jit = false,
+                  bool used_for_control_flow_op = false);
 
   ~InterpreterCore();
 
@@ -55,7 +60,8 @@ class InterpreterCore {
       const std::vector<std::string>& feed_names,
       const std::vector<phi::DenseTensor>& feed_tensors);
 
-  paddle::framework::FetchList Run(const std::vector<std::string>& feed_names);
+  paddle::framework::FetchList Run(const std::vector<std::string>& feed_names,
+                                   bool need_fetch = true);
 
   void ShareWorkQueueFrom(std::shared_ptr<InterpreterCore> src);
 
@@ -67,6 +73,8 @@ class InterpreterCore {
 
   void reset_scope(Scope* new_scope);
 
+  const platform::Place& GetPlace() const { return place_; }
+
  private:
   // build graph
   void Convert(std::vector<paddle::framework::OpFuncNode>* op_func_nodes);

diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -418,6 +418,7 @@ class VarRefInfo {
       dynamic_ref_ = static_ref_;
     }
   }
+  void ResetVariable(Variable* new_var) { var_ = new_var; }
   bool CheckAndDecrease() {
     return static_ref_ == 1 || (dynamic_ref_.fetch_sub(1) == 1);
   }

diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -28,8 +28,8 @@ paddle::framework::FetchList StandaloneExecutor::Run(
     const std::vector<std::string>& fetch_names) {
   platform::RecordEvent record_event(
       "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1);
-
   auto core = GetInterpreterCore(scope, prog_, feed_names, fetch_names, false);
+
   VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core;
   return core->Run(feed_names);
 }

diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -8,7 +8,7 @@ register_operators(EXCLUDES conditional_block_op DEPS naive_executor)
 cc_library(
   conditional_block_op
   SRCS conditional_block_op.cc
-  DEPS executor)
+  DEPS standalone_executor executor)
 cc_library(
   op_variant
   SRCS op_variant.cc
@@ -29,7 +29,7 @@ cc_library(
 cc_test(
   conditional_block_op_test
   SRCS conditional_block_op_test.cc
-  DEPS conditional_block_op executor)
+  DEPS conditional_block_op standalone_executor executor)
 
 if(WITH_UNITY_BUILD)
   target_link_libraries(paddle_operators_controlflow_unity conditional_block_op)