Commit 2408c47

【Hackathon 6th Fundable Projects 3 No.101】 [fluid_ops] distributed_fused_lamb (#66573)
* Fix
* Fix
* Fix
1 parent 5d81c93 commit 2408c47

5 files changed: +134 -163 lines changed


paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc

Lines changed: 0 additions & 61 deletions

@@ -169,64 +169,3 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(distributed_fused_lamb,
                              ops::DistributedFusedLambOp,
                              ops::DistributedFusedLambOpMaker);
-
-namespace phi {
-namespace fusion {
-
-template <typename T, typename Context>
-void DistributedFusedLambKernel(const Context &dev_ctx,
-                                const std::vector<const DenseTensor *> &param,
-                                const std::vector<const DenseTensor *> &grad,
-                                const paddle::optional<DenseTensor> &fp32_param,
-                                const paddle::optional<DenseTensor> &fp32_grad,
-                                const paddle::optional<DenseTensor> &fp16_param,
-                                const paddle::optional<DenseTensor> &fp16_grad,
-                                const DenseTensor &moment1,
-                                const DenseTensor &moment2,
-                                const DenseTensor &beta1_pow,
-                                const DenseTensor &beta2_pow,
-                                const DenseTensor &param_offsets,
-                                const DenseTensor &fp32_partial_offsets,
-                                const DenseTensor &fp16_partial_offsets,
-                                const DenseTensor &param_info,
-                                const DenseTensor &param_order,
-                                const DenseTensor &learning_rate,
-                                const DenseTensor &global_scale,
-                                int acc_steps,
-                                float beta1,
-                                float beta2,
-                                float epsilon,
-                                float max_global_grad_norm,
-                                float weight_decay,
-                                bool clip_after_allreduce,
-                                bool use_master_param_norm,
-                                bool use_master_acc_grad,
-                                bool is_grad_scaled_by_nranks,
-                                bool use_hierarchical_allreduce,
-                                int64_t nranks,
-                                const std::vector<int> &ring_ids,
-                                DenseTensor *fp32_param_out,
-                                DenseTensor *fp16_param_out,
-                                DenseTensor *fp32_acc_grad,
-                                DenseTensor *fp16_acc_grad,
-                                DenseTensor *moment1_out,
-                                DenseTensor *moment2_out,
-                                DenseTensor *beta1_pow_out,
-                                DenseTensor *beta2_pow_out,
-                                DenseTensor *param_out,
-                                DenseTensor *found_inf,
-                                DenseTensor *acc_step,
-                                DenseTensor *stop_update,
-                                DenseTensor *step) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      "The distributed_fused_lamb operator does not support CPU yet."));
-}
-
-}  // namespace fusion
-}  // namespace phi
-
-PD_REGISTER_KERNEL(distributed_fused_lamb,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::fusion::DistributedFusedLambKernel,
-                   float) {}
Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+namespace fusion {
+
+template <typename T, typename Context>
+void DistributedFusedLambKernel(const Context &dev_ctx,
+                                const std::vector<const DenseTensor *> &param,
+                                const std::vector<const DenseTensor *> &grad,
+                                const paddle::optional<DenseTensor> &fp32_param,
+                                const paddle::optional<DenseTensor> &fp32_grad,
+                                const paddle::optional<DenseTensor> &fp16_param,
+                                const paddle::optional<DenseTensor> &fp16_grad,
+                                const DenseTensor &moment1,
+                                const DenseTensor &moment2,
+                                const DenseTensor &beta1_pow,
+                                const DenseTensor &beta2_pow,
+                                const DenseTensor &param_offsets,
+                                const DenseTensor &fp32_partial_offsets,
+                                const DenseTensor &fp16_partial_offsets,
+                                const DenseTensor &param_info,
+                                const DenseTensor &param_order,
+                                const DenseTensor &learning_rate,
+                                const DenseTensor &global_scale,
+                                int acc_steps,
+                                float beta1,
+                                float beta2,
+                                float epsilon,
+                                float max_global_grad_norm,
+                                float weight_decay,
+                                bool clip_after_allreduce,
+                                bool use_master_param_norm,
+                                bool use_master_acc_grad,
+                                bool is_grad_scaled_by_nranks,
+                                bool use_hierarchical_allreduce,
+                                int64_t nranks,
+                                const std::vector<int> &ring_ids,
+                                DenseTensor *fp32_param_out,
+                                DenseTensor *fp16_param_out,
+                                DenseTensor *fp32_acc_grad,
+                                DenseTensor *fp16_acc_grad,
+                                DenseTensor *moment1_out,
+                                DenseTensor *moment2_out,
+                                DenseTensor *beta1_pow_out,
+                                DenseTensor *beta2_pow_out,
+                                DenseTensor *param_out,
+                                DenseTensor *found_inf,
+                                DenseTensor *acc_step,
+                                DenseTensor *stop_update,
+                                DenseTensor *step) {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "The distributed_fused_lamb operator does not support CPU yet."));
+}
+
+}  // namespace fusion
+}  // namespace phi
+
+PD_REGISTER_KERNEL(distributed_fused_lamb,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::fusion::DistributedFusedLambKernel,
+                   float) {}

paddle/phi/kernels/funcs/math.h

Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@
 
 #include "math.h"  // NOLINT
 #include "paddle/common/hostdevice.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/phi/common/float16.h"
 
 namespace phi {
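
The diff does not say why this header is now needed; a plausible reading (an assumption, not something stated in the commit) is that a helper declared in funcs/math.h, or in code that includes it, is instantiated with phi::dtype::bfloat16, which requires the full type definition to be visible, just as the existing include does for phi::dtype::float16. The sketch below is a generic, hypothetical illustration of that rule; HalveForExample is not part of math.h.

// Hypothetical illustration only; HalveForExample is not a Paddle function.
// A template like this compiles for a reduced-precision dtype only when the
// dtype's full definition is visible at the point of instantiation, which is
// what the added bfloat16 include guarantees for every user of math.h.
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"

template <typename T>
T HalveForExample(T x) {
  // Round-trip through float, the usual pattern for fp16/bf16 arithmetic.
  return static_cast<T>(static_cast<float>(x) * 0.5f);
}

// Both instantiations need their dtype header included first:
//   HalveForExample(phi::dtype::float16(3.0f));
//   HalveForExample(phi::dtype::bfloat16(3.0f));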

paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu renamed to paddle/phi/kernels/gpu/distributed_fused_lamb_kernel.cu

Lines changed: 56 additions & 101 deletions

@@ -1,4 +1,4 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/phi/kernels/funcs/multi_tensor_apply_util.h"
 
 #include "paddle/phi/backends/context_pool.h"
@@ -33,7 +32,6 @@
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/common/flags.h"
 #include "paddle/phi/core/distributed/nccl_comm_context.h"
-COMMON_DECLARE_bool(dynamic_static_unified_comm);
 #endif
 
 #ifdef __NVCC__
@@ -903,30 +901,19 @@ static bool CreatePreMulScaleOpIfSupported(
     ncclRedOp_t *op,
     distributed::NCCLCommContext *comm_ctx = nullptr) {
 #if NCCL_VERSION_CODE >= 21100
-  if (FLAGS_dynamic_static_unified_comm) {
-    PADDLE_ENFORCE_NOT_NULL(
-        comm_ctx,
-        phi::errors::InvalidArgument(
-            "You choose to use new communication library by "
-            "setting environment "
-            "variable FLAGS_dynamic_static_unified_comm True. "
-            "But parameter of comm_ctx should not be nullptr."));
-    int ver = comm_ctx->GetNcclVersion();
-    if (ver >= 21100) {
-      VLOG(10) << "ncclRedOpCreatePreMulSum is supported.";
-      comm_ctx->RedOpCreatePreMulSum(
-          op, const_cast<void *>(scale), dtype, ncclScalarDevice);
-      return true;
-    }
-  } else {
-    int ver;
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&ver));
-    if (ver >= 21100) {
-      VLOG(10) << "ncclRedOpCreatePreMulSum is supported.";
-      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum(
-          op, const_cast<void *>(scale), dtype, ncclScalarDevice, comm));
-      return true;
-    }
+  PADDLE_ENFORCE_NOT_NULL(
+      comm_ctx,
+      phi::errors::InvalidArgument(
+          "You choose to use new communication library by "
+          "setting environment "
+          "variable FLAGS_dynamic_static_unified_comm True. "
+          "But parameter of comm_ctx should not be nullptr."));
+  int ver = comm_ctx->GetNcclVersion();
+  if (ver >= 21100) {
+    VLOG(10) << "ncclRedOpCreatePreMulSum is supported.";
+    comm_ctx->RedOpCreatePreMulSum(
+        op, const_cast<void *>(scale), dtype, ncclScalarDevice);
+    return true;
   }
 #endif
   VLOG(10) << "ncclRedOpCreatePreMulSum is not supported.";
@@ -940,18 +927,14 @@ static void DestoryOpIfSupported(
 #if NCCL_VERSION_CODE >= 21100
   VLOG(10) << "ncclRedOpDestroy starts";
 
-  if (FLAGS_dynamic_static_unified_comm) {
-    PADDLE_ENFORCE_NOT_NULL(
-        comm_ctx,
-        phi::errors::InvalidArgument(
-            "You choose to use new communication library by "
-            "setting environment "
-            "variable FLAGS_dynamic_static_unified_comm True. "
-            "But parameter of comm_ctx should not be nullptr."));
-    comm_ctx->RedOpDestroy(op);
-  } else {
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, comm));
-  }
+  PADDLE_ENFORCE_NOT_NULL(
+      comm_ctx,
+      phi::errors::InvalidArgument(
+          "You choose to use new communication library by "
+          "setting environment "
+          "variable FLAGS_dynamic_static_unified_comm True. "
+          "But parameter of comm_ctx should not be nullptr."));
+  comm_ctx->RedOpDestroy(op);
   VLOG(10) << "ncclRedOpDestroy ends";
 
 #endif
@@ -989,15 +972,13 @@ static void NCCLSumWithScaleBase(const T *sendbuff,
                                 const phi::GPUContext &dev_ctx,
                                 distributed::NCCLCommContext *comm_ctx,
                                 const T *scale = nullptr) {
-  if (FLAGS_dynamic_static_unified_comm) {
-    PADDLE_ENFORCE_NOT_NULL(
-        comm_ctx,
-        phi::errors::InvalidArgument(
-            "You choose to use new communication library by "
-            "setting environment "
-            "variable FLAGS_dynamic_static_unified_comm True. "
-            "But parameter of comm_ctx should not be nullptr."));
-  }
+  PADDLE_ENFORCE_NOT_NULL(
+      comm_ctx,
+      phi::errors::InvalidArgument(
+          "You choose to use new communication library by "
+          "setting environment "
+          "variable FLAGS_dynamic_static_unified_comm True. "
+          "But parameter of comm_ctx should not be nullptr."));
 
   static_assert(
       std::is_same<T, float>::value || std::is_same<T, dtype::float16>::value,
@@ -1758,71 +1739,45 @@ void DistributedFusedLambKernel(
   int64_t global_rank = 0, local_rank = 0;
   ncclComm_t global_comm = nullptr, local_comm = nullptr,
              external_comm = nullptr;
-  paddle::platform::NCCLComm *nccl_comm_handle = nullptr,
-                             *local_nccl_comm_handle = nullptr;
   distributed::NCCLCommContext *comm_ctx = nullptr, *local_comm_ctx = nullptr,
                                *external_comm_ctx = nullptr;
 
   const auto &comm_context_manager =
       phi::distributed::CommContextManager::GetInstance();
 
-  if (FLAGS_dynamic_static_unified_comm) {
-    CheckCommContextHasRingId(comm_context_manager, ring_ids[0]);
-
-    comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
-        comm_context_manager.Get(std::to_string(ring_ids[0])));
-    PADDLE_ENFORCE_NE(comm_ctx,
-                      nullptr,
-                      phi::errors::Unavailable(
-                          "NCCLCommContext is nullptr, collective op should "
-                          "has ring_id attr."));
-
-    global_comm = comm_ctx->GetNcclComm();
-    global_rank = comm_ctx->GetRank();
-    if (local_shard) {
-      CheckCommContextHasRingId(comm_context_manager, ring_ids[1]);
+  CheckCommContextHasRingId(comm_context_manager, ring_ids[0]);
 
-      local_comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
-          comm_context_manager.Get(std::to_string(ring_ids[1])));
-      local_comm = local_comm_ctx->GetNcclComm();
-      local_rank = local_comm_ctx->GetRank();
-      if (use_hierarchical_allreduce) {
-        CheckCommContextHasRingId(comm_context_manager, ring_ids[2]);
+  comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
+      comm_context_manager.Get(std::to_string(ring_ids[0])));
+  PADDLE_ENFORCE_NE(comm_ctx,
+                    nullptr,
+                    phi::errors::Unavailable(
+                        "NCCLCommContext is nullptr, collective op should "
+                        "has ring_id attr."));
 
-        external_comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
-            comm_context_manager.Get(std::to_string(ring_ids[2])));
-        external_comm = external_comm_ctx->GetNcclComm();
-      }
-    } else {
-      local_comm = global_comm;
-      local_rank = global_rank;
+  global_comm = comm_ctx->GetNcclComm();
+  global_rank = comm_ctx->GetRank();
+  if (local_shard) {
+    CheckCommContextHasRingId(comm_context_manager, ring_ids[1]);
+
+    local_comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
+        comm_context_manager.Get(std::to_string(ring_ids[1])));
+    local_comm = local_comm_ctx->GetNcclComm();
+    local_rank = local_comm_ctx->GetRank();
+    if (use_hierarchical_allreduce) {
+      CheckCommContextHasRingId(comm_context_manager, ring_ids[2]);
+
+      external_comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
+          comm_context_manager.Get(std::to_string(ring_ids[2])));
+      external_comm = external_comm_ctx->GetNcclComm();
     }
-
-    VLOG(3) << "new comm_context_manager has ring_id " << ring_ids[0];
   } else {
-    if (nranks > 1) {
-      nccl_comm_handle =
-          paddle::platform::NCCLCommContext::Instance().Get(ring_ids[0], place);
-      global_comm = nccl_comm_handle->comm();
-      global_rank = nccl_comm_handle->rank();
-      if (local_shard) {
-        local_nccl_comm_handle =
-            paddle::platform::NCCLCommContext::Instance().Get(ring_ids[1],
-                                                              place);
-        local_comm = local_nccl_comm_handle->comm();
-        local_rank = local_nccl_comm_handle->rank();
-        if (use_hierarchical_allreduce) {
-          external_comm = paddle::platform::NCCLCommContext::Instance()
-                              .Get(ring_ids[2], place)
-                              ->comm();
-        }
-      } else {
-        local_comm = global_comm;
-        local_rank = global_rank;
-      }
-    }
+    local_comm = global_comm;
+    local_rank = global_rank;
   }
 
+  VLOG(3) << "new comm_context_manager has ring_id " << ring_ids[0];
+
   memory_utils::Buffer grad_norm_square_buffer(place);
   auto *fp32_square_grad_norm = grad_norm_square_buffer.Alloc<float>(2);
   memory_utils::Buffer cub_tmp_buffer(place);
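
Before this change the kernel chose between the legacy paddle::platform::NCCLCommContext singleton and the unified phi::distributed::CommContextManager depending on FLAGS_dynamic_static_unified_comm; after it, only the manager path remains and a missing context is a hard error. The sketch below restates that lookup pattern as it appears in the added lines. It is non-authoritative: GetRequiredNcclCommContext is a hypothetical helper (the kernel inlines this logic), and the include paths are assumptions that would only compile inside the Paddle source tree.

// Minimal sketch of the single remaining lookup path, mirroring the diff.
// GetRequiredNcclCommContext is hypothetical; include paths are assumptions.
#include <string>

#include "paddle/phi/core/distributed/comm_context_manager.h"
#include "paddle/phi/core/distributed/nccl_comm_context.h"
#include "paddle/phi/core/enforce.h"

static phi::distributed::NCCLCommContext *GetRequiredNcclCommContext(
    int ring_id) {
  const auto &manager = phi::distributed::CommContextManager::GetInstance();
  // Look up the context registered under this ring id; there is no longer a
  // fallback to the legacy fluid communicator when it is missing.
  auto *comm_ctx = static_cast<phi::distributed::NCCLCommContext *>(
      manager.Get(std::to_string(ring_id)));
  PADDLE_ENFORCE_NE(comm_ctx,
                    nullptr,
                    phi::errors::Unavailable(
                        "NCCLCommContext is nullptr, collective op should "
                        "has ring_id attr."));
  return comm_ctx;
}

In the kernel itself the same pattern runs up to three times, once per ring id: ring_ids[0] for the global communicator, ring_ids[1] when local_shard is set, and ring_ids[2] when use_hierarchical_allreduce is also set, exactly as the added lines above show.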

test/legacy_test/test_distributed_fused_lamb_op_with_clip.py

Lines changed: 1 addition & 1 deletion

@@ -68,7 +68,7 @@ def run_test(
     os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm)
     os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps)
     os.environ['USE_MASTER_ACC_GRAD'] = str(1 if use_master_acc_grad else 0)
-    os.environ["FLAGS_dynamic_static_unified_comm"] = "0"
+    os.environ["FLAGS_dynamic_static_unified_comm"] = "1"
     os.environ.update(need_env)
 
     touch_file_env = 'SUCCESS_TOUCH_FILE'
