|
20 | 20 | #include "paddle/common/errors.h" |
21 | 21 | #include "paddle/common/performance_statistician.h" |
22 | 22 | #include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" |
| 23 | +#include "paddle/phi/backends/gpu/gpu_info.h" |
| 24 | +#include "paddle/phi/backends/gpu/gpu_resources.h" |
23 | 25 | #if defined(PADDLE_WITH_CUDA) |
24 | 26 | #include "paddle/cinn/runtime/cinn_runtime.h" |
25 | 27 | #endif |
@@ -112,32 +114,39 @@ class CinnJitInstruction::FnPtrImpl { |
112 | 114 | ::common::PerformanceStatistician& ps = |
113 | 115 | ::common::PerformanceStatistician::Instance(); |
114 | 116 | auto data_p = static_cast<void*>(func_args_.data()); |
115 | | - cudaStream_t stream; |
116 | | - cudaStreamCreate(&stream); |
117 | | - cudaDeviceSynchronize(); |
| 117 | + phi::gpuStream_t stream; |
| 118 | + phi::InitStream(&stream); |
| 119 | + phi::backends::gpu::GpuDeviceSync(); |
118 | 120 | if (is_gpu) { |
119 | 121 | ps.SetGraphNodesNum(25); |
120 | 122 | int graph_nodes_num = ps.GetGraphNodesNum(); |
121 | | - cudaGraph_t graph; |
122 | | - cudaGraphExec_t instance; |
123 | | - cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); |
| 123 | + phi::gpuGraph_t graph; |
| 124 | + phi::gpuGraphExec_t instance; |
| 125 | + phi::gpuStreamBeginCapture( |
| 126 | + stream, gpuStreamCaptureMode(0)); // StreamCaptureModeGlobal |
124 | 127 | for (int ikrnl = 0; ikrnl < graph_nodes_num; ikrnl++) { |
125 | 128 | ((lower_func_ptr_g)cinn_kernel_info_.fn_ptr)( |
126 | 129 | static_cast<void*>(func_args_.data()), func_args_.size(), stream); |
127 | 130 | } |
128 | | - cudaStreamEndCapture(stream, &graph); |
| 131 | + phi::gpuStreamEndCapture(stream, &graph); |
| 132 | +#ifdef PADDLE_WITH_CUDA |
129 | 133 | cudaGraphInstantiate(&instance, graph, NULL, NULL, 0); |
| 134 | +#elif defined(PADDLE_WITH_HIP) |
| 135 | + hipGraphInstantiate(&instance, graph, NULL, NULL, 0); |
| 136 | +#else |
| 137 | + CINN_NOT_IMPLEMENTED |
| 138 | +#endif |
130 | 139 | ps.CudaStart(FLAGS_cinn_kernel_execution_label); |
131 | | - cudaGraphLaunch(instance, stream); |
| 140 | + phi::gpuGraphLaunch(instance, stream); |
132 | 141 | ps.CudaEnd(FLAGS_cinn_kernel_execution_label); |
133 | | - cudaGraphDestroy(graph); |
134 | | - cudaGraphExecDestroy(instance); |
135 | | - cudaStreamDestroy(stream); |
| 142 | + phi::gpuGraphDestroy(graph); |
| 143 | + phi::gpuGraphExecDestroy(instance); |
| 144 | + phi::DestoryStream(stream); |
136 | 145 | } else { |
137 | 146 | ((lower_func_ptr_g)cinn_kernel_info_.CX86_fn_ptr)( |
138 | 147 | static_cast<void*>(func_args_.data()), func_args_.size(), stream); |
139 | 148 | } |
140 | | - cudaDeviceSynchronize(); |
| 149 | + phi::backends::gpu::GpuDeviceSync(); |
141 | 150 | } else { |
142 | 151 | if (is_gpu) { |
143 | 152 | ((lower_func_ptr_g)cinn_kernel_info_.fn_ptr)( |
@@ -267,7 +276,7 @@ CinnJitInstruction::CinnJitInstruction( |
267 | 276 | } |
268 | 277 |
|
269 | 278 | void CinnJitInstruction::Run() { |
270 | | -#if defined(PADDLE_WITH_CUDA) |
| 279 | +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) |
271 | 280 | void* running_stream = nullptr; |
272 | 281 | bool is_gpu = false; |
273 | 282 |
|
@@ -298,7 +307,7 @@ void CinnJitInstruction::Run() { |
298 | 307 | } |
299 | 308 | #else |
300 | 309 | VLOG(0) << "Not Supported: cinn jit instruction currently does not " |
301 | | - "support non-CUDA kernel"; |
| 310 | +             "support non-CUDA/HIP kernel"; |
302 | 311 | #endif |
303 | 312 | } |
304 | 313 |
|
|
0 commit comments