Commit dfd2b28

[XPU] add interpolate fp16, fix reshape bug, add l3_autotune api (#9259)
1 parent fd9a5d3 commit dfd2b28

File tree: 10 files changed (+228 / -50 lines)

lite/api/paddle_api.cc

Lines changed: 10 additions & 0 deletions
@@ -561,6 +561,16 @@ void CxxConfig::set_xpu_l3_cache_method(size_t l3_size, bool locked) {
 #endif
 }
 
+void CxxConfig::set_xpu_l3_cache_autotune(bool autotune) {
+#ifdef LITE_WITH_XPU
+  lite::TargetWrapperXPU::local_l3_autotune = autotune;
+#else
+  LOG(WARNING) << "The invoking of the function "
+                  "'set_xpu_l3_cache_autotune' is ignored, please "
+                  "rebuild it with LITE_WITH_XPU=ON.";
+#endif
+}
+
 void set_xpu_gm_workspace_method(size_t gm_size) {
 #ifdef LITE_WITH_XPU
   lite::TargetWrapperXPU::local_gm_size = gm_size;

lite/api/paddle_api.h

Lines changed: 1 addition & 0 deletions
@@ -468,6 +468,7 @@ class LITE_API CxxConfig : public ConfigBase {
   // **DEPRECATED**, use set_xpu_l3_cache_method() in the future
   void set_xpu_workspace_l3_size_per_thread(int l3_size = 0x4000000);
   void set_xpu_l3_cache_method(size_t l3_size, bool locked = false);
+  void set_xpu_l3_cache_autotune(bool autotune = true);
 
   void set_xpu_gm_workspace_method(size_t gm_size);
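
For context, a minimal host-side sketch of how the new switch could be used, assuming the usual Paddle Lite full-API (CxxConfig) flow; the model directory and valid places below are illustrative placeholders, not part of this commit:

  #include <memory>
  #include "paddle_api.h"  // Paddle Lite full API

  using namespace paddle::lite_api;  // NOLINT

  int main() {
    CxxConfig config;
    config.set_model_dir("./mobilenet_v1");  // hypothetical model directory
    config.set_valid_places({Place{TARGET(kXPU), PRECISION(kFloat)},
                             Place{TARGET(kHost), PRECISION(kFloat)}});
    // Reserve some L3 cache for this predictor (16 MiB here).
    config.set_xpu_l3_cache_method(16 * 1024 * 1024, /*locked=*/false);
    // New in this commit: turn the L3 plan autotuning step off (it defaults to true).
    config.set_xpu_l3_cache_autotune(false);
    auto predictor = CreatePaddlePredictor<CxxConfig>(config);
    return 0;
  }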

lite/backends/xpu/target_wrapper.cc

Lines changed: 7 additions & 2 deletions
@@ -127,12 +127,16 @@ void TargetWrapperXPU::FreeL3Cache() {
       local_l3_ptr_ = nullptr;
       XPU_CALL(tls_raw_ctx_->_l3_mgr.set(nullptr, 0));
     }
-    l3_planner_->run_autotune(l3_block_dict, local_l3_size);
+    if (local_l3_autotune) {
+      l3_planner_->run_autotune(l3_block_dict, local_l3_size);
+    }
   } else if (need_l3_mutex && TargetWrapperXPU::IsSharedL3Created()) {
     XPU_CALL(xpu_wait(TargetWrapperXPU::get_xpu_stream()));
     XPU_CALL(tls_raw_ctx_->_l3_mgr.set(nullptr, 0));
     mutex_l3_.unlock();
-    l3_planner_->run_autotune(l3_block_dict, shared_l3_size);
+    if (local_l3_autotune) {
+      l3_planner_->run_autotune(l3_block_dict, shared_l3_size);
+    }
   }
   for (size_t i = 0; i < l3_block_dict.size(); i++) {
     l3_block_dict[i]->clear();
@@ -168,6 +172,7 @@ LITE_THREAD_LOCAL std::string TargetWrapperXPU::conv_autotune_file;
 LITE_THREAD_LOCAL bool TargetWrapperXPU::need_l3_mutex{false};
 LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_l3_size{
     std::numeric_limits<size_t>::max()};
+LITE_THREAD_LOCAL bool TargetWrapperXPU::local_l3_autotune{true};
 LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_gm_size{
     0x4000000};  // 64 * 1024 * 1024
 LITE_THREAD_LOCAL void* TargetWrapperXPU::local_l3_ptr_{nullptr};

lite/backends/xpu/target_wrapper.h

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ class TargetWrapper<TARGET(kXPU)> {
   // l3 cache config
   static LITE_THREAD_LOCAL bool need_l3_mutex;    // model level l3 size
   static LITE_THREAD_LOCAL size_t local_l3_size;  // model level l3 size
+  static LITE_THREAD_LOCAL bool local_l3_autotune;
   static LITE_THREAD_LOCAL size_t local_gm_size;
   static size_t shared_l3_size;  // model level l3 size
   static LITE_THREAD_LOCAL std::vector<XPUL3CacheBlock*>

lite/backends/xpu/xpu_quantizer.cc

Lines changed: 17 additions & 3 deletions
@@ -21,12 +21,26 @@
 namespace paddle {
 namespace lite {
 
-static size_t Hashed(const void* cpu_data,
+template <typename T>
+static double AveGrowCompute(const T* in, const size_t length) {
+  const double eps = 1e-5;
+  double ave_grow_rate = 0.0f;
+  for (size_t i = 1; i < length; ++i) {
+    ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps);
+  }
+  ave_grow_rate /= (length + eps);
+  return ave_grow_rate;
+}
+
+template <typename T>
+static size_t Hashed(const T* cpu_data,
                      int numel,
                      const std::string& precision,
                      bool trans) {
   std::hash<const void*> ptr_hasher;
-  auto hash_res = ptr_hasher(cpu_data);
+  auto hash_res = ptr_hasher(reinterpret_cast<const void*>(cpu_data));
+  double ave_grow_rate = AveGrowCompute(cpu_data, numel);
+  CombineHash(ave_grow_rate, &hash_res);
   CombineHash(numel, &hash_res);
   CombineHash(precision, &hash_res);
   CombineHash(trans, &hash_res);
@@ -187,7 +201,7 @@ XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data,
   const std::string cpu_dtype = CppTypeToString<Tcpu>();
   const std::string xpu_dtype = CppTypeToString<Txpu>();
   const std::string precision = cpu_dtype + xpu_dtype;
-  auto hashed_key = Hashed(cpu_data, numel, precision, data_transpose);
+  auto hashed_key = Hashed<Tcpu>(cpu_data, numel, precision, data_transpose);
   VLOG(3) << "cpu_data=" << cpu_data << ", numel=" << numel
           << ", precision=" << precision << ", transpose=" << data_transpose
           << ", hashed_key=" << hashed_key;

lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h

Lines changed: 5 additions & 1 deletion
@@ -327,7 +327,11 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
       "conv2d_transpose",
       "elementwise_mul",
       "elementwise_add",
-      "reduce_mean"};
+      "reduce_mean",
+      "bilinear_interp",
+      "bilinear_interp_v2",
+      "nearest_interp",
+      "nearest_interp_v2"};
   const std::set<std::string> xpu_inplace_op_{"reshape",
                                               "reshape2",
                                               "flatten",

lite/kernels/xpu/interpolate_compute.cc

Lines changed: 91 additions & 40 deletions
@@ -24,7 +24,8 @@ namespace lite {
 namespace kernels {
 namespace xpu {
 
-void BilinearInterpCompute::Run() {
+template <typename InType, PrecisionType PType>
+void BilinearInterpCompute<InType, PType>::Run() {
   auto& param = this->template Param<param_t>();
   auto& ctx = this->ctx_->template As<XPUContext>();
   lite::Tensor* X = param.X;
@@ -47,22 +48,23 @@ void BilinearInterpCompute::Run() {
   } else {
     trans_mode = 2;
   }
-  int r = xdnn::interpolate2d<float>(ctx.GetRawContext(),
-                                     X->data<float>(),
-                                     Out->mutable_data<float>(TARGET(kXPU)),
-                                     n,
-                                     c,
-                                     in_h,
-                                     in_w,
-                                     out_h,
-                                     out_w,
-                                     false,
-                                     trans_mode,
-                                     true);
+  int r = xdnn::interpolate2d<InType>(ctx.GetRawContext(),
+                                      X->data<InType>(),
+                                      Out->mutable_data<InType>(TARGET(kXPU)),
+                                      n,
+                                      c,
+                                      in_h,
+                                      in_w,
+                                      out_h,
+                                      out_w,
+                                      false,
+                                      trans_mode,
+                                      true);
   CHECK_EQ(r, 0);
 }
 
-void NearestInterpCompute::Run() {
+template <typename InType, PrecisionType PType>
+void NearestInterpCompute<InType, PType>::Run() {
   auto& param = this->template Param<param_t>();
   auto& ctx = this->ctx_->template As<XPUContext>();
   lite::Tensor* X = param.X;
@@ -77,18 +79,18 @@ void NearestInterpCompute::Run() {
   bool align_corners = param.align_corners;
   int trans_mode = (align_corners == true) ? 0 : 2;
 
-  int r = xdnn::interpolate2d<float>(ctx.GetRawContext(),
-                                     X->data<float>(),
-                                     Out->mutable_data<float>(TARGET(kXPU)),
-                                     n,
-                                     c,
-                                     in_h,
-                                     in_w,
-                                     out_h,
-                                     out_w,
-                                     true,
-                                     trans_mode,
-                                     true);
+  int r = xdnn::interpolate2d<InType>(ctx.GetRawContext(),
+                                      X->data<InType>(),
+                                      Out->mutable_data<InType>(TARGET(kXPU)),
+                                      n,
+                                      c,
+                                      in_h,
+                                      in_w,
+                                      out_h,
+                                      out_w,
+                                      true,
+                                      trans_mode,
+                                      true);
 
   CHECK_EQ(r, 0);
 }
@@ -98,12 +100,40 @@ void NearestInterpCompute::Run() {
 }  // namespace lite
 }  // namespace paddle
 
+namespace xpu = paddle::lite::kernels::xpu;
+
+using BiliInterp_FP32 = xpu::BilinearInterpCompute<float, PRECISION(kFloat)>;
+using BiliInterp_FP16 = xpu::BilinearInterpCompute<float16, PRECISION(kFP16)>;
+using NearInterp_FP32 = xpu::NearestInterpCompute<float, PRECISION(kFloat)>;
+using NearInterp_FP16 = xpu::NearestInterpCompute<float16, PRECISION(kFP16)>;
+
+REGISTER_LITE_KERNEL(bilinear_interp, kXPU, kFloat, kNCHW, BiliInterp_FP32, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("OutSize",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("SizeTensor",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+
 REGISTER_LITE_KERNEL(bilinear_interp,
                      kXPU,
-                     kFloat,
+                     kFP16,
                      kNCHW,
-                     paddle::lite::kernels::xpu::BilinearInterpCompute,
-                     def)
+                     BiliInterp_FP16,
+                     DISABLE_XPU1_binterp_FP16)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindInput("OutSize",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("SizeTensor",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    bilinear_interp_v2, kXPU, kFloat, kNCHW, BiliInterp_FP32, def)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindInput("OutSize",
                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
@@ -115,10 +145,20 @@ REGISTER_LITE_KERNEL(bilinear_interp,
 
 REGISTER_LITE_KERNEL(bilinear_interp_v2,
                      kXPU,
-                     kFloat,
+                     kFP16,
                      kNCHW,
-                     paddle::lite::kernels::xpu::BilinearInterpCompute,
-                     def)
+                     BiliInterp_FP16,
+                     DISABLE_XPU1_binterp_v2_FP16)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindInput("OutSize",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("SizeTensor",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(nearest_interp, kXPU, kFloat, kNCHW, NearInterp_FP32, def)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindInput("OutSize",
                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
@@ -130,10 +170,21 @@ REGISTER_LITE_KERNEL(bilinear_interp_v2,
 
 REGISTER_LITE_KERNEL(nearest_interp,
                      kXPU,
-                     kFloat,
+                     kFP16,
                      kNCHW,
-                     paddle::lite::kernels::xpu::NearestInterpCompute,
-                     def)
+                     NearInterp_FP16,
+                     DISABLE_XPU1_ninterp_FP16)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindInput("OutSize",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("SizeTensor",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    nearest_interp_v2, kXPU, kFloat, kNCHW, NearInterp_FP32, def)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindInput("OutSize",
                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
@@ -145,15 +196,15 @@ REGISTER_LITE_KERNEL(nearest_interp,
 
 REGISTER_LITE_KERNEL(nearest_interp_v2,
                      kXPU,
-                     kFloat,
+                     kFP16,
                      kNCHW,
-                     paddle::lite::kernels::xpu::NearestInterpCompute,
-                     def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+                     NearInterp_FP16,
+                     DISABLE_XPU1_niterp_v2_FP16)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
     .BindInput("OutSize",
                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
    .BindInput("SizeTensor",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
     .Finalize();

lite/kernels/xpu/interpolate_compute.h

Lines changed: 4 additions & 4 deletions
@@ -20,16 +20,16 @@ namespace lite {
 namespace kernels {
 namespace xpu {
 
-class BilinearInterpCompute
-    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+template <typename InType, PrecisionType PType>
+class BilinearInterpCompute : public KernelLite<TARGET(kXPU), PType> {
  public:
   using param_t = operators::InterpolateParam;
   void Run() override;
   virtual ~BilinearInterpCompute() = default;
 };
 
-class NearestInterpCompute
-    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+template <typename InType, PrecisionType PType>
+class NearestInterpCompute : public KernelLite<TARGET(kXPU), PType> {
  public:
   using param_t = operators::InterpolateParam;
   void Run() override;
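
Because Run() is now a member of a class template and its definition lives in interpolate_compute.cc, the concrete kernels are instantiated in that same translation unit when the REGISTER_LITE_KERNEL blocks reference the FP32/FP16 aliases. A minimal standalone illustration of that header/source split, using generic types rather than Lite's actual macros:

  #include <iostream>

  // "Header" side: the kernel template only declares Run().
  template <typename InType, int PrecisionBits>
  class InterpKernel {
   public:
    void Run();
  };

  // "Source" side: the definition plus the aliases a registration macro would name;
  // referencing them below is what triggers instantiation.
  template <typename InType, int PrecisionBits>
  void InterpKernel<InType, PrecisionBits>::Run() {
    std::cout << "element size: " << sizeof(InType) << " bytes\n";
  }

  using InterpFP32 = InterpKernel<float, 32>;
  using InterpFP16 = InterpKernel<unsigned short, 16>;  // stand-in for float16

  int main() {
    InterpFP32{}.Run();  // instantiates the float kernel
    InterpFP16{}.Run();  // instantiates the half-precision-sized kernel
    return 0;
  }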
