Commit dfd2b28

[XPU] add interpolate fp16, fix reshape bug, add l3_autotune api (#9259)
1 parent fd9a5d3 commit dfd2b28

File tree: 10 files changed (+228 / -50 lines)

lite/api/paddle_api.cc

Lines changed: 10 additions & 0 deletions
@@ -561,6 +561,16 @@ void CxxConfig::set_xpu_l3_cache_method(size_t l3_size, bool locked) {
 #endif
 }
 
+void CxxConfig::set_xpu_l3_cache_autotune(bool autotune) {
+#ifdef LITE_WITH_XPU
+  lite::TargetWrapperXPU::local_l3_autotune = autotune;
+#else
+  LOG(WARNING) << "The invoking of the function "
+                  "'set_xpu_l3_cache_autotune' is ignored, please "
+                  "rebuild it with LITE_WITH_XPU=ON.";
+#endif
+}
+
 void set_xpu_gm_workspace_method(size_t gm_size) {
 #ifdef LITE_WITH_XPU
   lite::TargetWrapperXPU::local_gm_size = gm_size;

lite/api/paddle_api.h

Lines changed: 1 addition & 0 deletions
@@ -468,6 +468,7 @@ class LITE_API CxxConfig : public ConfigBase {
   // **DEPRECATED**, use set_xpu_l3_cache_method() in the future
   void set_xpu_workspace_l3_size_per_thread(int l3_size = 0x4000000);
   void set_xpu_l3_cache_method(size_t l3_size, bool locked = false);
+  void set_xpu_l3_cache_autotune(bool autotune = true);
 
   void set_xpu_gm_workspace_method(size_t gm_size);
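
For context, a minimal host-side sketch of how the new switch could be used, assuming the usual Paddle Lite full-API (CxxConfig) flow; the model directory and valid places below are illustrative placeholders, not part of this commit:

  #include <memory>
  #include "paddle_api.h"  // Paddle Lite full API

  using namespace paddle::lite_api;  // NOLINT

  int main() {
    CxxConfig config;
    config.set_model_dir("./mobilenet_v1");  // hypothetical model directory
    config.set_valid_places({Place{TARGET(kXPU), PRECISION(kFloat)},
                             Place{TARGET(kHost), PRECISION(kFloat)}});
    // Reserve some L3 cache for this predictor (16 MiB here).
    config.set_xpu_l3_cache_method(16 * 1024 * 1024, /*locked=*/false);
    // New in this commit: turn the L3 plan autotuning step off (it defaults to true).
    config.set_xpu_l3_cache_autotune(false);
    auto predictor = CreatePaddlePredictor<CxxConfig>(config);
    return 0;
  }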

lite/backends/xpu/target_wrapper.cc

Lines changed: 7 additions & 2 deletions
@@ -127,12 +127,16 @@ void TargetWrapperXPU::FreeL3Cache() {
       local_l3_ptr_ = nullptr;
       XPU_CALL(tls_raw_ctx_->_l3_mgr.set(nullptr, 0));
     }
-    l3_planner_->run_autotune(l3_block_dict, local_l3_size);
+    if (local_l3_autotune) {
+      l3_planner_->run_autotune(l3_block_dict, local_l3_size);
+    }
   } else if (need_l3_mutex && TargetWrapperXPU::IsSharedL3Created()) {
     XPU_CALL(xpu_wait(TargetWrapperXPU::get_xpu_stream()));
     XPU_CALL(tls_raw_ctx_->_l3_mgr.set(nullptr, 0));
     mutex_l3_.unlock();
-    l3_planner_->run_autotune(l3_block_dict, shared_l3_size);
+    if (local_l3_autotune) {
+      l3_planner_->run_autotune(l3_block_dict, shared_l3_size);
+    }
   }
   for (size_t i = 0; i < l3_block_dict.size(); i++) {
     l3_block_dict[i]->clear();
@@ -168,6 +172,7 @@ LITE_THREAD_LOCAL std::string TargetWrapperXPU::conv_autotune_file;
 LITE_THREAD_LOCAL bool TargetWrapperXPU::need_l3_mutex{false};
 LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_l3_size{
     std::numeric_limits<size_t>::max()};
+LITE_THREAD_LOCAL bool TargetWrapperXPU::local_l3_autotune{true};
 LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_gm_size{
     0x4000000};  // 64 * 1024 * 1024
 LITE_THREAD_LOCAL void* TargetWrapperXPU::local_l3_ptr_{nullptr};

lite/backends/xpu/target_wrapper.h

Lines changed: 1 addition & 0 deletions
@@ -179,6 +179,7 @@ class TargetWrapper<TARGET(kXPU)> {
   // l3 cache config
   static LITE_THREAD_LOCAL bool need_l3_mutex;    // model level l3 size
   static LITE_THREAD_LOCAL size_t local_l3_size;  // model level l3 size
+  static LITE_THREAD_LOCAL bool local_l3_autotune;
   static LITE_THREAD_LOCAL size_t local_gm_size;
   static size_t shared_l3_size;  // model level l3 size
   static LITE_THREAD_LOCAL std::vector<XPUL3CacheBlock*>

lite/backends/xpu/xpu_quantizer.cc

Lines changed: 17 additions & 3 deletions
@@ -21,12 +21,26 @@
 namespace paddle {
 namespace lite {
 
-static size_t Hashed(const void* cpu_data,
+template <typename T>
+static double AveGrowCompute(const T* in, const size_t length) {
+  const double eps = 1e-5;
+  double ave_grow_rate = 0.0f;
+  for (size_t i = 1; i < length; ++i) {
+    ave_grow_rate += (in[i] - in[i - 1]) / (in[i - 1] + eps);
+  }
+  ave_grow_rate /= (length + eps);
+  return ave_grow_rate;
+}
+
+template <typename T>
+static size_t Hashed(const T* cpu_data,
                      int numel,
                      const std::string& precision,
                      bool trans) {
   std::hash<const void*> ptr_hasher;
-  auto hash_res = ptr_hasher(cpu_data);
+  auto hash_res = ptr_hasher(reinterpret_cast<const void*>(cpu_data));
+  double ave_grow_rate = AveGrowCompute(cpu_data, numel);
+  CombineHash(ave_grow_rate, &hash_res);
   CombineHash(numel, &hash_res);
   CombineHash(precision, &hash_res);
   CombineHash(trans, &hash_res);
@@ -187,7 +201,7 @@ XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data,
   const std::string cpu_dtype = CppTypeToString<Tcpu>();
   const std::string xpu_dtype = CppTypeToString<Txpu>();
   const std::string precision = cpu_dtype + xpu_dtype;
-  auto hashed_key = Hashed(cpu_data, numel, precision, data_transpose);
+  auto hashed_key = Hashed<Tcpu>(cpu_data, numel, precision, data_transpose);
   VLOG(3) << "cpu_data=" << cpu_data << ", numel=" << numel
           << ", precision=" << precision << ", transpose=" << data_transpose
           << ", hashed_key=" << hashed_key;

lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h

Lines changed: 5 additions & 1 deletion
@@ -327,7 +327,11 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
       "conv2d_transpose",
       "elementwise_mul",
       "elementwise_add",
-      "reduce_mean"};
+      "reduce_mean",
+      "bilinear_interp",
+      "bilinear_interp_v2",
+      "nearest_interp",
+      "nearest_interp_v2"};
   const std::set<std::string> xpu_inplace_op_{"reshape",
                                               "reshape2",
                                               "flatten",

lite/kernels/xpu/interpolate_compute.cc

Lines changed: 91 additions & 40 deletions
@@ -24,7 +24,8 @@ namespace lite {
 namespace kernels {
 namespace xpu {
 
-void BilinearInterpCompute::Run() {
+template <typename InType, PrecisionType PType>
+void BilinearInterpCompute<InType, PType>::Run() {
   auto& param = this->template Param<param_t>();
   auto& ctx = this->ctx_->template As<XPUContext>();
   lite::Tensor* X = param.X;
@@ -47,22 +48,23 @@ void BilinearInterpCompute::Run() {
   } else {
     trans_mode = 2;
   }
-  int r = xdnn::interpolate2d<float>(ctx.GetRawContext(),
-                                     X->data<float>(),
-                                     Out->mutable_data<float>(TARGET(kXPU)),
-                                     n,
-                                     c,
-                                     in_h,
-                                     in_w,
-                                     out_h,
-                                     out_w,
-                                     false,
-                                     trans_mode,
-                                     true);
+  int r = xdnn::interpolate2d<InType>(ctx.GetRawContext(),
+                                      X->data<InType>(),
+                                      Out->mutable_data<InType>(TARGET(kXPU)),
+                                      n,
+                                      c,
+                                      in_h,
+                                      in_w,
+                                      out_h,
+                                      out_w,
+                                      false,
+                                      trans_mode,
+                                      true);
   CHECK_EQ(r, 0);
 }
 
-void NearestInterpCompute::Run() {
+template <typename InType, PrecisionType PType>
+void NearestInterpCompute<InType, PType>::Run() {
   auto& param = this->template Param<param_t>();
   auto& ctx = this->ctx_->template As<XPUContext>();
   lite::Tensor* X = param.X;
@@ -77,18 +79,18 @@ void NearestInterpCompute::Run() {
   bool align_corners = param.align_corners;
   int trans_mode = (align_corners == true) ? 0 : 2;
 
-  int r = xdnn::interpolate2d<float>(ctx.GetRawContext(),
-                                     X->data<float>(),
-                                     Out->mutable_data<float>(TARGET(kXPU)),
-                                     n,
-                                     c,
-                                     in_h,
-                                     in_w,
-                                     out_h,
-                                     out_w,
-                                     true,
-                                     trans_mode,
-                                     true);
+  int r = xdnn::interpolate2d<InType>(ctx.GetRawContext(),
+                                      X->data<InType>(),
+                                      Out->mutable_data<InType>(TARGET(kXPU)),
+                                      n,
+                                      c,
+                                      in_h,
+                                      in_w,
+                                      out_h,
+                                      out_w,
+                                      true,
+                                      trans_mode,
+                                      true);
 
   CHECK_EQ(r, 0);
 }
@@ -98,12 +100,40 @@ void NearestInterpCompute::Run() {
 }  // namespace lite
 }  // namespace paddle
 
+namespace xpu = paddle::lite::kernels::xpu;
+
+using BiliInterp_FP32 = xpu::BilinearInterpCompute<float, PRECISION(kFloat)>;
+using BiliInterp_FP16 = xpu::BilinearInterpCompute<float16, PRECISION(kFP16)>;
+using NearInterp_FP32 = xpu::NearestInterpCompute<float, PRECISION(kFloat)>;
+using NearInterp_FP16 = xpu::NearestInterpCompute<float16, PRECISION(kFP16)>;
+
+REGISTER_LITE_KERNEL(bilinear_interp, kXPU, kFloat, kNCHW, BiliInterp_FP32, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("OutSize",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("SizeTensor",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+
 REGISTER_LITE_KERNEL(bilinear_interp,
                      kXPU,
-                     kFloat,
+                     kFP16,
                      kNCHW,
-                     paddle::lite::kernels::xpu::BilinearInterpCompute,
-                     def)
+                     BiliInterp_FP16,
+                     DISABLE_XPU1_binterp_FP16)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindInput("OutSize",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("SizeTensor",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    bilinear_interp_v2, kXPU, kFloat, kNCHW, BiliInterp_FP32, def)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindInput("OutSize",
                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
@@ -115,10 +145,20 @@ REGISTER_LITE_KERNEL(bilinear_interp,
 
 REGISTER_LITE_KERNEL(bilinear_interp_v2,
                      kXPU,
-                     kFloat,
+                     kFP16,
                      kNCHW,
-                     paddle::lite::kernels::xpu::BilinearInterpCompute,
-                     def)
+                     BiliInterp_FP16,
+                     DISABLE_XPU1_binterp_v2_FP16)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindInput("OutSize",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("SizeTensor",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(nearest_interp, kXPU, kFloat, kNCHW, NearInterp_FP32, def)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindInput("OutSize",
                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
@@ -130,10 +170,21 @@ REGISTER_LITE_KERNEL(bilinear_interp_v2,
 
 REGISTER_LITE_KERNEL(nearest_interp,
                      kXPU,
-                     kFloat,
+                     kFP16,
                      kNCHW,
-                     paddle::lite::kernels::xpu::NearestInterpCompute,
-                     def)
+                     NearInterp_FP16,
+                     DISABLE_XPU1_ninterp_FP16)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindInput("OutSize",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("SizeTensor",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    nearest_interp_v2, kXPU, kFloat, kNCHW, NearInterp_FP32, def)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindInput("OutSize",
                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
@@ -145,15 +196,15 @@ REGISTER_LITE_KERNEL(nearest_interp,
 
 REGISTER_LITE_KERNEL(nearest_interp_v2,
                      kXPU,
-                     kFloat,
+                     kFP16,
                      kNCHW,
-                     paddle::lite::kernels::xpu::NearestInterpCompute,
-                     def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+                     NearInterp_FP16,
+                     DISABLE_XPU1_niterp_v2_FP16)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
     .BindInput("OutSize",
                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
    .BindInput("SizeTensor",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
     .Finalize();

lite/kernels/xpu/interpolate_compute.h

Lines changed: 4 additions & 4 deletions
@@ -20,16 +20,16 @@ namespace lite {
 namespace kernels {
 namespace xpu {
 
-class BilinearInterpCompute
-    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+template <typename InType, PrecisionType PType>
+class BilinearInterpCompute : public KernelLite<TARGET(kXPU), PType> {
  public:
   using param_t = operators::InterpolateParam;
   void Run() override;
   virtual ~BilinearInterpCompute() = default;
 };
 
-class NearestInterpCompute
-    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+template <typename InType, PrecisionType PType>
+class NearestInterpCompute : public KernelLite<TARGET(kXPU), PType> {
  public:
   using param_t = operators::InterpolateParam;
   void Run() override;
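
Because Run() is now a member of a class template and its definition lives in interpolate_compute.cc, the concrete kernels are instantiated in that same translation unit when the REGISTER_LITE_KERNEL blocks reference the FP32/FP16 aliases. A minimal standalone illustration of that header/source split, using generic types rather than Lite's actual macros:

  #include <iostream>

  // "Header" side: the kernel template only declares Run().
  template <typename InType, int PrecisionBits>
  class InterpKernel {
   public:
    void Run();
  };

  // "Source" side: the definition plus the aliases a registration macro would name;
  // referencing them below is what triggers instantiation.
  template <typename InType, int PrecisionBits>
  void InterpKernel<InType, PrecisionBits>::Run() {
    std::cout << "element size: " << sizeof(InType) << " bytes\n";
  }

  using InterpFP32 = InterpKernel<float, 32>;
  using InterpFP16 = InterpKernel<unsigned short, 16>;  // stand-in for float16

  int main() {
    InterpFP32{}.Run();  // instantiates the float kernel
    InterpFP16{}.Run();  // instantiates the half-precision-sized kernel
    return 0;
  }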
