Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions lite/api/paddle_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,16 @@ void CxxConfig::set_xpu_l3_cache_method(size_t l3_size, bool locked) {
#endif
}

// Enables or disables L3-cache auto-tuning for the XPU backend.
// With LITE_WITH_XPU defined, the flag is stored into the thread-local
// TargetWrapperXPU::local_l3_autotune, which gates the
// l3_planner_->run_autotune(...) calls in TargetWrapperXPU::FreeL3Cache.
// Without XPU support the call is a no-op that only emits a warning.
// NOTE(review): the flag is thread-local, so this presumably affects only
// predictors created on the calling thread — confirm intended scope.
void CxxConfig::set_xpu_l3_cache_autotune(bool autotune) {
#ifdef LITE_WITH_XPU
  lite::TargetWrapperXPU::local_l3_autotune = autotune;
#else
  LOG(WARNING) << "The invoking of the function "
                  "'set_xpu_l3_cache_autotune' is ignored, please "
                  "rebuild it with LITE_WITH_XPU=ON.";
#endif
}

void set_xpu_gm_workspace_method(size_t gm_size) {
#ifdef LITE_WITH_XPU
lite::TargetWrapperXPU::local_gm_size = gm_size;
Expand Down
1 change: 1 addition & 0 deletions lite/api/paddle_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,7 @@ class LITE_API CxxConfig : public ConfigBase {
// **DEPRECATED**, use set_xpu_l3_cache_method() in the future
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0x4000000);
void set_xpu_l3_cache_method(size_t l3_size, bool locked = false);
void set_xpu_l3_cache_autotune(bool autotune = true);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

是公开api吗,公开api 需要补到C++ 文档中

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

并补充相关单测到paddle_api_test.cc

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

相关的python API 需要考虑是否补充

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

收到,该接口的使用说明在下个pr中补充


void set_xpu_gm_workspace_method(size_t gm_size);

Expand Down
9 changes: 7 additions & 2 deletions lite/backends/xpu/target_wrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,16 @@ void TargetWrapperXPU::FreeL3Cache() {
local_l3_ptr_ = nullptr;
XPU_CALL(tls_raw_ctx_->_l3_mgr.set(nullptr, 0));
}
l3_planner_->run_autotune(l3_block_dict, local_l3_size);
if (local_l3_autotune) {
l3_planner_->run_autotune(l3_block_dict, local_l3_size);
}
} else if (need_l3_mutex && TargetWrapperXPU::IsSharedL3Created()) {
XPU_CALL(xpu_wait(TargetWrapperXPU::get_xpu_stream()));
XPU_CALL(tls_raw_ctx_->_l3_mgr.set(nullptr, 0));
mutex_l3_.unlock();
l3_planner_->run_autotune(l3_block_dict, shared_l3_size);
if (local_l3_autotune) {
l3_planner_->run_autotune(l3_block_dict, shared_l3_size);
}
}
for (size_t i = 0; i < l3_block_dict.size(); i++) {
l3_block_dict[i]->clear();
Expand Down Expand Up @@ -168,6 +172,7 @@ LITE_THREAD_LOCAL std::string TargetWrapperXPU::conv_autotune_file;
LITE_THREAD_LOCAL bool TargetWrapperXPU::need_l3_mutex{false};
LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_l3_size{
std::numeric_limits<size_t>::max()};
LITE_THREAD_LOCAL bool TargetWrapperXPU::local_l3_autotune{true};
LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_gm_size{
0x4000000}; // 64 * 1024 * 1024
LITE_THREAD_LOCAL void* TargetWrapperXPU::local_l3_ptr_{nullptr};
Expand Down
1 change: 1 addition & 0 deletions lite/backends/xpu/target_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ class TargetWrapper<TARGET(kXPU)> {
// l3 cache config
static LITE_THREAD_LOCAL bool need_l3_mutex; // model level l3 size
static LITE_THREAD_LOCAL size_t local_l3_size; // model level l3 size
static LITE_THREAD_LOCAL bool local_l3_autotune;
static LITE_THREAD_LOCAL size_t local_gm_size;
static size_t shared_l3_size; // model level l3 size
static LITE_THREAD_LOCAL std::vector<XPUL3CacheBlock*>
Expand Down
20 changes: 17 additions & 3 deletions lite/backends/xpu/xpu_quantizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,26 @@
namespace paddle {
namespace lite {

static size_t Hashed(const void* cpu_data,
// Mean relative growth rate of consecutive elements of `in`:
//   sum over i in [1, length) of (in[i] - in[i-1]) / (in[i-1] + eps),
// divided by (length + eps). The eps term guards against division by
// zero (both for zero-valued elements and for length == 0) and makes
// the result a cheap, deterministic fingerprint of the buffer contents
// for use in hashing.
template <typename T>
static double AveGrowCompute(const T* in, const size_t length) {
  constexpr double kEps = 1e-5;
  double total = 0.0;
  for (size_t idx = 1; idx < length; ++idx) {
    total += (in[idx] - in[idx - 1]) / (in[idx - 1] + kEps);
  }
  return total / (length + kEps);
}

template <typename T>
static size_t Hashed(const T* cpu_data,
int numel,
const std::string& precision,
bool trans) {
std::hash<const void*> ptr_hasher;
auto hash_res = ptr_hasher(cpu_data);
auto hash_res = ptr_hasher(reinterpret_cast<const void*>(cpu_data));
double ave_grow_rate = AveGrowCompute(cpu_data, numel);
CombineHash(ave_grow_rate, &hash_res);
CombineHash(numel, &hash_res);
CombineHash(precision, &hash_res);
CombineHash(trans, &hash_res);
Expand Down Expand Up @@ -187,7 +201,7 @@ XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data,
const std::string cpu_dtype = CppTypeToString<Tcpu>();
const std::string xpu_dtype = CppTypeToString<Txpu>();
const std::string precision = cpu_dtype + xpu_dtype;
auto hashed_key = Hashed(cpu_data, numel, precision, data_transpose);
auto hashed_key = Hashed<Tcpu>(cpu_data, numel, precision, data_transpose);
VLOG(3) << "cpu_data=" << cpu_data << ", numel=" << numel
<< ", precision=" << precision << ", transpose=" << data_transpose
<< ", hashed_key=" << hashed_key;
Expand Down
6 changes: 5 additions & 1 deletion lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,11 @@ class XPUStaticKernelPickPass : public mir::StmtPass {
"conv2d_transpose",
"elementwise_mul",
"elementwise_add",
"reduce_mean"};
"reduce_mean",
"bilinear_interp",
"bilinear_interp_v2",
"nearest_interp",
"nearest_interp_v2"};
const std::set<std::string> xpu_inplace_op_{"reshape",
"reshape2",
"flatten",
Expand Down
131 changes: 91 additions & 40 deletions lite/kernels/xpu/interpolate_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ namespace lite {
namespace kernels {
namespace xpu {

void BilinearInterpCompute::Run() {
template <typename InType, PrecisionType PType>
void BilinearInterpCompute<InType, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();
lite::Tensor* X = param.X;
Expand All @@ -47,22 +48,23 @@ void BilinearInterpCompute::Run() {
} else {
trans_mode = 2;
}
int r = xdnn::interpolate2d<float>(ctx.GetRawContext(),
X->data<float>(),
Out->mutable_data<float>(TARGET(kXPU)),
n,
c,
in_h,
in_w,
out_h,
out_w,
false,
trans_mode,
true);
int r = xdnn::interpolate2d<InType>(ctx.GetRawContext(),
X->data<InType>(),
Out->mutable_data<InType>(TARGET(kXPU)),
n,
c,
in_h,
in_w,
out_h,
out_w,
false,
trans_mode,
true);
CHECK_EQ(r, 0);
}

void NearestInterpCompute::Run() {
template <typename InType, PrecisionType PType>
void NearestInterpCompute<InType, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();
lite::Tensor* X = param.X;
Expand All @@ -77,18 +79,18 @@ void NearestInterpCompute::Run() {
bool align_corners = param.align_corners;
int trans_mode = (align_corners == true) ? 0 : 2;

int r = xdnn::interpolate2d<float>(ctx.GetRawContext(),
X->data<float>(),
Out->mutable_data<float>(TARGET(kXPU)),
n,
c,
in_h,
in_w,
out_h,
out_w,
true,
trans_mode,
true);
int r = xdnn::interpolate2d<InType>(ctx.GetRawContext(),
X->data<InType>(),
Out->mutable_data<InType>(TARGET(kXPU)),
n,
c,
in_h,
in_w,
out_h,
out_w,
true,
trans_mode,
true);

CHECK_EQ(r, 0);
}
Expand All @@ -98,12 +100,40 @@ void NearestInterpCompute::Run() {
} // namespace lite
} // namespace paddle

namespace xpu = paddle::lite::kernels::xpu;

using BiliInterp_FP32 = xpu::BilinearInterpCompute<float, PRECISION(kFloat)>;
using BiliInterp_FP16 = xpu::BilinearInterpCompute<float16, PRECISION(kFP16)>;
using NearInterp_FP32 = xpu::NearestInterpCompute<float, PRECISION(kFloat)>;
using NearInterp_FP16 = xpu::NearestInterpCompute<float16, PRECISION(kFP16)>;

REGISTER_LITE_KERNEL(bilinear_interp, kXPU, kFloat, kNCHW, BiliInterp_FP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(bilinear_interp,
kXPU,
kFloat,
kFP16,
kNCHW,
paddle::lite::kernels::xpu::BilinearInterpCompute,
def)
BiliInterp_FP16,
DISABLE_XPU1_binterp_FP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
bilinear_interp_v2, kXPU, kFloat, kNCHW, BiliInterp_FP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
Expand All @@ -115,10 +145,20 @@ REGISTER_LITE_KERNEL(bilinear_interp,

REGISTER_LITE_KERNEL(bilinear_interp_v2,
kXPU,
kFloat,
kFP16,
kNCHW,
paddle::lite::kernels::xpu::BilinearInterpCompute,
def)
BiliInterp_FP16,
DISABLE_XPU1_binterp_v2_FP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(nearest_interp, kXPU, kFloat, kNCHW, NearInterp_FP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
Expand All @@ -130,10 +170,21 @@ REGISTER_LITE_KERNEL(bilinear_interp_v2,

REGISTER_LITE_KERNEL(nearest_interp,
kXPU,
kFloat,
kFP16,
kNCHW,
paddle::lite::kernels::xpu::NearestInterpCompute,
def)
NearInterp_FP16,
DISABLE_XPU1_ninterp_FP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();

REGISTER_LITE_KERNEL(
nearest_interp_v2, kXPU, kFloat, kNCHW, NearInterp_FP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
Expand All @@ -145,15 +196,15 @@ REGISTER_LITE_KERNEL(nearest_interp,

REGISTER_LITE_KERNEL(nearest_interp_v2,
kXPU,
kFloat,
kFP16,
kNCHW,
paddle::lite::kernels::xpu::NearestInterpCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
NearInterp_FP16,
DISABLE_XPU1_niterp_v2_FP16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("OutSize",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.Finalize();
8 changes: 4 additions & 4 deletions lite/kernels/xpu/interpolate_compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,16 @@ namespace lite {
namespace kernels {
namespace xpu {

class BilinearInterpCompute
: public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename InType, PrecisionType PType>
class BilinearInterpCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::InterpolateParam;
void Run() override;
virtual ~BilinearInterpCompute() = default;
};

class NearestInterpCompute
: public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename InType, PrecisionType PType>
class NearestInterpCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::InterpolateParam;
void Run() override;
Expand Down
Loading