Skip to content

[OpenCL] Support layout type: kImageFolder #7143

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion lite/api/tools/opt_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,14 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
} else if (target_repr == "opencl") {
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageFolder)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageFolder)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
Expand Down Expand Up @@ -145,10 +149,14 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
} else if (target_repr == "x86_opencl") {
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageFolder)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageFolder)});
valid_places_.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)});
valid_places_.emplace_back(Place{TARGET(kX86), PRECISION(kFloat)});
Expand Down Expand Up @@ -508,7 +516,7 @@ void OptBase::PrintAllSupportedOpsInMdformat() {
"英特尔FPGA",
"华为昇腾NPU",
"联发科APU",
"瑞芯微NPU ",
"瑞芯微NPU",
"华为麒麟NPU",
"颖脉NNA",
"晶晨NPU"};
Expand Down
103 changes: 103 additions & 0 deletions lite/backends/opencl/cl_kernel/image/layout_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -311,3 +311,106 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
out[index + size_ch * 3] = convert_uchar_sat(in.w);
}
}

////////////////////////////////////////////////////////
// image2d_default -> image2d_folder
////////////////////////////////////////////////////////
__kernel void image2d_default_to_image2d_folder(__read_only image2d_t input,
__write_only image2d_t output,
__private const int in_img_w,
__private const int in_img_h) {
const int pos_x = get_global_id(0);
const int pos_y = get_global_id(1);

CL_DTYPE4 in =
READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x, pos_y));

CL_DTYPE4 in0 = 0.f;
CL_DTYPE4 in1 = 0.f;
CL_DTYPE4 in2 = 0.f;
CL_DTYPE4 in3 = 0.f;

in0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x * 4, pos_y));
if (pos_x * 4 + 1 < in_img_w) {
in1 = READ_IMG_TYPE(
CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x * 4 + 1, pos_y));
}
if (pos_x * 4 + 2 < in_img_w) {
in2 = READ_IMG_TYPE(
CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x * 4 + 2, pos_y));
}
if (pos_x * 4 + 3 < in_img_w) {
in3 = READ_IMG_TYPE(
CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x * 4 + 3, pos_y));
}

CL_DTYPE4 out = (CL_DTYPE4)(in0.x, in1.x, in2.x, in3.x);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x, pos_y), out);
}

////////////////////////////////////////////////////////
// image2d_folder -> image2d_default
////////////////////////////////////////////////////////
__kernel void image2d_folder_to_image2d_default(__read_only image2d_t input,
__write_only image2d_t output,
__private const int out_img_w,
__private const int out_img_h) {
const int pos_x = get_global_id(0);
const int pos_y = get_global_id(1);

CL_DTYPE4 in =
READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x, pos_y));

CL_DTYPE4 out0 = 0.f;
CL_DTYPE4 out1 = 0.f;
CL_DTYPE4 out2 = 0.f;
CL_DTYPE4 out3 = 0.f;
out0.x = in.x;
out1.x = in.y;
out2.x = in.z;
out3.x = in.w;

WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x * 4, pos_y), out0);
if (pos_x * 4 + 1 < out_img_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x * 4 + 1, pos_y), out1);
}
if (pos_x * 4 + 2 < out_img_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x * 4 + 2, pos_y), out2);
}
if (pos_x * 4 + 3 < out_img_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x * 4 + 3, pos_y), out3);
}
}

////////////////////////////////////////////////////////
// image2d_folder -> buffer
////////////////////////////////////////////////////////
__kernel void image2d_folder_to_buffer(__read_only image2d_t input,
__global float* output,
__private const int out_h,
__private const int out_w) {
const int pos_x = get_global_id(0);
const int pos_y = get_global_id(1);

CL_DTYPE4 in =
READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, (int2)(pos_x, pos_y));

float4 out0 = 0.f;
float4 out1 = 0.f;
float4 out2 = 0.f;
float4 out3 = 0.f;
float4 out = convert_float4(in);

int outpos_base = out_w * pos_y + pos_x * 4;
int length = out_w * out_h;
output[outpos_base] = out.x;
if (outpos_base + 1 < length) {
output[outpos_base + 1] = out.y;
}
if (outpos_base + 2 < length) {
output[outpos_base + 2] = out.z;
}
if (outpos_base + 3 < length) {
output[outpos_base + 3] = out.w;
}
}
24 changes: 22 additions & 2 deletions lite/core/optimizer/mir/type_layout_cast_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -89,14 +89,33 @@ void TypeLayoutTransformPass::ComplementInputs(
};
auto* in_arg_type = const_cast<Type*>(in->AsArg().type);
if (is_host(in_arg_type->target()) &&
in_arg_type->layout() == DATALAYOUT(kImageDefault)) {
(in_arg_type->layout() == DATALAYOUT(kImageDefault) ||
in_arg_type->layout() == DATALAYOUT(kImageFolder))) {
return;
}

if (!DataLayoutCompatible(*in->AsArg().type, *decl_arg_type)) {
VLOG(4) << "found Layout unmatched tensor: " << in->AsArg().name
<< " for kernel " << inst.op()->DebugString() << " "
<< *in->AsArg().type << " -> " << *decl_arg_type;

// Special case for opencl:
// Data layout of kImageDefault is the same as kImageFolder when the size of
// tensor's dims is greater than 2.
auto a = (*in->AsArg().type).layout();
auto b = (*decl_arg_type).layout();
const auto& tensor =
inst.op()->scope()->FindVar(in->AsArg().name)->Get<Tensor>();
const bool skip_flag = (((a == DATALAYOUT(kImageDefault)) &&
(b == DATALAYOUT(kImageFolder))) ||
((a == DATALAYOUT(kImageFolder)) &&
(b == DATALAYOUT(kImageDefault)))) &&
(tensor.dims().size() > 2);
if (skip_flag) {
VLOG(3) << "skip this case";
return;
}

AddLayoutInst(*in->AsArg().type,
*decl_arg_type,
in,
Expand Down Expand Up @@ -185,7 +204,8 @@ void TypeLayoutTransformPass::AddLayoutInst(
(TargetCompatibleTo(*in_arg_ty, from) &&
/* skip precision check: PrecisionCompatibleTo(*in_arg_ty, from) &&*/
DeviceCompatibleTo(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout())) {
DataLayoutCompatible(*in_arg_ty, from) &&
(out_arg_ty->layout() == to.layout()))) {
is_found = true;
} else if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout()) {
Expand Down
54 changes: 45 additions & 9 deletions lite/core/profile/precision_profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -334,12 +334,6 @@ class PrecisionProfiler {
switch (layout_type) {
case DATALAYOUT(kImageDefault): {
auto in_dims = in->dims();
// special case
if ((in_dims.size() == 2) &&
(op_name == "fc" || op_name == "softmax")) {
in_dims = DDim(std::vector<DDim::value_type>(
{in->dims()[0], in->dims()[1], 1, 1}));
}
paddle::lite::CLImageConverterDefault default_convertor;
auto image_shape = default_convertor.InitImageDimInfoWith(in_dims);
size_t im_w = image_shape[0];
Expand All @@ -364,9 +358,6 @@ class PrecisionProfiler {
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
// TODO(zhaoyang-star): Tensor shape padding mode will change from
// high-dim padding to low-dim padding to fit image2d.
// ImageConverter will be changed.
default_convertor.ImageToNCHW(
in_data_v, real_out_v.data(), image_shape, in_dims);
CHECK(real_out_v.size() == in->numel());
Expand All @@ -386,6 +377,51 @@ class PrecisionProfiler {
}
return;
}
case DATALAYOUT(kImageFolder): {
auto in_dims = in->dims();
paddle::lite::CLImageConverterFolder folder_convertor;
auto image_shape = folder_convertor.InitImageDimInfoWith(in_dims);
size_t im_w = image_shape[0];
size_t im_h = image_shape[1];
VLOG(1) << "image shape(W,H) of " << name << ": " << im_w << " "
<< im_h;
auto* in_data_v =
use_fp16
? static_cast<void*>(
calloc(im_w * im_h * 4, sizeof(uint16_t)))
: static_cast<void*>(calloc(im_w * im_h * 4, sizeof(float)));

std::vector<float> real_out_v(in->numel());
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(in_data_v,
use_fp16
? in->data<uint16_t, cl::Image2D>()
: in->data<float, cl::Image2D>(),
im_w,
im_h,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
folder_convertor.ImageToNCHW(
in_data_v, real_out_v.data(), image_shape, in_dims);
CHECK(real_out_v.size() == in->numel());
*mean = compute_mean<float>(real_out_v.data(), real_out_v.size());
*std_dev = compute_standard_deviation<float>(
real_out_v.data(), in->numel(), true, *mean);
*ave_grow_rate = compute_average_grow_rate<float>(real_out_v.data(),
real_out_v.size());
std::shared_ptr<lite::Tensor> real_out_t(new lite::Tensor);
real_out_t->Resize(in_dims);
float* real_out_data = real_out_t->mutable_data<float>();
memcpy(real_out_data,
real_out_v.data(),
real_out_v.size() * sizeof(float));
if (write_result_to_file) {
write_tensorfile<float>(real_out_t.get(), name, log_dir_);
}
return;
}
case DATALAYOUT(kNCHW): {
std::vector<float> in_data_v(in->numel(), 0);
TargetWrapperCL::MemcpySync(in_data_v.data(),
Expand Down
9 changes: 6 additions & 3 deletions lite/core/type_system.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,15 +194,18 @@ static bool DataLayoutCompatibleTo(const Type& a, const Type& b) {
return a.IsVoid() || //
(a.layout() == b.layout() || //
((b.layout() == DATALAYOUT(kAny)) &&
(a.layout() != DATALAYOUT(kImageDefault))));
(a.layout() != DATALAYOUT(kImageDefault) &&
a.layout() != DATALAYOUT(kImageFolder))));
}
static bool DataLayoutCompatible(const Type& a, const Type& b) {
return a.IsVoid() || b.IsVoid() || //
(a.layout() == b.layout() || //
((b.layout() == DATALAYOUT(kAny)) &&
(a.layout() != DATALAYOUT(kImageDefault))) ||
(a.layout() != DATALAYOUT(kImageDefault) &&
a.layout() != DATALAYOUT(kImageFolder))) ||
((a.layout() == DATALAYOUT(kAny)) &&
(b.layout() != DATALAYOUT(kImageDefault))));
(b.layout() != DATALAYOUT(kImageDefault) &&
b.layout() != DATALAYOUT(kImageFolder))));
}

static bool PrecisionCompatibleTo(const Type& a, const Type& b) {
Expand Down
8 changes: 4 additions & 4 deletions lite/kernels/opencl/fc_image_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ namespace opencl {

class FcImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
DATALAYOUT(kImageFolder)> {
public:
void PrepareForRun() override {
auto& param = this->Param<operators::FcParam>();
Expand Down Expand Up @@ -317,18 +317,18 @@ class FcImageCompute : public KernelLite<TARGET(kOpenCL),
REGISTER_LITE_KERNEL(fc,
kOpenCL,
kFP16,
kImageDefault,
kImageFolder,
paddle::lite::kernels::opencl::FcImageCompute,
image2d)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
DATALAYOUT(kImageFolder))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("W", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Alpha", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
DATALAYOUT(kImageFolder))})
.Finalize();
4 changes: 2 additions & 2 deletions lite/kernels/opencl/fc_image_compute_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ void test(const lite_api::CLPrecisionType p,
<< " m=" << m << " n=" << n << " k=" << k;

auto kernels = KernelRegistry::Global().Create(
"fc", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault));
"fc", TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageFolder));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());

Expand Down Expand Up @@ -238,4 +238,4 @@ TEST(fc, compute_basic) {
} // namespace lite
} // namespace paddle

USE_LITE_KERNEL(fc, kOpenCL, kFP16, kImageDefault, image2d);
USE_LITE_KERNEL(fc, kOpenCL, kFP16, kImageFolder, image2d);
Loading