Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OpenCL] Fix pool_local kernel #8397

Merged
merged 1 commit into from
Feb 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 22 additions & 19 deletions lite/backends/opencl/cl_kernel/image/pool_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ __kernel void pool(__read_only image2d_t input,
__private const int stride_w,
__private const int pad_top,
__private const int pad_left,
__private const int exclusive,
__private const int adaptive) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
Expand All @@ -37,8 +38,8 @@ __kernel void pool(__read_only image2d_t input,
int pool_size = 1;
if (adaptive == 1) {
start_h = floor((out_h * in_height) / (float)out_height);
end_h = ceil(((out_h + 1) * in_height) / (float)out_height);
start_w = floor((out_w * in_width) / (float)out_width);
end_h = ceil(((out_h + 1) * in_height) / (float)out_height);
end_w = ceil(((out_w + 1) * in_width) / (float)out_width);
} else {
start_h = out_h * stride_h - pad_top;
Expand Down Expand Up @@ -67,10 +68,7 @@ __kernel void pool(__read_only image2d_t input,
}
}

#ifdef EXCLUSIVE
pool_size = (end_h - start_h) * (end_w - start_w);
#endif // EXCLUSIVE
if (adaptive == 1) {
if (exclusive == 1 || adaptive == 1) {
pool_size = (end_h - start_h) * (end_w - start_w);
}

Expand Down Expand Up @@ -110,6 +108,8 @@ __kernel void pool_local(__read_only image2d_t input,
__private const int stride_w,
__private const int pad_top,
__private const int pad_left,
__private const int exclusive,
__private const int adaptive,
__private const int local_block_size,
__private const int2 local_block_size_wh,
__private const int2 local_block_count_wh,
Expand All @@ -119,7 +119,7 @@ __kernel void pool_local(__read_only image2d_t input,
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_height;
// const int out_h = out_nh % out_height;
const int out_h = out_nh - mul24(out_h, out_height);
const int out_h = out_nh - mul24(out_n, out_height);

const int local_id = get_local_id(0);
const int local_width_id = local_id % local_block_size_wh.x;
Expand All @@ -131,6 +131,7 @@ __kernel void pool_local(__read_only image2d_t input,
const int input_width_start = mad24(out_w, stride_w, -pad_left);

#ifdef POOL_AVG
// 1. Get data from global memroy to local memory
__local float4* avg_output = (__local float4*)local_output;
avg_output[local_id] = (float4)0;
int pos_h = local_height_id;
Expand Down Expand Up @@ -161,6 +162,7 @@ __kernel void pool_local(__read_only image2d_t input,
}
barrier(CLK_LOCAL_MEM_FENCE);

// 2. Reduce in each workgroup
for (int stride_h = (local_block_size_wh.y >> 1); stride_h > 0;
stride_h >>= 1) {
if (local_height_id < stride_h) {
Expand All @@ -169,7 +171,6 @@ __kernel void pool_local(__read_only image2d_t input,
}
barrier(CLK_LOCAL_MEM_FENCE);
}

for (int stride_w = (local_block_size_wh.x >> 1); stride_w > 0;
stride_w >>= 1) {
if (local_height_id == 0 && local_width_id < stride_w) {
Expand All @@ -179,16 +180,18 @@ __kernel void pool_local(__read_only image2d_t input,
}

if (local_id == 0) {
const int kernel_height_start = max(0, input_height_start);
const int kernel_width_start = max(0, input_width_start);
const int kernel_height_end = min(input_height_start + ksize_h, in_height);
const int kernel_width_end = min(input_width_start + ksize_w, in_width);
#ifdef EXCLUSIVE
const int block_size = mul24((kernel_height_end - kernel_height_start),
(kernel_width_end - kernel_width_start));
#else
const int block_size = ksize_w * ksize_h;
#endif // EXCLUSIVE
int block_size;
if (exclusive == 1 || adaptive == 1) {
const int kernel_height_start = max(0, input_height_start);
const int kernel_width_start = max(0, input_width_start);
const int kernel_height_end =
min(input_height_start + ksize_h, in_height);
const int kernel_width_end = min(input_width_start + ksize_w, in_width);
block_size = mul24((kernel_height_end - kernel_height_start),
(kernel_width_end - kernel_width_start));
} else {
block_size = ksize_w * ksize_h;
}
avg_output[local_id] = avg_output[local_id] / (float)block_size;

const int output_channel_width_idx = mad24(out_c, out_width, out_w);
Expand All @@ -201,6 +204,7 @@ __kernel void pool_local(__read_only image2d_t input,
CL_DTYPE_CHAR, output, (int2)(output_channel_width_idx, out_nh), res);
}
#else
// 1. Get data from global memroy to local memory
local_output[local_id] = (CL_DTYPE4)(-FLT_MAX);
int pos_h = local_height_id;

Expand Down Expand Up @@ -236,9 +240,9 @@ __kernel void pool_local(__read_only image2d_t input,
}
pos_h += local_block_size_wh.y;
}

barrier(CLK_LOCAL_MEM_FENCE);

// 2. Reduce in each workgroup
for (int stride_h = (local_block_size_wh.y >> 1); stride_h > 0;
stride_h >>= 1) {
if (local_height_id < stride_h) {
Expand All @@ -248,7 +252,6 @@ __kernel void pool_local(__read_only image2d_t input,
}
barrier(CLK_LOCAL_MEM_FENCE);
}

for (int stride_w = (local_block_size_wh.x >> 1); stride_w > 0;
stride_w >>= 1) {
if (local_height_id == 0 && local_width_id < stride_w) {
Expand Down
70 changes: 36 additions & 34 deletions lite/kernels/opencl/pool_image_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
const auto& in_dims = param.x->dims();
const auto& out_dims = param.output->dims();
const bool global_pooling = param.global_pooling;
const bool exclusive = param.exclusive;
const bool adaptive = param.adaptive;
const std::string padding_algorithm = param.padding_algorithm;
const std::vector<int>& ksize = param.ksize;
Expand Down Expand Up @@ -66,9 +65,6 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
param.strides,
ksize);

if (exclusive) {
build_options_ += " -DEXCLUSIVE";
}
if (global_pooling) {
build_options_ += " -DGLOBAL";
ksize_.resize(static_cast<size_t>(in_dims.size()) - 2);
Expand All @@ -81,8 +77,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
run_local_work_ =
out_dims[0] * UP_DIV(out_dims[1], 4) * out_dims[2] * out_dims[3] <
low_op_parallelism_thre_ &&
ksize_[0] * ksize_[1] >= high_op_intensity_thre_;
run_local_work_ = false;
ksize_[0] * ksize_[1] >= high_op_intensity_thre_ && !adaptive;
if (run_local_work_) {
kernel_func_name_ += "_local";
}
Expand Down Expand Up @@ -154,39 +149,40 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
}

const int out_c_blks = UP_DIV(out_dims[1], 4);
uint32_t workgroup_size = 0;

int type_size =
(CLRuntime::Global()->get_precision() == lite_api::CL_PRECISION_FP16)
? sizeof(uint16_t)
: sizeof(float);
if (pooling_type == "avg") {
type_size = sizeof(float);
}
uint32_t local_mem_size =
CLRuntime::Global()->GetDeviceInfo()["CL_DEVICE_LOCAL_MEM_SIZE_KB"] *
1024;
uint32_t workgroupsize_max =
CLRuntime::Global()->GetMaxWorkGroupSize(kernel_);

uint32_t compute_intensity = ksize_[0] * ksize_[1];
run_local_work_ = out_dims[0] * out_c_blks * out_dims[2] * out_dims[3] <
low_op_parallelism_thre_ &&
compute_intensity >= high_op_intensity_thre_;
run_local_work_ = false;
compute_intensity >= high_op_intensity_thre_ &&
!adaptive;
if (run_local_work_) {
workgroup_size =
// Calculate workgroup_w_size, workgroup_h_size
int type_size = (CLRuntime::Global()->get_precision() ==
lite_api::CL_PRECISION_FP16)
? sizeof(uint16_t)
: sizeof(float);
if (pooling_type == "avg") {
type_size = sizeof(float);
}
uint32_t local_mem_size =
CLRuntime::Global()
->GetDeviceInfo()["CL_DEVICE_LOCAL_MEM_SIZE_KB"] *
1024;
uint32_t workgroupsize_max =
CLRuntime::Global()->GetMaxWorkGroupSize(kernel_);
uint32_t workgroup_size =
std::min(static_cast<uint32_t>(local_mem_size / (4 * type_size)),
workgroupsize_max);
workgroup_size =
std::min(static_cast<uint32_t>(compute_intensity), workgroup_size);

// make workgroup_size floor-round to pow(2)
uint32_t temp_size = 1;
while ((temp_size <<= 1) <= workgroup_size) {
}
workgroup_size = temp_size >> 1;

// make workgroup_w_size floor-round to pow(2)
int workgroup_w_size = 1, workgroup_h_size;
while ((workgroup_w_size <<= 1) <= ksize_[0] &&
while ((workgroup_w_size <<= 1) <= ksize_[1] &&
workgroup_w_size <= workgroup_size) {
}
workgroup_w_size >>= 1;
Expand All @@ -198,14 +194,21 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
local_work_size_ = cl::NDRange(workgroup_size, 1, 1);

cl_int2 local_block_size_shape = {workgroup_w_size, workgroup_h_size};
cl_int2 local_block_count_shape = {UP_DIV(ksize_[0], workgroup_w_size),
UP_DIV(ksize_[1], workgroup_h_size)};
cl_int2 local_block_count_shape = {UP_DIV(ksize_[1], workgroup_w_size),
UP_DIV(ksize_[0], workgroup_h_size)};

int idx = 12;
int idx = 14;
kernel_.setArg(idx++, static_cast<int>(workgroup_size));
kernel_.setArg(idx++, local_block_size_shape);
kernel_.setArg(idx++, local_block_count_shape);
kernel_.setArg(idx++, workgroup_size * 4 * type_size, nullptr);
#ifdef LITE_WITH_LOG
VLOG(4) << "workgroup_size: " << workgroup_size;
VLOG(4) << "local_block_size_shape(wh): " << local_block_size_shape.x
<< " " << local_block_size_shape.y;
VLOG(4) << "local_block_count_shape(wh): " << local_block_count_shape.x
<< " " << local_block_count_shape.y;
#endif
} else {
global_work_size_ =
cl::NDRange(out_c_blks, out_dims[3], out_dims[0] * out_dims[2]);
Expand Down Expand Up @@ -234,11 +237,10 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
CL_CHECK_FATAL(status);
status = kernel_.setArg(arg_idx++, paddings_[2]);
CL_CHECK_FATAL(status);
if (kernel_func_name_ == "pool") {
int ad = param.adaptive;
status = kernel_.setArg(arg_idx++, ad);
CL_CHECK_FATAL(status);
}
status = kernel_.setArg(arg_idx++, static_cast<int>(exclusive));
CL_CHECK_FATAL(status);
status = kernel_.setArg(arg_idx++, static_cast<int>(adaptive));
CL_CHECK_FATAL(status);

#ifdef LITE_WITH_LOG
const std::vector<int>& paddings = *param.paddings;
Expand Down