Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[cherry-pick][OpenCL] collection of cherry-picks #6266

Merged
merged 10 commits into from
Jun 16, 2021
1 change: 1 addition & 0 deletions lite/api/paddle_use_passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ USE_MIR_PASS(static_kernel_pick_pass);
USE_MIR_PASS(variable_place_inference_pass);
USE_MIR_PASS(type_target_cast_pass);
USE_MIR_PASS(__fpga_kernel_place_correct_pass);
USE_MIR_PASS(opencl_kernel_place_correct_pass);
USE_MIR_PASS(generate_program_pass);

USE_MIR_PASS(io_copy_kernel_pick_pass);
Expand Down
20 changes: 9 additions & 11 deletions lite/backends/opencl/cl_kernel/cl_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ limitations under the License. */
#define MAX_VALUE FLT_MAX
#define MIN_VALUE -FLT_MAX

#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))

/////////////////////////////////
// CL_DTYPE_float / CL_DTYPE_half
/////////////////////////////////
Expand Down Expand Up @@ -93,7 +95,11 @@ __constant sampler_t SAMPLER =
inline CL_DTYPE activation(CL_DTYPE in, CL_DTYPE prelu_alpha) {
CL_DTYPE output = in;
#ifdef PRELU
#ifdef CL_DTYPE_half
output = select(prelu_alpha * in, in, (ushort)(isgreaterequal(in, 0)));
#else
output = select(prelu_alpha * in, in, (uint)(isgreaterequal(in, 0)));
#endif
#endif

#ifdef RELU
Expand All @@ -105,15 +111,12 @@ inline CL_DTYPE activation(CL_DTYPE in, CL_DTYPE prelu_alpha) {
#endif

#ifdef LEAKY_RELU
#ifdef CL_DTYPE_float
output = select((CL_DTYPE)(LEAKY_RELU_ALPHA)*in,
in,
(int)(isgreaterequal(in, 0))); // NOLINT
#endif

#ifdef CL_DTYPE_half
output = select(
(CL_DTYPE)(LEAKY_RELU_ALPHA)*in, in, (ushort)(isgreaterequal(in, 0)));
#else
output = select(
(CL_DTYPE)(LEAKY_RELU_ALPHA)*in, in, (uint)(isgreaterequal(in, 0)));
#endif
#endif

Expand Down Expand Up @@ -151,11 +154,6 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in, CL_DTYPE4 prelu_alpha) {
#ifdef LEAKY_RELU
output = select(
(CL_DTYPE4)(LEAKY_RELU_ALPHA)*in, in, isgreaterequal(in, (CL_DTYPE4)0));
// same as bellow:
// output = select((CL_DTYPE4)(LEAKY_RELU_ALPHA)*in,
// in,
// (ushort4)((in.x >= 0) << 15, (in.y >= 0) << 15, (in.z >= 0)
// << 15, (in.w >= 0) << 15));
#endif

#ifdef HARD_SWISH
Expand Down
362 changes: 311 additions & 51 deletions lite/backends/opencl/cl_kernel/image/concat_kernel.cl

Large diffs are not rendered by default.

63 changes: 27 additions & 36 deletions lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -79,106 +79,97 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
int input_block = input_c / 4;
int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y);
input0 = select(
input0 = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
in_pos_in_one_block.y - dilation >= input_height);
input1 =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
SELECT(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x, pos_in.y - dilation)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input2 = select(
in_pos_in_one_block.y - dilation >= input_height);
input2 = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
in_pos_in_one_block.y - dilation >= input_height);

input3 =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
SELECT(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x - dilation, pos_in.y)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
in_pos_in_one_block.y >= input_height);

input4 = select(
input4 = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x, pos_in.y)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
in_pos_in_one_block.y >= input_height);
input5 =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
SELECT(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x + dilation, pos_in.y)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input6 = select(
in_pos_in_one_block.y >= input_height);
input6 = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
in_pos_in_one_block.y + dilation >= input_height);
input7 =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
SELECT(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x, pos_in.y + dilation)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input8 = select(
in_pos_in_one_block.y + dilation >= input_height);
input8 = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
in_pos_in_one_block.y + dilation >= input_height);

CL_DTYPE tmp_out = 0;
for (int j = 0; j < 9; j++) {
Expand Down
8 changes: 3 additions & 5 deletions lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -86,19 +86,17 @@ __kernel void conv2d_5x5(__private const int global_size_dim0,
in_pos_in_one_block.y + batch_index * input_height);
for (int j = 0; j < 5; j++) {
for (int k = 0; k < 5; k++) {
input = select(
input = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x + (j - 2) * dilation,
pos_in.y + (k - 2) * dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
in_pos_in_one_block.y + (k - 2) * dilation < 0 ||
in_pos_in_one_block.x + (j - 2) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 2) * dilation >= input_height)
<< 15));
in_pos_in_one_block.y + (k - 2) * dilation >= input_height);
int filter_h = k;
int filter_w = j;
int filter_c = i;
Expand Down
8 changes: 3 additions & 5 deletions lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -72,19 +72,17 @@ __kernel void conv2d_7x7(__private const int global_size_dim0,
in_pos_in_one_block.y + batch_index * input_height);
for (int j = 0; j < 7; j++) {
for (int k = 0; k < 7; k++) {
input = select(
input = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x + (j - 3) * dilation,
pos_in.y + (k - 3) * dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
<< 15));
in_pos_in_one_block.y + (k - 3) * dilation >= input_height);
int filter_h = k;
int filter_w = j;
int filter_c = i;
Expand Down
1 change: 1 addition & 0 deletions lite/core/mir/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ lite_cc_library(mir_passes
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
fpga_kernel_place_correct_pass.cc
opencl_kernel_place_correct_pass.cc
type_target_cast_pass.cc
type_layout_cast_pass.cc
type_precision_cast_pass.cc
Expand Down
5 changes: 3 additions & 2 deletions lite/core/mir/fusion/conv_conv_fuse_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
bool has_int8 = false;
bool has_weight_quant = false;
for (auto& place : graph->valid_places()) {
if (place.target == TARGET(kARM) || place.target == TARGET(kHost)) {
if (place.target == TARGET(kARM) || place.target == TARGET(kHost) ||
place.target == TARGET(kOpenCL)) {
if (place.precision == PRECISION(kInt8)) {
has_int8 = true;
}
Expand Down Expand Up @@ -77,4 +78,4 @@ void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
} // namespace paddle

REGISTER_MIR_PASS(lite_conv_conv_fuse_pass, paddle::lite::mir::ConvConvFusePass)
.BindTargets({TARGET(kARM)});
.BindTargets({TARGET(kARM), TARGET(kOpenCL)});
34 changes: 34 additions & 0 deletions lite/core/mir/opencl_kernel_place_correct_pass.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/core/mir/opencl_kernel_place_correct_pass.h"
#include <memory>
#include "lite/core/mir/pass_registry.h"

namespace paddle {
namespace lite {
namespace mir {

void OpenCLKernelPlaceCorrectPass::Apply(
const std::unique_ptr<SSAGraph> &graph) {
CorrectArgumentPlace(graph.get());
}

} // namespace mir
} // namespace lite
} // namespace paddle

REGISTER_MIR_PASS(opencl_kernel_place_correct_pass,
paddle::lite::mir::OpenCLKernelPlaceCorrectPass)
.BindTargets({TARGET(kOpenCL)});
Loading