PaddlePaddle · daming5432 · Jun 16, 2021 · Apr 29, 2021 · May 10, 2021 · May 12, 2021
diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h
@@ -20,6 +20,7 @@ USE_MIR_PASS(static_kernel_pick_pass);
 USE_MIR_PASS(variable_place_inference_pass);
 USE_MIR_PASS(type_target_cast_pass);
 USE_MIR_PASS(__fpga_kernel_place_correct_pass);
+USE_MIR_PASS(opencl_kernel_place_correct_pass);
 USE_MIR_PASS(generate_program_pass);
 
 USE_MIR_PASS(io_copy_kernel_pick_pass);

diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h
@@ -21,6 +21,8 @@ limitations under the License. */
 #define MAX_VALUE FLT_MAX
 #define MIN_VALUE -FLT_MAX
 
+#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
+
 /////////////////////////////////
 // CL_DTYPE_float / CL_DTYPE_half
 /////////////////////////////////
@@ -93,7 +95,11 @@ __constant sampler_t SAMPLER =
 inline CL_DTYPE activation(CL_DTYPE in, CL_DTYPE prelu_alpha) {
   CL_DTYPE output = in;
 #ifdef PRELU
+#ifdef CL_DTYPE_half
   output = select(prelu_alpha * in, in, (ushort)(isgreaterequal(in, 0)));
+#else
+  output = select(prelu_alpha * in, in, (uint)(isgreaterequal(in, 0)));
+#endif
 #endif
 
 #ifdef RELU
@@ -105,15 +111,12 @@ inline CL_DTYPE activation(CL_DTYPE in, CL_DTYPE prelu_alpha) {
 #endif
 
 #ifdef LEAKY_RELU
-#ifdef CL_DTYPE_float
-  output = select((CL_DTYPE)(LEAKY_RELU_ALPHA)*in,
-                  in,
-                  (int)(isgreaterequal(in, 0)));  // NOLINT
-#endif
-
 #ifdef CL_DTYPE_half
   output = select(
       (CL_DTYPE)(LEAKY_RELU_ALPHA)*in, in, (ushort)(isgreaterequal(in, 0)));
+#else
+  output = select(
+      (CL_DTYPE)(LEAKY_RELU_ALPHA)*in, in, (uint)(isgreaterequal(in, 0)));
 #endif
 #endif
 
@@ -151,11 +154,6 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in, CL_DTYPE4 prelu_alpha) {
 #ifdef LEAKY_RELU
   output = select(
       (CL_DTYPE4)(LEAKY_RELU_ALPHA)*in, in, isgreaterequal(in, (CL_DTYPE4)0));
-// same as bellow:
-// output = select((CL_DTYPE4)(LEAKY_RELU_ALPHA)*in,
-//                 in,
-//                 (ushort4)((in.x >= 0) << 15, (in.y >= 0) << 15, (in.z >= 0)
-//                 << 15, (in.w >= 0) << 15));
 #endif
 
 #ifdef HARD_SWISH

diff --git a/lite/backends/opencl/cl_kernel/image/concat_kernel.cl b/lite/backends/opencl/cl_kernel/image/concat_kernel.cl
diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
@@ -79,106 +79,97 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
         int input_block = input_c / 4;
         int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
                              in_pos_in_one_block.y);
-        input0 = select(
+        input0 = SELECT(
             READ_IMG_TYPE(CL_DTYPE_CHAR,
                           input_image,
                           SAMPLER,
                           (int2)(pos_in.x - dilation, pos_in.y - dilation)),
             zero_dtype4,
-            (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+            in_pos_in_one_block.x - dilation < 0 ||
                        in_pos_in_one_block.y - dilation < 0 ||
                        in_pos_in_one_block.x - dilation >= input_width ||
-                       in_pos_in_one_block.y - dilation >= input_height)
-                      << 15));
+                       in_pos_in_one_block.y - dilation >= input_height);
         input1 =
-            select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+            SELECT(READ_IMG_TYPE(CL_DTYPE_CHAR,
                                  input_image,
                                  SAMPLER,
                                  (int2)(pos_in.x, pos_in.y - dilation)),
                    zero_dtype4,
-                   (ushort4)((in_pos_in_one_block.x < 0 ||
+                   in_pos_in_one_block.x < 0 ||
                               in_pos_in_one_block.y - dilation < 0 ||
                               in_pos_in_one_block.x >= input_width ||
-                              in_pos_in_one_block.y - dilation >= input_height)
-                             << 15));
-        input2 = select(
+                              in_pos_in_one_block.y - dilation >= input_height);
+        input2 = SELECT(
             READ_IMG_TYPE(CL_DTYPE_CHAR,
                           input_image,
                           SAMPLER,
                           (int2)(pos_in.x + dilation, pos_in.y - dilation)),
             zero_dtype4,
-            (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+            in_pos_in_one_block.x + dilation < 0 ||
                        in_pos_in_one_block.y - dilation < 0 ||
                        in_pos_in_one_block.x + dilation >= input_width ||
-                       in_pos_in_one_block.y - dilation >= input_height)
-                      << 15));
+                       in_pos_in_one_block.y - dilation >= input_height);
 
         input3 =
-            select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+            SELECT(READ_IMG_TYPE(CL_DTYPE_CHAR,
                                  input_image,
                                  SAMPLER,
                                  (int2)(pos_in.x - dilation, pos_in.y)),
                    zero_dtype4,
-                   (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+                   in_pos_in_one_block.x - dilation < 0 ||
                               in_pos_in_one_block.y < 0 ||
                               in_pos_in_one_block.x - dilation >= input_width ||
-                              in_pos_in_one_block.y >= input_height)
-                             << 15));
+                              in_pos_in_one_block.y >= input_height);
 
-        input4 = select(
+        input4 = SELECT(
             READ_IMG_TYPE(CL_DTYPE_CHAR,
                           input_image,
                           SAMPLER,
                           (int2)(pos_in.x, pos_in.y)),
             zero_dtype4,
-            (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
+            in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
                        in_pos_in_one_block.x >= input_width ||
-                       in_pos_in_one_block.y >= input_height)
-                      << 15));
+                       in_pos_in_one_block.y >= input_height);
         input5 =
-            select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+            SELECT(READ_IMG_TYPE(CL_DTYPE_CHAR,
                                  input_image,
                                  SAMPLER,
                                  (int2)(pos_in.x + dilation, pos_in.y)),
                    zero_dtype4,
-                   (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+                   in_pos_in_one_block.x + dilation < 0 ||
                               in_pos_in_one_block.y < 0 ||
                               in_pos_in_one_block.x + dilation >= input_width ||
-                              in_pos_in_one_block.y >= input_height)
-                             << 15));
-        input6 = select(
+                              in_pos_in_one_block.y >= input_height);
+        input6 = SELECT(
             READ_IMG_TYPE(CL_DTYPE_CHAR,
                           input_image,
                           SAMPLER,
                           (int2)(pos_in.x - dilation, pos_in.y + dilation)),
             zero_dtype4,
-            (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+            in_pos_in_one_block.x - dilation < 0 ||
                        in_pos_in_one_block.y + dilation < 0 ||
                        in_pos_in_one_block.x - dilation >= input_width ||
-                       in_pos_in_one_block.y + dilation >= input_height)
-                      << 15));
+                       in_pos_in_one_block.y + dilation >= input_height);
         input7 =
-            select(READ_IMG_TYPE(CL_DTYPE_CHAR,
+            SELECT(READ_IMG_TYPE(CL_DTYPE_CHAR,
                                  input_image,
                                  SAMPLER,
                                  (int2)(pos_in.x, pos_in.y + dilation)),
                    zero_dtype4,
-                   (ushort4)((in_pos_in_one_block.x < 0 ||
+                   in_pos_in_one_block.x < 0 ||
                               in_pos_in_one_block.y + dilation < 0 ||
                               in_pos_in_one_block.x >= input_width ||
-                              in_pos_in_one_block.y + dilation >= input_height)
-                             << 15));
-        input8 = select(
+                              in_pos_in_one_block.y + dilation >= input_height);
+        input8 = SELECT(
             READ_IMG_TYPE(CL_DTYPE_CHAR,
                           input_image,
                           SAMPLER,
                           (int2)(pos_in.x + dilation, pos_in.y + dilation)),
             zero_dtype4,
-            (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+            in_pos_in_one_block.x + dilation < 0 ||
                        in_pos_in_one_block.y + dilation < 0 ||
                        in_pos_in_one_block.x + dilation >= input_width ||
-                       in_pos_in_one_block.y + dilation >= input_height)
-                      << 15));
+                       in_pos_in_one_block.y + dilation >= input_height);
 
         CL_DTYPE tmp_out = 0;
         for (int j = 0; j < 9; j++) {

diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl
@@ -86,19 +86,17 @@ __kernel void conv2d_5x5(__private const int global_size_dim0,
                          in_pos_in_one_block.y + batch_index * input_height);
     for (int j = 0; j < 5; j++) {
       for (int k = 0; k < 5; k++) {
-        input = select(
+        input = SELECT(
             READ_IMG_TYPE(CL_DTYPE_CHAR,
                           input_image,
                           SAMPLER,
                           (int2)(pos_in.x + (j - 2) * dilation,
                                  pos_in.y + (k - 2) * dilation)),
             (CL_DTYPE4)(0.0f),
-            (ushort4)(
-                (in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
+                 in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
                  in_pos_in_one_block.y + (k - 2) * dilation < 0 ||
                  in_pos_in_one_block.x + (j - 2) * dilation >= input_width ||
-                 in_pos_in_one_block.y + (k - 2) * dilation >= input_height)
-                << 15));
+                 in_pos_in_one_block.y + (k - 2) * dilation >= input_height);
         int filter_h = k;
         int filter_w = j;
         int filter_c = i;

diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
@@ -72,19 +72,17 @@ __kernel void conv2d_7x7(__private const int global_size_dim0,
                          in_pos_in_one_block.y + batch_index * input_height);
     for (int j = 0; j < 7; j++) {
       for (int k = 0; k < 7; k++) {
-        input = select(
+        input = SELECT(
             READ_IMG_TYPE(CL_DTYPE_CHAR,
                           input_image,
                           SAMPLER,
                           (int2)(pos_in.x + (j - 3) * dilation,
                                  pos_in.y + (k - 3) * dilation)),
             (CL_DTYPE4)(0.0f),
-            (ushort4)(
-                (in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
+                 in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
                  in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
                  in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
-                 in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
-                << 15));
+                 in_pos_in_one_block.y + (k - 3) * dilation >= input_height);
         int filter_h = k;
         int filter_w = j;
         int filter_c = i;

diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt
@@ -73,6 +73,7 @@ lite_cc_library(mir_passes
       static_kernel_pick_pass.cc
       variable_place_inference_pass.cc
       fpga_kernel_place_correct_pass.cc
+      opencl_kernel_place_correct_pass.cc
       type_target_cast_pass.cc
       type_layout_cast_pass.cc
       type_precision_cast_pass.cc

diff --git a/lite/core/mir/fusion/conv_conv_fuse_pass.cc b/lite/core/mir/fusion/conv_conv_fuse_pass.cc
@@ -31,7 +31,8 @@ void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   bool has_int8 = false;
   bool has_weight_quant = false;
   for (auto& place : graph->valid_places()) {
-    if (place.target == TARGET(kARM) || place.target == TARGET(kHost)) {
+    if (place.target == TARGET(kARM) || place.target == TARGET(kHost) ||
+        place.target == TARGET(kOpenCL)) {
       if (place.precision == PRECISION(kInt8)) {
         has_int8 = true;
       }
@@ -77,4 +78,4 @@ void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 }  // namespace paddle
 
 REGISTER_MIR_PASS(lite_conv_conv_fuse_pass, paddle::lite::mir::ConvConvFusePass)
-    .BindTargets({TARGET(kARM)});
+    .BindTargets({TARGET(kARM), TARGET(kOpenCL)});
diff --git a/lite/core/mir/opencl_kernel_place_correct_pass.cc b/lite/core/mir/opencl_kernel_place_correct_pass.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/opencl_kernel_place_correct_pass.h"
+#include <memory>
+#include "lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void OpenCLKernelPlaceCorrectPass::Apply(
+    const std::unique_ptr<SSAGraph> &graph) {
+  CorrectArgumentPlace(graph.get());
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(opencl_kernel_place_correct_pass,
+                  paddle::lite::mir::OpenCLKernelPlaceCorrectPass)
+    .BindTargets({TARGET(kOpenCL)});