[OPENCL] Fix opencl fc int16 model bug caused by fc kernel (PaddlePaddle#3900)

* fix opencl fc kernel that caused abnormal int16 model weights. test=develop
ysh329 committed Jul 9, 2020
1 parent 6b9b9b7 commit ad4b1ed
Showing 4 changed files with 42 additions and 17 deletions.
lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl (3 additions, 1 deletion)
@@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a,
} else {
for (int cidx = col; cidx < N; ++cidx) {
for (int ridx = row; ridx < M; ++ridx) {
CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0;
CL_COMPUTE_DTYPE a0 = 0;
CL_COMPUTE_DTYPE b0 = 0;
CL_COMPUTE_DTYPE c0 = bias ? bias[cidx] : 0;
for (int p = 0; p < K; ++p) {
a0 = *(a + ridx * K + p);
b0 = *(b + p * N + cidx),
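The change above replaces a combined declaration with three separate, explicitly initialized ones. In C, C++, and OpenCL C alike, an initializer in a multi-declarator statement applies only to the declarator it is attached to, so in the old line only c0 received a value while a0 and b0 started out indeterminate. A minimal stand-alone C++ illustration of the difference (float stands in for CL_COMPUTE_DTYPE; this is not the kernel code itself):

    #include <cstdio>

    int main() {
      const float bias = 2.0f;
      // Old form: the initializer binds only to c0; a0 and b0 are indeterminate.
      float a0, b0, c0 = bias;
      (void)a0;  // never read before being assigned
      (void)b0;
      // New form (what the fix does): every variable gets an explicit start value.
      float a1 = 0.0f;
      float b1 = 0.0f;
      float c1 = bias;
      std::printf("c0=%f  a1=%f b1=%f c1=%f\n", c0, a1, b1, c1);
      return 0;
    }
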
lite/kernels/opencl/conv_image_compute.cc (1 addition, 1 deletion)
@@ -26,7 +26,7 @@ namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
/* image kernel*/

void ConvImageCompute::PrepareForRun() {
const auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
lite/kernels/opencl/fc_buffer_compute.cc (28 additions, 7 deletions)
@@ -31,10 +31,27 @@ class FcCompute
public:
using param_t = operators::FcParam;

void PrepareForRun() override {}
void PrepareForRun() override {
fc_param_ = param_.get_mutable<param_t>();
auto w_t = fc_param_->w;
auto bias_t = fc_param_->bias;

w_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto w_gpu_data =
w_gpu_t_->mutable_data(TARGET(kOpenCL), w_t->memory_size());
TargetWrapperCL::MemcpySync(
w_gpu_data, w_t->raw_data(), w_t->memory_size(), IoDirection::HtoD);

bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
auto b_gpu_data =
bias_gpu_t_->mutable_data(TARGET(kOpenCL), bias_t->memory_size());
TargetWrapperCL::MemcpySync(b_gpu_data,
bias_t->raw_data(),
bias_t->memory_size(),
IoDirection::HtoD);
}

void ReInitWhenNeeded() override {
fc_param_ = param_.get_mutable<param_t>();
const auto x_dims = fc_param_->input->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
@@ -89,7 +106,7 @@ class FcCompute
}

void GetGlobalWorkSize() {
if (m_ == 1) { // gemv
if (kernel_func_name_ == "fc_gemv_1x4") { // gemv
global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
} else { // gemm
global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
@@ -99,8 +116,8 @@

void Run() override {
auto* x_buf = fc_param_->input->data<float, cl::Buffer>();
auto* w_buf = fc_param_->w->data<float, cl::Buffer>();
auto* bias_buf = fc_param_->bias->data<float, cl::Buffer>();
auto* w_buf = w_gpu_t_->data<float, cl::Buffer>();
auto* bias_buf = bias_gpu_t_->data<float, cl::Buffer>();
auto* out_buf =
fc_param_->output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

@@ -142,6 +159,10 @@ class FcCompute
std::string time_stamp_{GetTimeStamp()};
bool first_epoch_for_reinit_{true};
DDim last_x_dims_;

std::unique_ptr<Tensor> w_gpu_t_{nullptr};
std::unique_ptr<Tensor> bias_gpu_t_{nullptr};

cl::NDRange global_work_size_;
cl::Kernel kernel_;
};
@@ -154,7 +175,7 @@ class FcCompute
REGISTER_LITE_KERNEL(
fc, kOpenCL, kFloat, kNCHW, paddle::lite::kernels::opencl::FcCompute, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("W", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.Finalize();
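
Taken together, the changes to this file move W and Bias to host memory at registration time (the kARM bindings above), have PrepareForRun upload a private device copy of each exactly once (w_gpu_t_ and bias_gpu_t_), and make Run bind those persistent copies instead of the parameter tensors. For readers less familiar with the pattern, a rough sketch of the same upload-once idea in plain OpenCL, independent of the PaddleLite Tensor / TargetWrapperCL API (the helper name UploadReadOnlyBuffer is illustrative, and error handling is omitted):

    #include <CL/cl.h>
    #include <vector>

    // Copy host-resident weights into a device buffer once at setup time.
    // The returned cl_mem is then reused by every subsequent kernel launch,
    // which is the role w_gpu_t_ / bias_gpu_t_ play above.
    cl_mem UploadReadOnlyBuffer(cl_context ctx, cl_command_queue queue,
                                const std::vector<float>& host_data) {
      cl_int err = CL_SUCCESS;
      cl_mem dev_buf = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
                                      host_data.size() * sizeof(float),
                                      nullptr, &err);
      // A blocking write plays the part of TargetWrapperCL::MemcpySync(..., HtoD).
      clEnqueueWriteBuffer(queue, dev_buf, CL_TRUE, /*offset=*/0,
                           host_data.size() * sizeof(float), host_data.data(),
                           0, nullptr, nullptr);
      return dev_buf;
    }
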
lite/kernels/opencl/fc_buffer_compute_test.cc (10 additions, 8 deletions)
@@ -126,9 +126,11 @@ TEST(fc, compute) {
out.Resize(out_dim);
out_ref.Resize(out_dim);

VLOG(2) << "out.dims():" << out.dims() << ", out_dim:" << out_dim;

auto* x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto* w_data = w.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto* bias_data = bias.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto* w_data = w.mutable_data<float>();
auto* bias_data = bias.mutable_data<float>();
auto* out_data = out.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

std::default_random_engine engine;
@@ -148,17 +150,15 @@
}
for (size_t i = 0; i < w_dim.production(); ++i) {
w_source[i] = static_cast<int>(dist(engine));
w_data[i] = w_source[i];
}
for (size_t i = 0; i < bias_dim.production(); ++i) {
bias_source[i] = 10; // static_cast<int>(dist(engine));
bias_data[i] = 10;
}

TargetWrapperCL::MemcpySync(
x_data, x_source.data(), x_size, IoDirection::HtoD);
TargetWrapperCL::MemcpySync(
w_data, w_source.data(), w_size, IoDirection::HtoD);
TargetWrapperCL::MemcpySync(
bias_data, bias_source.data(), bias_size, IoDirection::HtoD);

// run opencl kernel
kernel->Launch();
@@ -186,8 +186,10 @@ TEST(fc, compute) {
#endif

std::vector<float> out_data_from_gpu(out_dim.production());
TargetWrapperCL::MemcpySync(
out_data_from_gpu.data(), out_data, bias_size, IoDirection::DtoH);
TargetWrapperCL::MemcpySync(out_data_from_gpu.data(),
out_data,
out_data_from_gpu.size() * sizeof(float),
IoDirection::DtoH);

// run cpu ref
auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
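
A small point worth noting in the test changes above: the weight and bias tensors are now filled directly on the host (plain mutable_data<float>()) to match the kernel's new kARM bindings, and the device-to-host copy of the result is sized from the output itself rather than from bias_size. The sizing rule is simply element count times element size; a tiny stand-alone C++ sketch with illustrative shape values (not the test code itself):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
      // Illustrative output shape (m, n); the real test derives it from the
      // x and w dimensions.
      const std::size_t m = 1, n = 1000;
      std::vector<float> out_from_gpu(m * n);

      // Bytes for the device-to-host copy: destination element count times
      // element size. Sizing it from another tensor (e.g. the bias) only works
      // when the shapes happen to coincide.
      const std::size_t copy_bytes = out_from_gpu.size() * sizeof(float);
      std::printf("copy %zu bytes for a %zu x %zu float output\n",
                  copy_bytes, m, n);
      return 0;
    }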
