diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index e268b1dc54f..e3eb8aea3aa 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -107,6 +107,7 @@ USE_MIR_PASS(__xpu__bigru_fuse_pass); USE_MIR_PASS(__xpu__dynamic_lstm_fuse_pass); USE_MIR_PASS(__xpu__multi_softmax_fuse_pass); USE_MIR_PASS(__xpu__max_pooling_pad_zero_detect_fuse_pass); +USE_MIR_PASS(__xpu__static_kernel_pick_pass); USE_MIR_PASS(x86_int8_attribute_pass); USE_MIR_PASS(fill_range_fuse_pass); USE_MIR_PASS(range_calc_offline_pass); diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc new file mode 100644 index 00000000000..038e7e22678 --- /dev/null +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc @@ -0,0 +1,736 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h" +#include +#include +#include +#include +#include +#include +#include +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif +#include "lite/core/optimizer/mir/graph_visualize_pass.h" +#include "lite/core/optimizer/mir/pass_registry.h" +namespace paddle { +namespace lite { +namespace mir { + +bool XPUKernelScoreCmp(const std::pair>& a, + const std::pair>& b) { + return a.first > b.first; +} + +void XPUStaticKernelPickPass::Apply(const std::unique_ptr& graph) { + kernel_pick_factors_.ConsiderTarget(); + kernel_pick_factors_.ConsiderPrecision(); + kernel_pick_factors_.ConsiderDataLayout(); + CHECK(kernel_pick_factors_.any_factor_considered()) + << "kernel_pick_factors should be specified first"; + CHECK(graph) << "graph not valid"; + +// Collect input data precision for each node in the graph +#ifdef LITE_WITH_XPU + DicideUseFP16Optimizer(graph); + if (xpu_use_fp16_optimizer_) { + GetXPUDeviceType(); + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + if (xpu_special_op_.count(node->AsStmt().op_type())) { + SpecialNodeInputPrecision(node); + continue; + } + + if (xpu_inplace_op_.count(node->AsStmt().op_type())) { + continue; + } + + NodeInputPrecision(node, graph); + } + + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + if (xpu_inplace_op_.count(node->AsStmt().op_type()) == 0) { + continue; + } + + InplaceNodeInputPrecision(node); + } + } +#endif + + // sort kernels by the factors. 
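+  // For every stmt node below: score each candidate kernel with KernelGrade(),
+  // sort the candidates by descending score, record the picked output
+  // precision when the FP16 optimizer is enabled, and keep only the
+  // top-scoring kernel. Ops with the enable_int8 attribute are handled below.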
+ VLOG(2) << "graph block_idx: " << graph->blockIdx(); + VLOG(2) << "graph->mutable_nodes().size(): " << graph->mutable_nodes().size(); + size_t idx = 0; + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + auto& instruct = node->AsStmt(); + VLOG(2) << "pick kernel for op : " << instruct.op_type() << ", in block " + << graph->blockIdx() << ", idx : " << idx++; + + std::map in_types; + std::map out_types; + // threse precision info store in __model__ file, if selected fp16 kernel, + // the output precision should be changed + for (std::list::iterator i = node->inlinks.begin(); + i != node->inlinks.end(); + ++i) { + if ((*i)->arg()->type) + in_types[(*i)->arg()->name] = (*i)->arg()->type->precision(); + } + for (std::list::iterator i = node->outlinks.begin(); + i != node->outlinks.end(); + ++i) { + if ((*i)->arg()->type) + out_types[(*i)->arg()->name] = (*i)->arg()->type->precision(); + } + // Get candidate kernels + std::vector>> scored; + CHECK(!instruct.kernels().empty()) << "No kernels found for " + << instruct.op_type(); + + VLOG(2) << "candidate kernels size:" << instruct.kernels().size(); + + for (auto&& kernel : instruct.kernels()) { + VLOG(2) << "current candidate kernel is: " << kernel->summary(); + VLOG(2) << "valid_places size is: " << graph->valid_places().size(); + + float score = KernelGrade(node, + *kernel, + graph->valid_places(), + in_types, + out_types, + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); + + scored.emplace_back(score, std::move(kernel)); + } + std::stable_sort(scored.begin(), scored.end(), XPUKernelScoreCmp); + instruct.kernels().clear(); + + if (!instruct.op_info()->HasAttr("enable_int8")) { +#ifdef LITE_WITH_XPU + if (xpu_use_fp16_optimizer_) { + if (xpu_special_op_.count(node->AsStmt().op_type())) { + SpecialNodeOutputPrecision(graph, node, scored.front().second); + } else if (xpu_inplace_op_.count(node->AsStmt().op_type())) { + InplaceNodeOutputPrecision(node->AsStmt(), + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); + } else { + NodeOutputPrecision(graph, node); + } + } +#endif + + instruct.kernels().emplace_back(std::move(scored.front().second)); + VLOG(2) << "the final pick kernel is " + << instruct.kernels().front()->summary() << "\n\n"; + } else { + // TODO(quwei): consider XPU int8 data precision + bool out_type_int8 = true; + // Quantized lstm has fp32 output + if (instruct.op_type() == "lstm" || instruct.op_type() == "gru" || + instruct.op_type() == "__xpu__multi_encoder" || + instruct.op_type() == "__xpu__fc") { + out_type_int8 = false; + } + // Only if all ops linked to this op output has enable_int8 attr, + // then the op output type is int8, or fp32. + // Note, the quantized op linked to lstm and gru should output fp32 + // tensor. + for (auto* out_n : node->outlinks) { + CHECK(out_n->IsArg()); + for (auto* tmp_op : out_n->outlinks) { + CHECK(tmp_op->IsStmt()); + auto* tmp_op_info = tmp_op->AsStmt().op_info(); + if (!tmp_op_info->HasAttr("enable_int8") || + tmp_op_info->Type() == "lstm" || tmp_op_info->Type() == "gru" || + instruct.op_type() == "__xpu__multi_encoder" || + instruct.op_type() == "__xpu__fc") { + out_type_int8 = false; + break; + } + } + if (!out_type_int8) break; + } + // If the out_type_int8 is true, it turns out that the output type of + // this + // op can be int8. + // So we need to specify output scale for this op. 
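+      // The output scale is copied from the input scale that the adjacent
+      // consumer op declares for the same variable; the op is then reset with
+      // the updated desc and its candidate kernels are re-scored.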
+ if (out_type_int8) { + auto out_node = node->outlinks.front(); + CHECK(out_node->IsArg()); + auto out_node_name = out_node->arg()->name; + auto one_adj_op_node = out_node->outlinks.front(); + CHECK(one_adj_op_node->IsStmt()); + auto& one_adj_instruct = one_adj_op_node->AsStmt(); + CHECK(one_adj_instruct.op_info()->HasAttr("enable_int8")); + CHECK(one_adj_instruct.op_info()->HasInputScale(out_node_name)); + + instruct.mutable_op_info()->SetOutputScale( + out_node_name, + one_adj_instruct.op_info()->GetInputScale(out_node_name)); + + auto update_desc = *instruct.mutable_op_info(); + instruct.ResetOp(update_desc, graph->valid_places()); + scored.clear(); + for (auto&& kernel : instruct.kernels()) { + float score = KernelGrade(node, + *kernel, + graph->valid_places(), + in_types, + out_types, + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); + scored.emplace_back(score, std::move(kernel)); + } + std::stable_sort(scored.begin(), scored.end(), XPUKernelScoreCmp); + instruct.kernels().clear(); + } + // If the out_type_int8 is true, we should pick the kernel with the + // int8 input and int8 output. + // If the out_type_int8 is false, we should pick the kernel with the + // int8 input and fp32 output. + auto output_arguments = instruct.op_info()->OutputArgumentNames(); + for (auto& candidate : scored) { + bool all_output_type_match = true; + auto expect_output_type = + out_type_int8 ? PRECISION(kInt8) : PRECISION(kFloat); + + for (auto& arg_name : output_arguments) { + const Type* out_arg_ty = + candidate.second->GetOutputDeclType(arg_name); + if (out_arg_ty->precision() != expect_output_type) { + all_output_type_match = false; + } + } + + if (all_output_type_match) { + instruct.kernels().emplace_back(std::move(candidate.second)); + VLOG(2) << "instruct.kernels.emplace_back " + << instruct.kernels().front()->name(); + break; + } + } + CHECK(!instruct.kernels().empty()) << "No kernels found for " + << instruct.op_type(); + } + } +} + +#ifdef LITE_WITH_XPU +void XPUStaticKernelPickPass::DicideUseFP16Optimizer( + const std::unique_ptr& graph) { + if (graph->valid_places()[0].precision == PrecisionType::kFP16) { + xpu_use_fp16_optimizer_ = true; + VLOG(2) << "XPU auto use data precision: FP16/FP32/INT16 "; + } +} + +void XPUStaticKernelPickPass::ForceUseFP32Kernel( + size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct) { + if (kernel.place().target != TARGET(kXPU)) { + return; + } + + // only use in FC,it will not use in future. 
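+  // When int31 is requested via XPU_ENCODER_PRECISION / XPU_COMPUTE_PRECISION
+  // (or multi_encoder_precision), the "XPU_Real_kFloat" kernel gets its score
+  // doubled; otherwise such kernels are scored 0 so that the default int16
+  // implementations are picked.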
+ if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || + lite::TargetWrapperXPU::multi_encoder_precision == "int31") { + if (kernel.alias() == "XPU_Real_kFloat" && + instruct.op_type() == "__xpu__fc") { + *score *= 2; + VLOG(6) << "__xpu__fc: force use PRECISON INT31: *2"; + } + return; + } + + if (GetStringFromEnv("XPU_COMPUTE_PRECISION", "int16") == "int31") { + if (kernel.alias() == "XPU_Real_kFloat" && + PRECISION_INT31_OP_.count(instruct.op_type())) { + *score *= 2; + VLOG(6) << instruct.op_type() << ": force use PRECISON INT31: *2"; + } + return; + } + + if (kernel.alias() == "XPU_Real_kFloat") { + *score = 0; + VLOG(6) << "By default,XPU not use PRECISION INT31, so not pick " + "current kernel: " + << kernel.summary(); + } +} + +void XPUStaticKernelPickPass::ForceUseInt8Kernel( + size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct) { + if (kernel.place().target != TARGET(kXPU)) { + return; + } + + // only use in FC,it will not use in future. + if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int8" || + lite::TargetWrapperXPU::multi_encoder_precision == "int8") { + if (kernel.alias() == "XPU_Int8_FP32_FP32" && + instruct.op_type() == "__xpu__fc") { + *score *= 2; + VLOG(6) << "__xpu__fc: force use PRECISON INT8: *2"; + } + return; + } + + if (GetStringFromEnv("XPU_COMPUTE_PRECISION", "int16") == "int8") { + if (kernel.alias() == "XPU_Int8_FP32_FP32" && + PRECISION_INT8_OP_.count(instruct.op_type())) { + *score *= 2; + VLOG(6) << instruct.op_type() << ": force use PRECISON INT8: *2"; + } + return; + } + + if (kernel.alias() == "XPU_Int8_FP32_FP32") { + *score = 0; + VLOG(6) << "By default,XPU not use PRECISION INT8, so not pick " + "current kernel: " + << kernel.summary(); + } +} + +void XPUStaticKernelPickPass::GetScore(PrecisionType precision, + size_t* score_tmp) { + if (precision == PrecisionType::kInt16) { + *score_tmp = *score_tmp > 9 ? *score_tmp : 9; + } else if (precision == PrecisionType::kFP16) { + *score_tmp = *score_tmp > 7 ? *score_tmp : 7; + } else if (precision == PrecisionType::kAny) { + *score_tmp = *score_tmp > 1 ? *score_tmp : 1; + } else { + *score_tmp = *score_tmp > 5 ? 
*score_tmp : 5; + } +} + +void XPUStaticKernelPickPass::NodeOutputPrecision( + const std::unique_ptr& graph, lite::mir::Node* node) { + auto& inst = node->AsStmt(); + if (inst.op_type() == "fetch") { + return; + } + + const auto* op_info = inst.op_info(); + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument,current var name : " << var_name; + VLOG(6) << " output arg name:" << arg_name << " var name:" << var_name; + Scope* scope = node->AsStmt().op()->scope(); + auto* var_ptr = scope->FindVar(var_name); + if (var_ptr == nullptr) { + VLOG(6) << "Can't find ouput var_name: " << var_name + << "in current scope."; + continue; + } + + PrecisionType precison = var_ptr->GetMutable()->precision(); + xpu_output_type_.emplace(var_name, precison); + } +} + +void XPUStaticKernelPickPass::SpecialNodeOutputPrecision( + const std::unique_ptr& graph, + lite::mir::Node* node, + const std::unique_ptr& kernel) { + auto& inst = node->AsStmt(); + + std::vector out_var_names; + const auto* op_info = inst.op_info(); + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument, current var name : " << var_name; + VLOG(6) << " output arg name:" << arg_name << " var name:" << var_name; + if (output_parameter_name_.count(arg_name) == 0) { + continue; + } + + const auto* decl_type = kernel->GetOutputDeclType(arg_name); + CHECK(decl_type); + PrecisionType precison = decl_type->precision(); + xpu_output_type_.emplace(var_name, precison); + } +} + +void XPUStaticKernelPickPass::InplaceNodeOutputPrecision( + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names) { + PrecisionType pre_op_output_precision = PrecisionType::kUnk; + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + VLOG(6) << "current kernel input data variable name:" << in_names[i] + << "Parameter name:" << tmp; + if (input_parameter_name_.count(tmp) && + xpu_output_type_.count(in_names[i])) { + pre_op_output_precision = xpu_output_type_[in_names[i]]; + } + } + + // collect inplace op output data precision + if (pre_op_output_precision != PrecisionType::kUnk) { + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + if (output_parameter_name_.count(tmp)) { + xpu_output_type_.emplace(out_names[i], pre_op_output_precision); + } + } + } +} + +// Special nodes like conv2d, matmul ; collect input data precision for eatch +// registry kernel as a candidate set. 
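+// For each input argument listed in input_parameter_name_, the declared input
+// precision of every registered candidate kernel is recorded (keyed by the
+// kernel summary) into xpu_input_type_, skipping kernels that are disabled
+// for the current XPU generation.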
+void XPUStaticKernelPickPass::SpecialNodeInputPrecision(lite::mir::Node* node) { + auto& inst = node->AsStmt(); + const auto* op_info = inst.op_info(); + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument,current var name : " << var_name; + VLOG(6) << " input arg name:" << arg_name << " var name:" << var_name; + if (input_parameter_name_.count(arg_name) == 0) { + continue; + } + + std::vector> kernel_input_type{}; + for (auto&& kernel : inst.kernels()) { + if (kernel->summary().find(xpu_disable_flag_) != std::string::npos) { + VLOG(6) << " ignore collect current kernel:" << kernel->summary(); + continue; + } + + std::map tmp_map; + PrecisionType precison; + + const auto* decl_type = kernel->GetInputDeclType(arg_name); + CHECK(decl_type); + precison = decl_type->precision(); + tmp_map.emplace(kernel->summary(), precison); + kernel_input_type.emplace_back(std::move(tmp_map)); + } + + xpu_input_type_.emplace(var_name, kernel_input_type); + } +} + +void XPUStaticKernelPickPass::NodeInputPrecision( + lite::mir::Node* node, const std::unique_ptr& graph) { + auto& inst = node->AsStmt(); + if (inst.op_type() == "feed") { + return; + } + + const auto* op_info = inst.op_info(); + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument,current var name : " << var_name; + VLOG(6) << " input arg name:" << arg_name << " var name:" << var_name; + + std::vector> kernel_input_type{}; + std::map tmp_map; + PrecisionType precison; + Scope* scope = node->AsStmt().op()->scope(); + + auto* var_ptr = scope->FindVar(var_name); + if (var_ptr == nullptr) { + VLOG(6) << "Can't find input var_name: " << var_name + << "in current scope."; + continue; + } + + precison = var_ptr->GetMutable()->precision(); + tmp_map.emplace(inst.op_type(), precison); + kernel_input_type.emplace_back(std::move(tmp_map)); + xpu_input_type_.emplace(var_name, kernel_input_type); + } +} + +// Special for inplace op. +void XPUStaticKernelPickPass::InplaceNodeInputPrecision(lite::mir::Node* node) { + auto& inst = node->AsStmt(); + const auto* op_info = inst.op_info(); + // inplace op only has one inpute variable. 
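+  // The precisions expected by the consumers of the inplace output are copied
+  // onto the inplace op's single input variable in xpu_input_type_, so the
+  // producer of that input is scored against the real downstream requirement.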
+ std::string inplace_op_input_name{"none"}; + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument,current var name : " << var_name; + VLOG(6) << " input arg name:" << arg_name << " var name:" << var_name; + if (input_parameter_name_.count(arg_name)) { + inplace_op_input_name = var_name; + } + } + + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + int num = 0; + + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument,current var name : " << var_name; + VLOG(6) << " output arg name:" << arg_name << " var name:" << var_name; + // inplace op only have one output variable,but ic can connect input + // variables of multiple Ops + int output_match_num = xpu_input_type_.count(var_name); + if (output_parameter_name_.count(arg_name) == 0 || output_match_num == 0) { + continue; + } + + for (auto iter = xpu_input_type_.begin(); iter != xpu_input_type_.end(); + ++iter) { + if (num >= output_match_num) { + break; + } + + if (iter->first != var_name) { + continue; + } + + ++num; + xpu_input_type_.emplace(inplace_op_input_name, iter->second); + } + VLOG(6) << "inplace op :" << inst.op_type() << "input prision" + << "replace by the next op input prision "; + VLOG(6) << "inplace op :" << inst.op_type() + << ", inpute name:" << inplace_op_input_name + << ",the next op input input name : " << var_name; + } +} + +void XPUStaticKernelPickPass::InplaceOpScore( + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names, + bool* type_match, + size_t* score) { + PrecisionType pre_op_output_precision = PrecisionType::kUnk; + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + VLOG(6) << "current kernel input data variable name:" << in_names[i] + << "Parameter name:" << tmp; + if (input_parameter_name_.count(tmp) && + xpu_output_type_.count(in_names[i])) { + size_t score_tmp = 0; + pre_op_output_precision = xpu_output_type_[in_names[i]]; + if (kernel.GetInputDeclType(tmp)->precision() == PrecisionType::kAny) { + GetScore(PrecisionType::kAny, &score_tmp); + VLOG(6) << "current inplace kernel input data precision:kAny"; + } + + if (pre_op_output_precision == + kernel.GetInputDeclType(tmp)->precision() || + pre_op_output_precision == PrecisionType::kAny) { + GetScore(pre_op_output_precision, &score_tmp); + *type_match = true; + VLOG(6) << "inplace op match input data precision"; + } + + *score += score_tmp; + } + } + + // collect inplace op output data precision + if (pre_op_output_precision != PrecisionType::kUnk) { + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + if (output_parameter_name_.count(tmp)) { + xpu_output_type_.emplace(out_names[i], pre_op_output_precision); + } + } + } +} + +void XPUStaticKernelPickPass::SpecialOpScore( + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names, + bool* type_match, + size_t* score) { + size_t score_tmp_all = 0; + bool intput_match = true; + bool output_match = true; + bool consider_cpu = false; + // delete?? 
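+  // Ops in consider_cpu_op_ (currently only "cast") are still scored even
+  // when the candidate kernel does not target kXPU.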
+ if (consider_cpu_op_.count(instruct.op_type())) { + consider_cpu = true; + } + + if (!(kernel.place().target == TARGET(kXPU) || consider_cpu)) { + return; + } + + // input data precision score + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + if (input_parameter_name_.count(tmp) == 0) { + continue; + } + + if (xpu_output_type_.count(in_names[i]) == 0) { + continue; + } + + VLOG(6) << "current kernel input data variable name:" << in_names[i] + << ", Parameter name:" << tmp; + + size_t score_tmp = 0; + if (kernel.GetInputDeclType(tmp)->precision() == PrecisionType::kAny) { + GetScore(PrecisionType::kAny, &score_tmp); + VLOG(6) << "match input data precision:kAny"; + } + + if (xpu_output_type_[in_names[i]] == + kernel.GetInputDeclType(tmp)->precision() || + xpu_output_type_[in_names[i]] == PrecisionType::kAny) { + GetScore(xpu_output_type_[in_names[i]], &score_tmp); + VLOG(6) << "match input data precision"; + } + + if (score_tmp == 0) { + output_match = false; + } + + score_tmp_all += score_tmp; + } + + // output data precision score + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + int output_match_num = xpu_input_type_.count(out_names[i]); + if (output_parameter_name_.count(tmp) == 0) { + continue; + } + + if (output_match_num == 0) { + continue; + } + + VLOG(6) << "current kernel output data variable name:" << out_names[i] + << ", Parameter name:" << tmp; + int num = 0; + size_t score_tmp = 0; + for (auto iter = xpu_input_type_.begin(); iter != xpu_input_type_.end(); + ++iter) { + if (num >= output_match_num) { + break; + } + + if (iter->first != out_names[i]) { + continue; + } + + ++num; + for (auto& map_kernel : iter->second) { + // Special op fetch + if (map_kernel.begin()->first.substr(0, 5) == "fetch") { + if (map_kernel.begin()->second == + kernel.GetOutputDeclType(tmp)->precision()) { + score_tmp = 500; + } + continue; + } + + if (kernel.GetOutputDeclType(tmp)->precision() == PrecisionType::kAny) { + VLOG(6) << "match precision:kAny,the next kernel's name:" + << map_kernel.begin()->first; + GetScore(PrecisionType::kAny, &score_tmp); + } + + if (map_kernel.begin()->second == + kernel.GetOutputDeclType(tmp)->precision() || + map_kernel.begin()->second == PrecisionType::kAny) { + VLOG(6) << "match next kernel's input data precision,the " + "next kernel name:" + << map_kernel.begin()->first; + GetScore(map_kernel.begin()->second, &score_tmp); + } + } + } + + if (score_tmp == 0) { + output_match = false; + } + score_tmp_all += score_tmp; + } + + if (score_tmp_all > 0) { + *type_match = intput_match & output_match; + } + + *score += score_tmp_all; +} + +void XPUStaticKernelPickPass::GetXPUDeviceType() { + int cur_dev_idx = 0; + uint64_t cur_dev_attr = 0; + + XPU_CALL(xpu_current_device(&cur_dev_idx)); + XPU_CALL(xpu_device_get_attr(&cur_dev_attr, XPUATTR_MODEL, cur_dev_idx)); + if (cur_dev_attr <= 1) { + VLOG(4) << "Currents XPU device : XPU1"; + xpu_disable_flag_ = "DISABLE_XPU1"; + } else if (cur_dev_attr >= 2 && cur_dev_attr <= 299) { + VLOG(4) << "Currents XPU device : XPU2"; + xpu_disable_flag_ = "DISABLE_XPU2"; + } else if (cur_dev_attr >= 300 && cur_dev_attr <= 599) { + VLOG(4) << "Currents XPU device : XPU3"; + xpu_disable_flag_ = "DISABLE_XPU3"; + } else { + VLOG(4) << "invaid XPU device"; + xpu_disable_flag_ = "NONE"; + } +} + +#endif +} // namespace mir +} // namespace lite +} // namespace paddle + 
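+// Bound to the XPU target only; the generic static_kernel_pick_pass is
+// excluded on kXPU so that only one kernel-pick pass applies per target.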
+REGISTER_MIR_PASS(__xpu__static_kernel_pick_pass, + paddle::lite::mir::XPUStaticKernelPickPass) + .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h new file mode 100644 index 00000000000..cb3d25d3309 --- /dev/null +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h @@ -0,0 +1,332 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include +#include +#include +#include +#include +#include "lite/core/optimizer/mir/pass.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace mir { + +/* + * XPUStaticKernelPickPass is a simple strategy for picking the kernel for each + * Operator using operator developer defined rule, there are many other tactics + * such as considering IO or kernel execution latency and we will implement them + * latter. + * + * There are two argument for this pass: + * - place, the target place. + * - kernel_pick_factors, the factors to consider in picking kernels. + * Set them first before execute the pass. + */ +class XPUStaticKernelPickPass : public mir::StmtPass { + public: + void Apply(const std::unique_ptr& graph) override; + + const core::KernelPickFactor& kernel_pick_factors() const { + return kernel_pick_factors_; + } + core::KernelPickFactor* mutable_kernel_pick_factors() { + return &kernel_pick_factors_; + } + + private: + // Score the kernel. + size_t KernelGrade(lite::mir::Node* node, + const lite::KernelBase& kernel, + const std::vector& places, + const std::map& in_types, + const std::map& out_types, + const std::vector& in_names, + const std::vector& out_names) { + const auto& instruct = node->AsStmt(); + CHECK_GT(places.size(), static_cast(0)) << "valid_places is empty."; + float final_score{-1.}; + Place winner_place{places[0]}; + const int kMax = + (std::numeric_limits::max)(); + size_t place_size = places.size(); + + // NOTE: We compare kernel's place with place in valid_places to select the + // best match place + // The place's order in valid_places array decide the user's + // preference + // final_score = weight * socre + // weight: The weight is compute with (valid_places.size() - i) / + // valid_places.size() as default. + // where i is the place's index in valid_places array. 
+ // score: score is the weighted sum of target、percision and layout + for (size_t i = 0; i < place_size; ++i) { + const auto& place = places[i]; + float weight = static_cast(place_size - i) / place_size; + VLOG(4) << "current place is " << place.DebugString() << ", idx : " << i + << ", weight : " << weight; + size_t score{}; + + // The more important factor comes first + if (kernel_pick_factors_.IsTargetConsidered() && + (place.target == kernel.target() || kernel.target() == TARGET(kAny) || + place.target == TARGET(kAny))) { + size_t target_score = + kMax / + static_cast(core::KernelPickFactor::Factor::TargetFirst); + score += target_score; + VLOG(4) << "[TargetConsidered score]:" << target_score; + } + VLOG(4) << "[score s1]:" << score; + + if (kernel_pick_factors_.IsPrecisionConsidered() && + (place.precision == kernel.precision() || + kernel.precision() == PRECISION(kFloat) || + kernel.precision() == PRECISION(kAny) || + place.precision == PRECISION(kAny))) { + // score skipped, if kernel is int8, but op is not int8 + if (!(kernel.precision() == PRECISION(kInt8) && + !instruct.op_info()->HasAttr("enable_int8"))) { + size_t precision_score = + kMax / + static_cast(core::KernelPickFactor::Factor::PrecisionFirst); + score += precision_score; + VLOG(4) << "[PrecisionConsidered score]:" << precision_score; + } + } + VLOG(4) << "[score s2]:" << score; + + if (kernel_pick_factors_.IsDataLayoutConsidered() && + (place.layout == kernel.layout() || + kernel.layout() == DATALAYOUT(kAny) || + place.layout == DATALAYOUT(kAny))) { + size_t datalayout_score = + kMax / + static_cast(core::KernelPickFactor::Factor::DataLayoutFirst); + score += datalayout_score; + VLOG(4) << "[DataLayoutConsidered score]:" << datalayout_score; + } + VLOG(4) << "[score s3]:" << score; + + // add new rules for precision: When the input types are consistent with + // kernel's input types, select the kernel of the precision. However, if + // the op is feed, we should compare the output precision type. + // Note that this strategy is not compatible with quantization, so skip + // quantization op. 
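+      // A successful precision match doubles the score; for XPU special and
+      // inplace ops the FP16 optimizer instead calls SpecialOpScore() /
+      // InplaceOpScore(), which add points per matching input/output argument.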
+ if (!instruct.op_info()->HasAttr("enable_int8")) { + bool type_match = true; + if (instruct.op_type() == "feed") { + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + if (out_types.count(out_names[i]) && + out_types.at(out_names[i]) != + kernel.GetOutputDeclType(tmp)->precision()) { + type_match = false; + } + } + } else { + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + if (in_types.count(in_names[i]) && + !PrecTypeCompatible( + in_types.at(in_names[i]), + kernel.GetInputDeclType(tmp)->precision())) { + type_match = false; + } + } + } +#ifdef LITE_WITH_XPU + if (xpu_use_fp16_optimizer_ && + (xpu_special_op_.count(instruct.op_type()) || + xpu_inplace_op_.count(instruct.op_type()))) { + type_match = false; + if (kernel.summary().find(xpu_disable_flag_) != std::string::npos) { + score = 0; + VLOG(6) << " ignore pick current kernel:" << kernel.summary(); + } else if (xpu_inplace_op_.count(instruct.op_type())) { + InplaceOpScore( + kernel, instruct, in_names, out_names, &type_match, &score); + } else { + SpecialOpScore( + kernel, instruct, in_names, out_names, &type_match, &score); + } + } +#endif + + if (type_match) { + score *= 2; + VLOG(4) << "[Input/Output precision compatible]: *2"; + } + VLOG(4) << "[score s4]:" << score; + } +#ifdef LITE_WITH_XPU + ForceUseFP32Kernel(&score, kernel, instruct); + ForceUseInt8Kernel(&score, kernel, instruct); +#endif + + // add new rules for datatype: When the input types are consistent with + // kernel's input types, select the kernel of the datatype. + if (instruct.op_info()->Type() != "conditional_block" && + instruct.op_info()->Type() != "while" && + instruct.op_info()->Type() != "subgraph") { + bool datatype_match = true; + for (auto* in : node->inlinks) { + if (!in->IsArg()) continue; + if (in->AsArg().name == "feed" || in->AsArg().is_persist) continue; + std::string argname; + instruct.op_info()->GetInputArgname(in->AsArg().name, &argname); + VLOG(5) << "intput var name : " << in->AsArg().name; + // only when datatype is LOD_TENSOR, LOD_TENSOR_ARRAY, STEP_SCOPES, + // the type pointer is not null; + if (in->AsArg().type) { + VLOG(5) << "input datatype : " + << static_cast(in->AsArg().type->id()); + VLOG(5) << "kernel bind datatype : " + << static_cast(kernel.GetInputDeclType(argname)->id()); + if (static_cast(in->AsArg().type->id()) != + static_cast(kernel.GetInputDeclType(argname)->id())) + datatype_match = false; + } else { + datatype_match = false; + } + } + if (datatype_match) { + score *= 2; + VLOG(4) << "[Input datatype compatible]: *2"; + } + VLOG(4) << "[score s5]:" << score; + } + + if (weight * score > final_score) { + final_score = weight * score; + winner_place = place; + } + } + + VLOG(2) << "-------- score summary for candidate kernel : " + << kernel.summary() << " --------"; + VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + << " " << DataLayoutToStr(winner_place.layout) << " " + << TargetToStr(winner_place.target); + VLOG(2) << " ===> kernel.place():" + << PrecisionToStr(kernel.place().precision) << " " + << DataLayoutToStr(kernel.place().layout) << " " + << TargetToStr(kernel.place().target); + VLOG(4) << "kernel.op_type():" << kernel.op_type(); + VLOG(4) << "kernel picker factors:" << kernel_pick_factors_; + VLOG(4) << "winner_picker place:" << winner_place.DebugString(); + VLOG(4) << "[score(final)]:" << final_score; + VLOG(4) << 
"------------------------------"; + + // The data layout is not considered, for the input and output arguments + // might have different data layout. + // TODO(Superjomn) reconsider the idea of taking the data layout as a kernel + // specification. + return final_score; + } + + // Compatible for PrecisionType. + // For cuda, in the process of choosing kernel, fp16 and fp32 are compatiable. + // If kernel's declared type is kAny, it is matched. + bool PrecTypeCompatible(const PrecisionType& p1, const PrecisionType& p2) { + if (p1 == p2 || p2 == PRECISION(kAny)) { + return true; + } else if ((p1 == PRECISION(kFP16) || p1 == PRECISION(kFloat)) && + (p2 == PRECISION(kFP16) || p2 == PRECISION(kFloat))) { + return true; + } else { + return false; + } + } +#ifdef LITE_WITH_XPU + void DicideUseFP16Optimizer(const std::unique_ptr& graph); + void ForceUseFP32Kernel(size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct); + void ForceUseInt8Kernel(size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct); + void GetScore(PrecisionType precision, size_t* score_tmp); + + void NodeInputPrecision(lite::mir::Node* node, + const std::unique_ptr& graph); + void InplaceNodeInputPrecision(lite::mir::Node* node); + void SpecialNodeInputPrecision(lite::mir::Node* node); + + void NodeOutputPrecision(const std::unique_ptr& graph, + lite::mir::Node* node); + void InplaceNodeOutputPrecision(const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names); + void SpecialNodeOutputPrecision( + const std::unique_ptr& graph, + lite::mir::Node* node, + const std::unique_ptr& kernel); + + void SpecialOpScore(const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names, + bool* type_match, + size_t* score); + void GetXPUDeviceType(); + void InplaceOpScore(const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names, + bool* type_match, + size_t* score); +#endif + + private: + core::KernelPickFactor kernel_pick_factors_; +#ifdef LITE_WITH_XPU + bool xpu_use_fp16_optimizer_{false}; + // TODO(quwei:) addn more op + const std::set PRECISION_INT31_OP_{"__xpu__fc"}; + const std::set PRECISION_INT8_OP_{"__xpu__fc"}; + const std::set input_parameter_name_{ + "Input", "X", "Y", "Branch", "BBoxes", "Scores", "repeat_times_tensor"}; + const std::set output_parameter_name_{ + "Output", "Out", "Boxes", "Scores", "Y"}; + std::multimap>> + xpu_input_type_{}; + std::map xpu_output_type_{}; + std::string xpu_disable_flag_{}; + const std::set consider_cpu_op_{"cast"}; + const std::set xpu_special_op_{"__xpu__fc", + "conv3d", + "__xpu__conv2d", + "gather", + "pool2d", + "concat", + "calib"}; + const std::set xpu_inplace_op_{"reshape", + "reshape2", + "flatten", + "flatten2", + "squeeze", + "squeeze2", + "unsqueeze", + "unsqueeze2"}; +#endif +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc index 0e3f3b0335d..d31f3d8d2b0 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc @@ -82,27 +82,6 @@ class XPUFcFuser : public FuseBase { op_desc.SetInput("Input", {matched.at("x")->arg()->name}); op_desc.SetInput("Filter", 
{matched.at("W")->arg()->name}); - std::string precision = "int16"; -#ifdef LITE_WITH_XPU - if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || - lite::TargetWrapperXPU::multi_encoder_precision == "int31") { - precision = "int31"; - VLOG(3) << "Use int31 in XPUFcOp"; - } else if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int8" || - lite::TargetWrapperXPU::multi_encoder_precision == "int8") { - precision = "int8"; - if (op_desc.HasAttr("enable_int8") && - op_desc.GetAttr("enable_int8")) { - CHECK(op_desc.HasAttr("X0_scale")) << " quant model fc no X0_scale"; - CHECK(op_desc.HasAttr("Y0_scale")) << " quant model fc no Y0_scale"; - VLOG(3) << "Use int8 quant model in XPUFcOp, InputMax:" - << 127 * op_desc.GetAttr>("X0_scale")[0] - << ", WeightMax: " - << 127 * op_desc.GetAttr>("Y0_scale")[0]; - } - VLOG(3) << "Use int8 in XPUFcOp"; - } -#endif if (with_bias_) { op_desc.SetInput("Bias", {matched.at("bias")->arg()->name}); } @@ -119,7 +98,6 @@ class XPUFcFuser : public FuseBase { output_node_name = "mul_out"; } op_desc.SetOutput("Output", {output_name}); - op_desc.SetAttr("precision", precision); std::map act_map{{"linear", 0}, {"relu", 1}, {"sigmoid", 2}, diff --git a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc index 0b2516a3bf4..e323efbf7f5 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -1176,7 +1176,7 @@ class XPUMultiEncoderFusePass : public ProgramPass { std::vector matmul_types{"matmul", "matmul_v2"}; std::vector mul_types{"mul", "matmul"}; std::vector with_q_scales{true, false}; - std::vector norm_befores{false}; + std::vector norm_befores{true, false}; std::string fc_precision; bool adaptive_seqlen = false; diff --git a/lite/core/optimizer/mir/static_kernel_pick_pass.cc b/lite/core/optimizer/mir/static_kernel_pick_pass.cc index 92695aa9ed7..236173558d0 100644 --- a/lite/core/optimizer/mir/static_kernel_pick_pass.cc +++ b/lite/core/optimizer/mir/static_kernel_pick_pass.cc @@ -193,4 +193,5 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(static_kernel_pick_pass, paddle::lite::mir::StaticKernelPickPass) - .BindTargets({TARGET(kAny)}); + .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}); diff --git a/lite/core/optimizer/optimizer.cc b/lite/core/optimizer/optimizer.cc index a6790eeb15d..a971ee6e048 100644 --- a/lite/core/optimizer/optimizer.cc +++ b/lite/core/optimizer/optimizer.cc @@ -14,6 +14,9 @@ #include "lite/core/optimizer/optimizer.h" #include +#ifdef LITE_WITH_XPU +#include "lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h" +#endif #include "lite/core/optimizer/mir/static_kernel_pick_pass.h" #include "lite/core/optimizer/mir/type_target_cast_pass.h" #include "lite/model_parser/model_parser.h" @@ -49,7 +52,6 @@ std::unique_ptr Optimizer::Run(Program&& program) { graph->SetValidPlaces(valid_places_); graphs_.emplace_back(std::move(graph)); } - SpecifyKernelPickTactic(kernel_pick_factor_); InitTargetTypeTransformPass(); InitControlFlowOpUnusedInputsAndOutputsEliminatePass(); @@ -63,8 +65,12 @@ std::unique_ptr Optimizer::Run(Program&& program) { } void Optimizer::SpecifyKernelPickTactic(core::KernelPickFactor factor) { + std::string static_pick_name = "static_kernel_pick_pass"; +#ifdef LITE_WITH_XPU + static_pick_name = "__xpu__static_kernel_pick_pass"; +#endif auto* pass = mir::PassManager::Global().LookUp( - 
"static_kernel_pick_pass"); + static_pick_name); CHECK(pass); *pass->mutable_kernel_pick_factors() = factor; @@ -229,6 +235,9 @@ std::unique_ptr RunDefaultOptimizer( "fpga_concat_fuse_pass", "control_flow_op_unused_inputs_and_outputs_eliminate_pass", "static_kernel_pick_pass", // pick original kernel from graph +#ifdef LITE_WITH_XPU + "__xpu__static_kernel_pick_pass", // xpu pick original kernel from graph +#endif "remove_tf_redundant_ops_pass", "variable_place_inference_pass", // inference arg/var's diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.cc b/lite/kernels/xpu/__xpu__conv2d_compute.cc index cad9a4fd691..fb65dea06c5 100644 --- a/lite/kernels/xpu/__xpu__conv2d_compute.cc +++ b/lite/kernels/xpu/__xpu__conv2d_compute.cc @@ -22,36 +22,12 @@ namespace lite { namespace kernels { namespace xpu { -template -bool QuantFilter(const float* filter_on_host, - T* quant_res, - float max, - int64_t len) { - return false; -} - -template <> -bool QuantFilter(const float* filter_on_host, - int16_t* quant_res, - float max, - int64_t len) { - paddle::lite::xpu::math::ConvertFP32ToInt16( - filter_on_host, quant_res, max, len); - return true; -} - -template <> -bool QuantFilter(const float* filter_on_host, - int8_t* quant_res, - float max, - int64_t len) { - paddle::lite::xpu::math::ConvertFP32ToInt8( - filter_on_host, quant_res, max, len); - return true; -} - -template -void XPUConv2dCompute::PrepareForRun() { +template +void XPUConv2dCompute::PrepareForRun() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int max_ptr_size = ctx.GetRawContext()->max_ptr_size(); @@ -60,12 +36,16 @@ void XPUConv2dCompute::PrepareForRun() { auto filter_dims = param.filter->dims(); xpu_quant_filter_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( filter_ptr, filter_dims, false); } -template -void XPUConv2dCompute::Run() { +template +void XPUConv2dCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -86,8 +66,8 @@ void XPUConv2dCompute::Run() { param.output_max->template mutable_data(TARGET(kXPU)); const auto* bias = param.has_bias ? param.bias->template data() : nullptr; - const float* branch = - param.has_branch ? param.branch->template data() : nullptr; + const DY* branch = + param.has_branch ? param.branch->template data() : nullptr; const float* input_max = param.input_max ? 
param.input_max->template data() : nullptr; xdnn::Activation_t act((xdnn::Activation_t::act_enum)act_type); @@ -101,15 +81,15 @@ void XPUConv2dCompute::Run() { CHECK_EQ(act_type, 0); if (branch_broadcast_guard_.get() == nullptr) { branch_broadcast_guard_ = TargetWrapperXPU::MallocScratchPad( - param.output->numel() * sizeof(float)); + param.output->numel() * sizeof(DY)); } else { - branch_broadcast_guard_->Reserve(param.output->numel() * sizeof(float)); + branch_broadcast_guard_->Reserve(param.output->numel() * sizeof(DY)); } - int r = xdnn::conv2d_fusion( + int r = xdnn::conv2d_fusion( ctx.GetRawContext(), - param.input->template data(), - reinterpret_cast(xpu_quant_filter_.data_ptr_), - reinterpret_cast(branch_broadcast_guard_->addr_), + param.input->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), + reinterpret_cast(branch_broadcast_guard_->addr_), batch, img_c, img_h, @@ -139,21 +119,21 @@ void XPUConv2dCompute::Run() { if (branch_shape > conv_out_shape) { param.output->Resize(lite::DDim(branch_shape)); } - float* output = param.output->template mutable_data(TARGET(kXPU)); - r = xdnn::broadcast_add( + DY* output = param.output->template mutable_data(TARGET(kXPU)); + r = xdnn::broadcast_add( ctx.GetRawContext(), - reinterpret_cast(branch_broadcast_guard_->addr_), + reinterpret_cast(branch_broadcast_guard_->addr_), branch, output, xshape, yshape); CHECK_EQ(r, 0); } else { - float* output = param.output->template mutable_data(TARGET(kXPU)); - int r = xdnn::conv2d_fusion( + DY* output = param.output->template mutable_data(TARGET(kXPU)); + int r = xdnn::conv2d_fusion( ctx.GetRawContext(), - param.input->template data(), - reinterpret_cast(xpu_quant_filter_.data_ptr_), + param.input->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), output, batch, img_c, @@ -182,11 +162,27 @@ void XPUConv2dCompute::Run() { } // namespace paddle namespace xpu = paddle::lite::kernels::xpu; -using XPUConv2dFp32 = xpu::XPUConv2dCompute; -using XPUConv2dInt8 = xpu::XPUConv2dCompute; +using XPUConv2dFP32 = + xpu::XPUConv2dCompute; + +using XPUConv2d_FP16_FP32_FP32 = + xpu::XPUConv2dCompute; + +using XPUConv2dFp16 = + xpu::XPUConv2dCompute; + +using XPUConv2d_FP16_FP16_FP32 = + xpu::XPUConv2dCompute; -REGISTER_LITE_KERNEL(__xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2dFp32, def) +using XPUConv2d_FP16_FP32_FP16 = + xpu::XPUConv2dCompute; + +using XPUConv2dInt8_FP32_FP32 = + xpu::XPUConv2dCompute; + +REGISTER_LITE_KERNEL( + __xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2d_FP16_FP32_FP32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) @@ -196,7 +192,71 @@ REGISTER_LITE_KERNEL(__xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2dFp32, def) .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); -REGISTER_LITE_KERNEL(__xpu__conv2d, kXPU, kInt8, kNCHW, XPUConv2dInt8, def) +REGISTER_LITE_KERNEL( + __xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2dFP32, XPU_Real_kFloat) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Branch", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + 
__xpu__conv2d, kXPU, kFP16, kNCHW, XPUConv2dFp16, XPU_FP16_FP16__FP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Branch", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kFP16, + kNCHW, + XPUConv2d_FP16_FP16_FP32, + XPU_FP16_FP16__FP32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Branch", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kFP16, + kNCHW, + XPUConv2d_FP16_FP32_FP16, + XPU_FP16_FP32__FP16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Branch", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kInt8, + kNCHW, + XPUConv2dInt8_FP32_FP32, + XPU_Int8_FP32_FP32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.h b/lite/kernels/xpu/__xpu__conv2d_compute.h index 69a9aec69c8..c3c31d94743 100644 --- a/lite/kernels/xpu/__xpu__conv2d_compute.h +++ b/lite/kernels/xpu/__xpu__conv2d_compute.h @@ -21,8 +21,11 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -template +template class XPUConv2dCompute : public KernelLite { public: using param_t = operators::XPUBlockFuseParam; diff --git a/lite/kernels/xpu/__xpu__fc_compute.cc b/lite/kernels/xpu/__xpu__fc_compute.cc index c22f69354bf..024e5d6ed9e 100644 --- a/lite/kernels/xpu/__xpu__fc_compute.cc +++ b/lite/kernels/xpu/__xpu__fc_compute.cc @@ -24,10 +24,15 @@ namespace lite { namespace kernels { namespace xpu { -void XPUFcCompute::PrepareForRun() { +template +void XPUFcCompute::PrepareForRun() { auto& ctx = this->ctx_->template As(); auto& param = this->template Param(); - auto w_ptr = param.w->data(); + auto w_ptr = param.w->template data(); auto weight_dims = param.w->dims(); bool quant_int8 = false; if (param.quant_w_max > 0.f) { @@ -55,26 +60,25 @@ void XPUFcCompute::PrepareForRun() { sizeof(float) * max_ptr_size, IoDirection::HtoD); return; - } - - if (param.precision == "int31") { - xpu_quant_weight_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - w_ptr, weight_dims, true); - CHECK(xpu_quant_weight_.max_ptr_ 
== nullptr) - << "int31 weight max should be null"; - } else if (param.precision == "int16") { - xpu_quant_weight_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - w_ptr, weight_dims, true); - } else if (param.precision == "int8") { + } else { xpu_quant_weight_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( w_ptr, weight_dims, true); + if (std::is_same::value) { + VLOG(6) + << "If fc compute precision is int31,must check weight max should " + "be null "; + CHECK(xpu_quant_weight_.max_ptr_ == nullptr) + << "int31 weight max should be null"; + } } } - -void XPUFcCompute::Run() { +template +void XPUFcCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -85,13 +89,15 @@ void XPUFcCompute::Run() { int n = param.w->dims()[1]; bool quant_int8 = param.quant_w_max > 0.f; - float* output_max = quant_int8 - ? nullptr - : param.output_max->mutable_data(TARGET(kXPU)); - const auto* bias = param.has_bias ? param.bias->data() : nullptr; + float* output_max = + quant_int8 ? nullptr + : param.output_max->template mutable_data(TARGET(kXPU)); + const auto* bias = + param.has_bias ? param.bias->template data() : nullptr; const float* input_max = quant_int8 ? reinterpret_cast(input_max_guard_->addr_) - : (param.input_max ? param.input_max->data() : nullptr); + : (param.input_max ? param.input_max->template data() + : nullptr); xdnn::Activation_t act((xdnn::Activation_t::act_enum)param.act_type); if (param.act_type == 5) { act.leaky_alpha = param.act_param; @@ -100,82 +106,28 @@ void XPUFcCompute::Run() { act.hard_sigmoid_slope = param.act_param; } // TODO(weihaoji): remove fc_int31 and fc_int16 after xpu fc wrapper refactor - if (param.precision == "int31") { - int r = xdnn::fc_fusion( - ctx.GetRawContext(), // ctx - param.input->data(), // x - reinterpret_cast(xpu_quant_weight_.data_ptr_), // w - param.output->mutable_data(TARGET(kXPU)), // y - m, // m - n, // n - k, // k - false, // x_trans - true, // w_trans - input_max, // x_maxptr - reinterpret_cast(xpu_quant_weight_.max_ptr_), // w_maxptr - output_max, // y_maxptr - k, // ldx - k, // ldw - n, // ldy - 1.0f, // alpha - 0.0f, // beta - bias, // bias - act); - CHECK_EQ(r, 0); - } else if (param.precision == "int16") { - int r = 0; - r = xdnn::fc_fusion( - ctx.GetRawContext(), // ctx - param.input->data(), // x - reinterpret_cast(xpu_quant_weight_.data_ptr_), // w - param.output->mutable_data(TARGET(kXPU)), // y - m, // m - n, // n - k, // k - false, // x_trans - true, // w_trans - input_max, // x_maxptr - reinterpret_cast(xpu_quant_weight_.max_ptr_), // w_maxptr - output_max, // y_maxptr - k, // ldx - k, // ldw - n, // ldy - 1.0f, // alpha - 0.0f, // beta - bias, // bias - act); // act - - CHECK_EQ(r, 0); - } else if (param.precision == "int8") { - bool x_trans = false; - bool w_trans = true; - int ldx = (x_trans ? m : k); - int ldw = (w_trans ? 
k : n); - int ldy = n; - int r = xdnn::fc_fusion( - ctx.GetRawContext(), /* context */ - param.input->data(), /* x */ - reinterpret_cast(xpu_quant_weight_.data_ptr_), - param.output->mutable_data(TARGET(kXPU)), /* y */ - m, /* m */ - n, /* n */ - k, /* k */ - x_trans, /* x_trans */ - w_trans, /* w_trans */ - input_max, /* x_max */ - reinterpret_cast(xpu_quant_weight_.max_ptr_), /* w_max */ - output_max, /* y_max */ - ldx, /* ldx */ - ldw, /* ldw */ - ldy, /* ldy */ - 1.0f, /* alpha */ - 0.0f, /* beta */ - bias, /* bias */ - act); /* act_type */ - CHECK_EQ(r, 0); - } else { - LOG(FATAL) << "Unsupport XPUFC Precision: " << param.precision; - } + + int r = xdnn::fc_fusion( + ctx.GetRawContext(), // ctx + param.input->template data(), // x + reinterpret_cast(xpu_quant_weight_.data_ptr_), // w + param.output->template mutable_data(TARGET(kXPU)), // y + m, // m + n, // n + k, // k + false, // x_trans + true, // w_trans + input_max, // x_maxptr + reinterpret_cast(xpu_quant_weight_.max_ptr_), // w_maxptr + output_max, // y_maxptr + k, // ldx + k, // ldw + n, // ldy + 1.0f, // alpha + 0.0f, // beta + bias, // bias + act); + CHECK_EQ(r, 0); } } // namespace xpu @@ -183,12 +135,37 @@ void XPUFcCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(__xpu__fc, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::XPUFcCompute, - def) +namespace xpu = paddle::lite::kernels::xpu; + +using XPUFC_FP32 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP32_FP32 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP16_FP16 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP32_FP16 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP16_FP32 = + xpu::XPUFcCompute; + +using XPUFC_Int8_FP32_FP32 = + xpu::XPUFcCompute; + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFloat, kNCHW, XPUFC_FP32, XPU_Real_kFloat) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__fc, kXPU, kFloat, kNCHW, XPUFC_FP16_FP32_FP32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) @@ -196,3 +173,49 @@ REGISTER_LITE_KERNEL(__xpu__fc, .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFP16, kNCHW, XPUFC_FP16_FP16_FP16, XPUFC_FP16_FP16_FP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFP16, kNCHW, XPUFC_FP16_FP32_FP16, XPUFC_FP16_FP32_FP16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", 
{LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFP16, kNCHW, XPUFC_FP16_FP16_FP32, XPUFC_FP16_FP16_FP32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFloat, kNCHW, XPUFC_Int8_FP32_FP32, XPU_Int8_FP32_FP32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__fc_compute.h b/lite/kernels/xpu/__xpu__fc_compute.h index 687f8d5e9c1..6d6dba66faf 100644 --- a/lite/kernels/xpu/__xpu__fc_compute.h +++ b/lite/kernels/xpu/__xpu__fc_compute.h @@ -20,8 +20,12 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -class XPUFcCompute : public KernelLite { +template +class XPUFcCompute : public KernelLite { public: using param_t = operators::XPUFcParam; @@ -32,8 +36,6 @@ class XPUFcCompute : public KernelLite { virtual ~XPUFcCompute() = default; private: - // TODO(weihaoji): remove cpu w_max after xpu fc wrapper refactor - float w_max; XPUScratchPadGuard input_max_guard_; XPUQuantData xpu_quant_weight_; }; diff --git a/lite/kernels/xpu/calib_compute.cc b/lite/kernels/xpu/calib_compute.cc index 34a6fb53d72..dc134fde02a 100644 --- a/lite/kernels/xpu/calib_compute.cc +++ b/lite/kernels/xpu/calib_compute.cc @@ -29,6 +29,9 @@ void CalibCompute::Run() { int numel = param.input->numel(); const auto* in_data = param.input->template data(); auto* out_data = param.output->template mutable_data(TARGET(kXPU)); + if (numel == 0) { + return; + } int r = xdnn::cast_v2( ctx.GetRawContext(), in_data, out_data, numel); CHECK_EQ(r, 0); @@ -43,31 +46,69 @@ using xpu_calib_int64_to_int32 = paddle::lite::kernels::xpu::CalibCompute; using xpu_calib_int32_to_int64 = paddle::lite::kernels::xpu::CalibCompute; +using xpu_calib_fp32_to_fp16 = + paddle::lite::kernels::xpu::CalibCompute; +using xpu_calib_fp16_to_fp32 = + paddle::lite::kernels::xpu::CalibCompute; REGISTER_LITE_KERNEL( - calib, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, int64_to_int32) + calib, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, calib_int64_to_int32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); REGISTER_LITE_KERNEL( - calib, kXPU, kFloat, kNCHW, xpu_calib_int32_to_int64, int32_to_int64) + calib, kXPU, kFloat, kNCHW, xpu_calib_int32_to_int64, calib_int32_to_int64) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); REGISTER_LITE_KERNEL( - calib_once, kXPU, kFloat, 
kNCHW, xpu_calib_int64_to_int32, int64_to_int32) + calib, kXPU, kFloat, kNCHW, xpu_calib_fp32_to_fp16, calib_fp32_to_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib, kXPU, kFloat, kNCHW, xpu_calib_fp16_to_fp32, calib_fp16_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(calib_once, + kXPU, + kFloat, + kNCHW, + xpu_calib_int64_to_int32, + calib_int64_to_int32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); -REGISTER_LITE_KERNEL( - calib_once, kXPU, kFloat, kNCHW, xpu_calib_int32_to_int64, int32_to_int64) +REGISTER_LITE_KERNEL(calib_once, + kXPU, + kFloat, + kNCHW, + xpu_calib_int32_to_int64, + calib_int32_to_int64) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kXPU, kFloat, kNCHW, xpu_calib_fp32_to_fp16, calib_fp32_to_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kXPU, kFloat, kNCHW, xpu_calib_fp16_to_fp32, calib_fp16_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); diff --git a/lite/kernels/xpu/concat_compute.cc b/lite/kernels/xpu/concat_compute.cc index e3fc5ef554d..9eceace16f5 100644 --- a/lite/kernels/xpu/concat_compute.cc +++ b/lite/kernels/xpu/concat_compute.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "lite/kernels/xpu/concat_compute.h" + #include #include + #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -23,8 +25,8 @@ namespace lite { namespace kernels { namespace xpu { -template -void ConcatCompute::Run() { +template +void ConcatCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -34,7 +36,7 @@ void ConcatCompute::Run() { ? 
param.axis + static_cast(ins[0]->dims().size()) : param.axis; - std::vector x_list; + std::vector x_list; std::vector> xdims_list; for (int i = 0; i < ins.size(); i++) { if (ins[i]->numel() > 0) { @@ -46,14 +48,14 @@ void ConcatCompute::Run() { xdims_list[i].back() = xdims_list[i].back() * 2; } x_list.push_back( - reinterpret_cast(ins[i]->template data())); + reinterpret_cast(ins[i]->template data())); } } if (x_list.size() > 1) { - int r = xdnn::concat( + int r = xdnn::concat( ctx.GetRawContext(), x_list, - reinterpret_cast( + reinterpret_cast( out->template mutable_data(TARGET(kXPU))), xdims_list, axis); @@ -75,37 +77,45 @@ void ConcatCompute::Run() { } // namespace kernels } // namespace lite } // namespace paddle - -REGISTER_LITE_KERNEL(concat, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::ConcatCompute, - def) +using concatfp32 = + paddle::lite::kernels::xpu::ConcatCompute; +using concatfp16 = + paddle::lite::kernels::xpu::ConcatCompute; +using concati16 = + paddle::lite::kernels::xpu::ConcatCompute; +using concati32 = + paddle::lite::kernels::xpu::ConcatCompute; +using concati64 = + paddle::lite::kernels::xpu::ConcatCompute; +REGISTER_LITE_KERNEL(concat, kXPU, kFloat, kNCHW, concatfp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .BindInput("AxisTensor", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .Finalize(); -REGISTER_LITE_KERNEL(concat, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::ConcatCompute, - concat_i32) +REGISTER_LITE_KERNEL(concat, kXPU, kFP16, kNCHW, concatfp16, concat_FP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL(concat, kXPU, kInt16, kNCHW, concati16, concat_INT16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt16))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt16))}) + .Finalize(); + +REGISTER_LITE_KERNEL(concat, kXPU, kInt32, kNCHW, concati32, concat_INT32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("AxisTensor", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); -REGISTER_LITE_KERNEL(concat, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::ConcatCompute, - concat_i64) +REGISTER_LITE_KERNEL(concat, kXPU, kInt64, kNCHW, concati64, concat_INT64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindInput("AxisTensor", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) diff --git a/lite/kernels/xpu/concat_compute.h b/lite/kernels/xpu/concat_compute.h index 218c4704557..964f94f8194 100644 --- a/lite/kernels/xpu/concat_compute.h +++ b/lite/kernels/xpu/concat_compute.h @@ -21,8 +21,8 @@ namespace lite { namespace kernels { namespace xpu { -template -class ConcatCompute : public KernelLite { +template +class ConcatCompute : public KernelLite { public: using param_t = operators::ConcatParam; diff --git a/lite/kernels/xpu/conv3d_compute.cc b/lite/kernels/xpu/conv3d_compute.cc index cc3ad389679..8416d964448 100644 --- a/lite/kernels/xpu/conv3d_compute.cc +++ 
b/lite/kernels/xpu/conv3d_compute.cc @@ -22,8 +22,26 @@ namespace lite { namespace kernels { namespace xpu { -template <> -void Conv3DCompute::Run() { +template +void Conv3DCompute::PrepareForRun() { + auto& param = this->template Param(); + auto filter_ptr = param.filter->template data(); + auto filter_dims = param.filter->dims(); + xpu_quant_filter_ = + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + filter_ptr, filter_dims, false); +} + +template +void Conv3DCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -34,11 +52,11 @@ void Conv3DCompute::Run() { auto paddings = *param.paddings; auto dilations = *param.dilations; - int r = xdnn::conv3d( + int r = xdnn::conv3d( ctx.GetRawContext(), /* context */ - param.x->data(), - param.filter->data(), /* weight */ - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), /* weight */ + param.output->template mutable_data(TARGET(kXPU)), x_dims[0], /* input_n */ x_dims[1], /* input_c */ x_dims[2], /* input_d */ @@ -53,7 +71,7 @@ void Conv3DCompute::Run() { dilations, groups, nullptr, - nullptr, + reinterpret_cast(xpu_quant_filter_.max_ptr_), nullptr, true /*is_ncdhw*/); CHECK_EQ(r, 0); @@ -65,11 +83,61 @@ void Conv3DCompute::Run() { } // namespace paddle namespace xpu = paddle::lite::kernels::xpu; -using Conv3dFp32 = xpu::Conv3DCompute; -REGISTER_LITE_KERNEL(conv3d, kXPU, kFloat, kNCHW, Conv3dFp32, def) +using XPUConv3dFP32 = + xpu::Conv3DCompute; + +using XPUConv3d_FP16_FP32_FP32 = + xpu::Conv3DCompute; + +using XPUConv3dFp16 = + xpu::Conv3DCompute; + +using XPUConv3d_FP16_FP16_FP32 = + xpu::Conv3DCompute; + +using XPUConv3d_FP16_FP32_FP16 = + xpu::Conv3DCompute; + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFloat, kNCHW, XPUConv3dFP32, XPU_Real_kFloat) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); + +REGISTER_LITE_KERNEL(conv3d, kXPU, kFloat, kNCHW, XPUConv3d_FP16_FP32_FP32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFP16, kNCHW, XPUConv3dFp16, XPU_FP16_FP16_FP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFP16, kNCHW, XPUConv3d_FP16_FP16_FP32, XPU_FP16_FP16_FP32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFP16, kNCHW, XPUConv3d_FP16_FP32_FP16, XPU_FP16_FP32_FP16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + 
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/xpu/conv3d_compute.h b/lite/kernels/xpu/conv3d_compute.h index caadb82a1e8..4cd5fdaeca7 100644 --- a/lite/kernels/xpu/conv3d_compute.h +++ b/lite/kernels/xpu/conv3d_compute.h @@ -21,14 +21,22 @@ namespace lite { namespace kernels { namespace xpu { -template -class Conv3DCompute : public KernelLite { +template +class Conv3DCompute : public KernelLite { public: using param_t = operators::ConvParam; + void PrepareForRun() override; void Run() override; virtual ~Conv3DCompute() = default; + + private: + XPUQuantData xpu_quant_filter_; }; } // namespace xpu diff --git a/lite/kernels/xpu/gather_compute.cc b/lite/kernels/xpu/gather_compute.cc index f3eafc878fb..697204689d9 100644 --- a/lite/kernels/xpu/gather_compute.cc +++ b/lite/kernels/xpu/gather_compute.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "lite/kernels/xpu/gather_compute.h" + #include + #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -22,8 +24,8 @@ namespace lite { namespace kernels { namespace xpu { -template -void GatherCompute::Run() { +template +void GatherCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -46,88 +48,16 @@ void GatherCompute::Run() { axis += x_dims.size(); } - if (param.X->precision() == PrecisionType::kInt64 && - param.Index->precision() == PrecisionType::kInt64) { - auto* index_int64 = param.Index->template data(); - int size = param.Index->dims().production(); - XPUScratchPadGuard index_xpu_guard_ = - TargetWrapperXPU::MallocScratchPad(size * sizeof(int)); - int* index_int32_device = reinterpret_cast(index_xpu_guard_->addr_); - - int r0 = xdnn::cast_v2( - ctx.GetRawContext(), index_int64, index_int32_device, index->numel()); - CHECK_EQ(r0, 0); + int r = xdnn::gather( + ctx.GetRawContext(), + x->template data(), + index->template data(), + out->template mutable_data(TARGET(kXPU)), + x_dims, + index->numel(), + axis); - int r1 = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index_int32_device, - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r1, 0); - } else if (param.X->precision() == PrecisionType::kInt64 && - param.Index->precision() == PrecisionType::kInt32) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kInt32 && - param.Index->precision() == PrecisionType::kInt32) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kInt32 && - param.Index->precision() == PrecisionType::kInt64) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kFloat && - param.Index->precision() == PrecisionType::kInt32) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - 
} else if (param.X->precision() == PrecisionType::kFloat && - param.Index->precision() == PrecisionType::kInt64) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else { - LOG(FATAL) << "Unsupported gather op with x dtype: " - << lite_api::PrecisionToStr(param.X->precision()) - << " and index dtype: " - << lite_api::PrecisionToStr(param.Index->precision()); - } + CHECK_EQ(r, 0); } } // namespace xpu @@ -141,10 +71,21 @@ REGISTER_LITE_KERNEL(gather, kXPU, kFloat, kNCHW, GatherXPUFloatInt32, def) {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Axis", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .Finalize(); + REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUFloatInt64, gather_float_i64) + gather, kXPU, kFP16, kNCHW, GatherXPUkFP16Int32, gather_FP16_Int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindInput("Axis", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + gather, kXPU, kFloat, kNCHW, GatherXPUFloatInt64, gather_FP32_INT64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) @@ -153,7 +94,7 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt32Int32, gather_i32_i32) + gather, kXPU, kInt32, kNCHW, GatherXPUInt32Int32, gather_INT32_INT32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) @@ -162,7 +103,7 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt32Int64, gather_i32_i64) + gather, kXPU, kInt32, kNCHW, GatherXPUInt32Int64, gather_INT32_INT64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) @@ -171,7 +112,7 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt64Int32, gather_i64_i32) + gather, kXPU, kInt64, kNCHW, GatherXPUInt64Int32, gather_INT64_INT32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) @@ -179,12 +120,3 @@ REGISTER_LITE_KERNEL( {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); -REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt64Int64, gather_i64_i64) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) - .BindInput("Index", - {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) - .BindInput("Axis", - {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), 
PRECISION(kInt64))}) - .Finalize(); diff --git a/lite/kernels/xpu/gather_compute.h b/lite/kernels/xpu/gather_compute.h index a78be677d09..2363e8651ca 100644 --- a/lite/kernels/xpu/gather_compute.h +++ b/lite/kernels/xpu/gather_compute.h @@ -21,8 +21,8 @@ namespace lite { namespace kernels { namespace xpu { -template -class GatherCompute : public KernelLite { +template +class GatherCompute : public KernelLite { public: using param_t = operators::GatherParam; @@ -36,15 +36,27 @@ class GatherCompute : public KernelLite { } // namespace lite } // namespace paddle -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt32Int32; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt32Int64; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUFloatInt32; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute + GatherXPUkFP16Int32; +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUFloatInt64; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt64Int32; -typedef paddle::lite::kernels::xpu::GatherCompute - GatherXPUInt64Int64; diff --git a/lite/kernels/xpu/pool_compute.cc b/lite/kernels/xpu/pool_compute.cc index 9df03bc3c48..8211de7e438 100644 --- a/lite/kernels/xpu/pool_compute.cc +++ b/lite/kernels/xpu/pool_compute.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "lite/kernels/xpu/pool_compute.h" + #include #include + #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -22,8 +24,8 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -void Pool2DCompute::Run() { +template +void Pool2DCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -55,8 +57,8 @@ void Pool2DCompute::Run() { if (param.pooling_type == "avg") { int r = xdnn::adaptive_avg_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), x_dims[0], x_dims[1], x_dims[2], @@ -68,8 +70,8 @@ void Pool2DCompute::Run() { } else { int r = xdnn::adaptive_max_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), nullptr, x_dims[0], x_dims[1], @@ -82,10 +84,10 @@ void Pool2DCompute::Run() { } } else { if (param.pooling_type == "avg") { - int r = xdnn::avg_pool2d( + int r = xdnn::avg_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), x_dims[0], x_dims[1], x_dims[2], @@ -98,10 +100,10 @@ void Pool2DCompute::Run() { CHECK_EQ(r, 0); } else { if (param.pad_zero == true) { - int r = xdnn::max_pool2d( + int r = xdnn::max_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), nullptr, x_dims[0], x_dims[1], @@ -113,7 +115,7 @@ void Pool2DCompute::Run() { true); CHECK_EQ(r, 0); } else { - const float* xpu_x_padded = nullptr; + const InType* xpu_x_padded = nullptr; std::vector xpu_x_padded_dims{static_cast(x_dims[0]), static_cast(x_dims[1]), static_cast(x_dims[2]), @@ -121,7 +123,7 @@ void 
Pool2DCompute::Run() { XPUScratchPadGuard xpu_x_padded_guard_; if (paddings[0] == 0 && paddings[1] == 0 && paddings[2] == 0 && paddings[3] == 0) { - xpu_x_padded = param.x->data(); + xpu_x_padded = param.x->template data(); } else { std::vector pad_left{0, 0, paddings[0], paddings[2]}; std::vector pad_right{0, 0, paddings[1], paddings[3]}; @@ -130,25 +132,25 @@ void Pool2DCompute::Run() { xpu_x_padded_dims[3] = xpu_x_padded_dims[3] + paddings[2] + paddings[3]; xpu_x_padded_guard_ = TargetWrapperXPU::MallocScratchPad( - sizeof(float) * xpu_x_padded_dims[0] * xpu_x_padded_dims[1] * + sizeof(InType) * xpu_x_padded_dims[0] * xpu_x_padded_dims[1] * xpu_x_padded_dims[2] * xpu_x_padded_dims[3]); - xpu_x_padded = reinterpret_cast(xpu_x_padded_guard_->addr_); - int r = xdnn::pad(ctx.GetRawContext(), - param.x->data(), - const_cast(xpu_x_padded), - {static_cast(x_dims[0]), - static_cast(x_dims[1]), - static_cast(x_dims[2]), - static_cast(x_dims[3])}, - pad_left, - pad_right, - -9999999.0f); + xpu_x_padded = reinterpret_cast(xpu_x_padded_guard_->addr_); + int r = xdnn::pad(ctx.GetRawContext(), + param.x->template data(), + const_cast(xpu_x_padded), + {static_cast(x_dims[0]), + static_cast(x_dims[1]), + static_cast(x_dims[2]), + static_cast(x_dims[3])}, + pad_left, + pad_right, + -9999999.0f); CHECK_EQ(r, 0); } - int r = xdnn::max_pool2d( + int r = xdnn::max_pool2d( ctx.GetRawContext(), xpu_x_padded, - param.output->mutable_data(TARGET(kXPU)), + param.output->template mutable_data(TARGET(kXPU)), nullptr, xpu_x_padded_dims[0], xpu_x_padded_dims[1], @@ -168,19 +170,29 @@ void Pool2DCompute::Run() { } // namespace kernels } // namespace lite } // namespace paddle +// (TODO:quwei) refactor pool2d + +using pool2d_fp32 = + paddle::lite::kernels::xpu::Pool2DCompute; +using pool2d_fp16 = + paddle::lite::kernels::xpu::Pool2DCompute; + +using max_pool2d_with_index_fp32 = + paddle::lite::kernels::xpu::Pool2DCompute; + +REGISTER_LITE_KERNEL(pool2d, kXPU, kFloat, kNCHW, pool2d_fp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); REGISTER_LITE_KERNEL( - pool2d, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Pool2DCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + pool2d, kXPU, kFP16, kNCHW, pool2d_fp16, DISABLE_XPU1_pool2d_FP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) .Finalize(); -REGISTER_LITE_KERNEL(max_pool2d_with_index, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::Pool2DCompute, - def) +REGISTER_LITE_KERNEL( + max_pool2d_with_index, kXPU, kFloat, kNCHW, max_pool2d_with_index_fp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))}) diff --git a/lite/kernels/xpu/pool_compute.h b/lite/kernels/xpu/pool_compute.h index 39e14f04a8c..c107b2877b1 100644 --- a/lite/kernels/xpu/pool_compute.h +++ b/lite/kernels/xpu/pool_compute.h @@ -20,8 +20,8 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -class Pool2DCompute : public KernelLite { +template +class Pool2DCompute : public KernelLite { public: using param_t = operators::PoolParam; diff --git a/lite/operators/__xpu__fc_op.cc b/lite/operators/__xpu__fc_op.cc index 21f6faebcb5..a15d6c4fa88 
100644 --- a/lite/operators/__xpu__fc_op.cc +++ b/lite/operators/__xpu__fc_op.cc @@ -107,12 +107,8 @@ bool XPUFcOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { param_.input_max = scope->FindVar(op_desc.Input("InputMax").front())->GetMutable<lite::Tensor>(); } - if (op_desc.HasAttr("precision")) { - param_.precision = op_desc.GetAttr<std::string>("precision"); - } + if (op_desc.HasAttr("enable_int8") && op_desc.GetAttr<bool>("enable_int8")) { - CHECK(param_.precision == "int8") << "enable_int8 precison:" - << param_.precision; param_.quant_input_max = 127 * op_desc.GetAttr<std::vector<float>>("X0_scale")[0]; param_.quant_w_max =
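The hunk above drops the string "precision" attribute in favor of the standard enable_int8 flag and derives the quantized input's max directly from the first X0_scale entry. A minimal, self-contained sketch of why 127 * scale can serve as that max under the common symmetric int8 convention; the exact XPU quantization semantics are not shown in this diff, and QuantizeInt8 plus the sample values are illustrative only:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Symmetric per-tensor int8 quantization: q = round(x / scale), clamped to
// [-127, 127]. The largest representable float value is therefore 127 * scale.
std::vector<int8_t> QuantizeInt8(const std::vector<float>& x, float scale) {
  std::vector<int8_t> q(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    float v = std::round(x[i] / scale);
    q[i] = static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, v)));
  }
  return q;
}

int main() {
  std::vector<float> x = {0.5f, -1.25f, 2.0f};
  float max_abs = 2.0f;                    // largest magnitude in the tensor
  float scale = max_abs / 127.0f;          // per-tensor scale (one quant step)
  float quant_input_max = 127.0f * scale;  // == max_abs, the "max" a kernel needs
  auto q = QuantizeInt8(x, scale);
  std::cout << quant_input_max << " : ";
  for (int v : q) std::cout << v << " ";   // prints: 32 -79 127
  std::cout << "\n";
}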
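Taken together, the kernel changes in this patch follow one pattern: each compute class (__xpu__fc, calib, concat, conv3d, gather, pool2d) becomes a class template parameterized on its element/precision types, a using alias is declared per supported combination, and every alias is registered under its own name so the static kernel pick pass can choose between the FP32 and FP16 variants at graph-optimization time. A minimal, self-contained sketch of that alias-plus-registration idea; KernelBase, Registry, RegisterKernel, and the use of uint16_t as an fp16 stand-in are illustrative assumptions, not PaddleLite APIs:

#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct KernelBase {
  virtual void Run() = 0;
  virtual ~KernelBase() = default;
};

// Element types stand in for the diff's template parameters on the compute class.
template <typename InType, typename OutType>
struct ConcatKernel : KernelBase {
  void Run() override {
    std::cout << "concat: " << sizeof(InType) << "-byte in, "
              << sizeof(OutType) << "-byte out\n";
  }
};

// Registry maps "op/alias" to a factory, roughly what the REGISTER_LITE_KERNEL
// macros accomplish through static registration in the real code base.
using Factory = std::function<std::unique_ptr<KernelBase>()>;
std::map<std::string, Factory>& Registry() {
  static std::map<std::string, Factory> r;
  return r;
}

template <typename Kernel>
void RegisterKernel(const std::string& op, const std::string& alias) {
  Registry()[op + "/" + alias] = [] { return std::make_unique<Kernel>(); };
}

// Aliases mirror the diff's `using concatfp32 = ...;` / `using concatfp16 = ...;`
// lines; uint16_t is only a stand-in for an fp16 storage type here.
using ConcatFP32 = ConcatKernel<float, float>;
using ConcatFP16 = ConcatKernel<uint16_t, uint16_t>;

int main() {
  RegisterKernel<ConcatFP32>("concat", "def");
  RegisterKernel<ConcatFP16>("concat", "concat_FP16");
  // A kernel pick pass would select the alias whose precisions match the graph.
  Registry()["concat/concat_FP16"]()->Run();
  Registry()["concat/def"]()->Run();
  return 0;
}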
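One detail worth calling out from the pool2d hunk: when pad_zero is false, the kernel first pads the input with -9999999.0f and only then calls max_pool2d with no padding of its own, so the filler cells can never win the max reduction. A small stand-alone sketch of that trick; PadBottomRight and MaxPool2x2 are illustrative helpers, not xdnn calls:

#include <algorithm>
#include <iostream>
#include <vector>

// Pad an h x w single-channel map with `value` on the bottom/right only,
// mirroring how extra rows/columns are appended before pooling.
std::vector<float> PadBottomRight(const std::vector<float>& x, int h, int w,
                                  int bottom, int right, float value) {
  int ph = h + bottom, pw = w + right;
  std::vector<float> y(static_cast<size_t>(ph) * pw, value);
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j) y[i * pw + j] = x[i * w + j];
  return y;
}

// 2x2 max pooling, stride 2, no padding of its own.
std::vector<float> MaxPool2x2(const std::vector<float>& x, int h, int w) {
  std::vector<float> y((h / 2) * (w / 2));
  for (int i = 0; i + 1 < h; i += 2)
    for (int j = 0; j + 1 < w; j += 2) {
      float m = std::max(std::max(x[i * w + j], x[i * w + j + 1]),
                         std::max(x[(i + 1) * w + j], x[(i + 1) * w + j + 1]));
      y[(i / 2) * (w / 2) + (j / 2)] = m;
    }
  return y;
}

int main() {
  std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8, 9};  // 3 x 3 map
  // Pad to 4 x 4 so every element is covered by a 2x2 window; the very
  // negative filler is never selected by the max.
  auto padded = PadBottomRight(x, 3, 3, /*bottom=*/1, /*right=*/1, -9999999.0f);
  auto out = MaxPool2x2(padded, 4, 4);
  for (float v : out) std::cout << v << " ";  // prints: 5 6 8 9
  std::cout << "\n";
}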