diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index e268b1dc54f..e3eb8aea3aa 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -107,6 +107,7 @@ USE_MIR_PASS(__xpu__bigru_fuse_pass); USE_MIR_PASS(__xpu__dynamic_lstm_fuse_pass); USE_MIR_PASS(__xpu__multi_softmax_fuse_pass); USE_MIR_PASS(__xpu__max_pooling_pad_zero_detect_fuse_pass); +USE_MIR_PASS(__xpu__static_kernel_pick_pass); USE_MIR_PASS(x86_int8_attribute_pass); USE_MIR_PASS(fill_range_fuse_pass); USE_MIR_PASS(range_calc_offline_pass); diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc new file mode 100644 index 00000000000..038e7e22678 --- /dev/null +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc @@ -0,0 +1,736 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h" +#include +#include +#include +#include +#include +#include +#include +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif +#include "lite/core/optimizer/mir/graph_visualize_pass.h" +#include "lite/core/optimizer/mir/pass_registry.h" +namespace paddle { +namespace lite { +namespace mir { + +bool XPUKernelScoreCmp(const std::pair>& a, + const std::pair>& b) { + return a.first > b.first; +} + +void XPUStaticKernelPickPass::Apply(const std::unique_ptr& graph) { + kernel_pick_factors_.ConsiderTarget(); + kernel_pick_factors_.ConsiderPrecision(); + kernel_pick_factors_.ConsiderDataLayout(); + CHECK(kernel_pick_factors_.any_factor_considered()) + << "kernel_pick_factors should be specified first"; + CHECK(graph) << "graph not valid"; + +// Collect input data precision for each node in the graph +#ifdef LITE_WITH_XPU + DicideUseFP16Optimizer(graph); + if (xpu_use_fp16_optimizer_) { + GetXPUDeviceType(); + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + if (xpu_special_op_.count(node->AsStmt().op_type())) { + SpecialNodeInputPrecision(node); + continue; + } + + if (xpu_inplace_op_.count(node->AsStmt().op_type())) { + continue; + } + + NodeInputPrecision(node, graph); + } + + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + if (xpu_inplace_op_.count(node->AsStmt().op_type()) == 0) { + continue; + } + + InplaceNodeInputPrecision(node); + } + } +#endif + + // sort kernels by the factors. 
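+  // For every stmt node below: score each candidate kernel with KernelGrade(),
+  // sort the candidates by descending score, record the picked output
+  // precision when the FP16 optimizer is enabled, and keep only the
+  // top-scoring kernel. Ops with the enable_int8 attribute are handled below.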
+ VLOG(2) << "graph block_idx: " << graph->blockIdx(); + VLOG(2) << "graph->mutable_nodes().size(): " << graph->mutable_nodes().size(); + size_t idx = 0; + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + auto& instruct = node->AsStmt(); + VLOG(2) << "pick kernel for op : " << instruct.op_type() << ", in block " + << graph->blockIdx() << ", idx : " << idx++; + + std::map in_types; + std::map out_types; + // threse precision info store in __model__ file, if selected fp16 kernel, + // the output precision should be changed + for (std::list::iterator i = node->inlinks.begin(); + i != node->inlinks.end(); + ++i) { + if ((*i)->arg()->type) + in_types[(*i)->arg()->name] = (*i)->arg()->type->precision(); + } + for (std::list::iterator i = node->outlinks.begin(); + i != node->outlinks.end(); + ++i) { + if ((*i)->arg()->type) + out_types[(*i)->arg()->name] = (*i)->arg()->type->precision(); + } + // Get candidate kernels + std::vector>> scored; + CHECK(!instruct.kernels().empty()) << "No kernels found for " + << instruct.op_type(); + + VLOG(2) << "candidate kernels size:" << instruct.kernels().size(); + + for (auto&& kernel : instruct.kernels()) { + VLOG(2) << "current candidate kernel is: " << kernel->summary(); + VLOG(2) << "valid_places size is: " << graph->valid_places().size(); + + float score = KernelGrade(node, + *kernel, + graph->valid_places(), + in_types, + out_types, + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); + + scored.emplace_back(score, std::move(kernel)); + } + std::stable_sort(scored.begin(), scored.end(), XPUKernelScoreCmp); + instruct.kernels().clear(); + + if (!instruct.op_info()->HasAttr("enable_int8")) { +#ifdef LITE_WITH_XPU + if (xpu_use_fp16_optimizer_) { + if (xpu_special_op_.count(node->AsStmt().op_type())) { + SpecialNodeOutputPrecision(graph, node, scored.front().second); + } else if (xpu_inplace_op_.count(node->AsStmt().op_type())) { + InplaceNodeOutputPrecision(node->AsStmt(), + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); + } else { + NodeOutputPrecision(graph, node); + } + } +#endif + + instruct.kernels().emplace_back(std::move(scored.front().second)); + VLOG(2) << "the final pick kernel is " + << instruct.kernels().front()->summary() << "\n\n"; + } else { + // TODO(quwei): consider XPU int8 data precision + bool out_type_int8 = true; + // Quantized lstm has fp32 output + if (instruct.op_type() == "lstm" || instruct.op_type() == "gru" || + instruct.op_type() == "__xpu__multi_encoder" || + instruct.op_type() == "__xpu__fc") { + out_type_int8 = false; + } + // Only if all ops linked to this op output has enable_int8 attr, + // then the op output type is int8, or fp32. + // Note, the quantized op linked to lstm and gru should output fp32 + // tensor. + for (auto* out_n : node->outlinks) { + CHECK(out_n->IsArg()); + for (auto* tmp_op : out_n->outlinks) { + CHECK(tmp_op->IsStmt()); + auto* tmp_op_info = tmp_op->AsStmt().op_info(); + if (!tmp_op_info->HasAttr("enable_int8") || + tmp_op_info->Type() == "lstm" || tmp_op_info->Type() == "gru" || + instruct.op_type() == "__xpu__multi_encoder" || + instruct.op_type() == "__xpu__fc") { + out_type_int8 = false; + break; + } + } + if (!out_type_int8) break; + } + // If the out_type_int8 is true, it turns out that the output type of + // this + // op can be int8. + // So we need to specify output scale for this op. 
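+      // The output scale is copied from the input scale that the adjacent
+      // consumer op declares for the same variable; the op is then reset with
+      // the updated desc and its candidate kernels are re-scored.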
+ if (out_type_int8) { + auto out_node = node->outlinks.front(); + CHECK(out_node->IsArg()); + auto out_node_name = out_node->arg()->name; + auto one_adj_op_node = out_node->outlinks.front(); + CHECK(one_adj_op_node->IsStmt()); + auto& one_adj_instruct = one_adj_op_node->AsStmt(); + CHECK(one_adj_instruct.op_info()->HasAttr("enable_int8")); + CHECK(one_adj_instruct.op_info()->HasInputScale(out_node_name)); + + instruct.mutable_op_info()->SetOutputScale( + out_node_name, + one_adj_instruct.op_info()->GetInputScale(out_node_name)); + + auto update_desc = *instruct.mutable_op_info(); + instruct.ResetOp(update_desc, graph->valid_places()); + scored.clear(); + for (auto&& kernel : instruct.kernels()) { + float score = KernelGrade(node, + *kernel, + graph->valid_places(), + in_types, + out_types, + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); + scored.emplace_back(score, std::move(kernel)); + } + std::stable_sort(scored.begin(), scored.end(), XPUKernelScoreCmp); + instruct.kernels().clear(); + } + // If the out_type_int8 is true, we should pick the kernel with the + // int8 input and int8 output. + // If the out_type_int8 is false, we should pick the kernel with the + // int8 input and fp32 output. + auto output_arguments = instruct.op_info()->OutputArgumentNames(); + for (auto& candidate : scored) { + bool all_output_type_match = true; + auto expect_output_type = + out_type_int8 ? PRECISION(kInt8) : PRECISION(kFloat); + + for (auto& arg_name : output_arguments) { + const Type* out_arg_ty = + candidate.second->GetOutputDeclType(arg_name); + if (out_arg_ty->precision() != expect_output_type) { + all_output_type_match = false; + } + } + + if (all_output_type_match) { + instruct.kernels().emplace_back(std::move(candidate.second)); + VLOG(2) << "instruct.kernels.emplace_back " + << instruct.kernels().front()->name(); + break; + } + } + CHECK(!instruct.kernels().empty()) << "No kernels found for " + << instruct.op_type(); + } + } +} + +#ifdef LITE_WITH_XPU +void XPUStaticKernelPickPass::DicideUseFP16Optimizer( + const std::unique_ptr& graph) { + if (graph->valid_places()[0].precision == PrecisionType::kFP16) { + xpu_use_fp16_optimizer_ = true; + VLOG(2) << "XPU auto use data precision: FP16/FP32/INT16 "; + } +} + +void XPUStaticKernelPickPass::ForceUseFP32Kernel( + size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct) { + if (kernel.place().target != TARGET(kXPU)) { + return; + } + + // only use in FC,it will not use in future. 
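+  // When int31 is requested via XPU_ENCODER_PRECISION / XPU_COMPUTE_PRECISION
+  // (or multi_encoder_precision), the "XPU_Real_kFloat" kernel gets its score
+  // doubled; otherwise such kernels are scored 0 so that the default int16
+  // implementations are picked.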
+ if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || + lite::TargetWrapperXPU::multi_encoder_precision == "int31") { + if (kernel.alias() == "XPU_Real_kFloat" && + instruct.op_type() == "__xpu__fc") { + *score *= 2; + VLOG(6) << "__xpu__fc: force use PRECISON INT31: *2"; + } + return; + } + + if (GetStringFromEnv("XPU_COMPUTE_PRECISION", "int16") == "int31") { + if (kernel.alias() == "XPU_Real_kFloat" && + PRECISION_INT31_OP_.count(instruct.op_type())) { + *score *= 2; + VLOG(6) << instruct.op_type() << ": force use PRECISON INT31: *2"; + } + return; + } + + if (kernel.alias() == "XPU_Real_kFloat") { + *score = 0; + VLOG(6) << "By default,XPU not use PRECISION INT31, so not pick " + "current kernel: " + << kernel.summary(); + } +} + +void XPUStaticKernelPickPass::ForceUseInt8Kernel( + size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct) { + if (kernel.place().target != TARGET(kXPU)) { + return; + } + + // only use in FC,it will not use in future. + if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int8" || + lite::TargetWrapperXPU::multi_encoder_precision == "int8") { + if (kernel.alias() == "XPU_Int8_FP32_FP32" && + instruct.op_type() == "__xpu__fc") { + *score *= 2; + VLOG(6) << "__xpu__fc: force use PRECISON INT8: *2"; + } + return; + } + + if (GetStringFromEnv("XPU_COMPUTE_PRECISION", "int16") == "int8") { + if (kernel.alias() == "XPU_Int8_FP32_FP32" && + PRECISION_INT8_OP_.count(instruct.op_type())) { + *score *= 2; + VLOG(6) << instruct.op_type() << ": force use PRECISON INT8: *2"; + } + return; + } + + if (kernel.alias() == "XPU_Int8_FP32_FP32") { + *score = 0; + VLOG(6) << "By default,XPU not use PRECISION INT8, so not pick " + "current kernel: " + << kernel.summary(); + } +} + +void XPUStaticKernelPickPass::GetScore(PrecisionType precision, + size_t* score_tmp) { + if (precision == PrecisionType::kInt16) { + *score_tmp = *score_tmp > 9 ? *score_tmp : 9; + } else if (precision == PrecisionType::kFP16) { + *score_tmp = *score_tmp > 7 ? *score_tmp : 7; + } else if (precision == PrecisionType::kAny) { + *score_tmp = *score_tmp > 1 ? *score_tmp : 1; + } else { + *score_tmp = *score_tmp > 5 ? 
*score_tmp : 5; + } +} + +void XPUStaticKernelPickPass::NodeOutputPrecision( + const std::unique_ptr& graph, lite::mir::Node* node) { + auto& inst = node->AsStmt(); + if (inst.op_type() == "fetch") { + return; + } + + const auto* op_info = inst.op_info(); + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument,current var name : " << var_name; + VLOG(6) << " output arg name:" << arg_name << " var name:" << var_name; + Scope* scope = node->AsStmt().op()->scope(); + auto* var_ptr = scope->FindVar(var_name); + if (var_ptr == nullptr) { + VLOG(6) << "Can't find ouput var_name: " << var_name + << "in current scope."; + continue; + } + + PrecisionType precison = var_ptr->GetMutable()->precision(); + xpu_output_type_.emplace(var_name, precison); + } +} + +void XPUStaticKernelPickPass::SpecialNodeOutputPrecision( + const std::unique_ptr& graph, + lite::mir::Node* node, + const std::unique_ptr& kernel) { + auto& inst = node->AsStmt(); + + std::vector out_var_names; + const auto* op_info = inst.op_info(); + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument, current var name : " << var_name; + VLOG(6) << " output arg name:" << arg_name << " var name:" << var_name; + if (output_parameter_name_.count(arg_name) == 0) { + continue; + } + + const auto* decl_type = kernel->GetOutputDeclType(arg_name); + CHECK(decl_type); + PrecisionType precison = decl_type->precision(); + xpu_output_type_.emplace(var_name, precison); + } +} + +void XPUStaticKernelPickPass::InplaceNodeOutputPrecision( + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names) { + PrecisionType pre_op_output_precision = PrecisionType::kUnk; + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + VLOG(6) << "current kernel input data variable name:" << in_names[i] + << "Parameter name:" << tmp; + if (input_parameter_name_.count(tmp) && + xpu_output_type_.count(in_names[i])) { + pre_op_output_precision = xpu_output_type_[in_names[i]]; + } + } + + // collect inplace op output data precision + if (pre_op_output_precision != PrecisionType::kUnk) { + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + if (output_parameter_name_.count(tmp)) { + xpu_output_type_.emplace(out_names[i], pre_op_output_precision); + } + } + } +} + +// Special nodes like conv2d, matmul ; collect input data precision for eatch +// registry kernel as a candidate set. 
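+// For each input argument listed in input_parameter_name_, the declared input
+// precision of every registered candidate kernel is recorded (keyed by the
+// kernel summary) into xpu_input_type_, skipping kernels that are disabled
+// for the current XPU generation.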
+void XPUStaticKernelPickPass::SpecialNodeInputPrecision(lite::mir::Node* node) { + auto& inst = node->AsStmt(); + const auto* op_info = inst.op_info(); + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument,current var name : " << var_name; + VLOG(6) << " input arg name:" << arg_name << " var name:" << var_name; + if (input_parameter_name_.count(arg_name) == 0) { + continue; + } + + std::vector> kernel_input_type{}; + for (auto&& kernel : inst.kernels()) { + if (kernel->summary().find(xpu_disable_flag_) != std::string::npos) { + VLOG(6) << " ignore collect current kernel:" << kernel->summary(); + continue; + } + + std::map tmp_map; + PrecisionType precison; + + const auto* decl_type = kernel->GetInputDeclType(arg_name); + CHECK(decl_type); + precison = decl_type->precision(); + tmp_map.emplace(kernel->summary(), precison); + kernel_input_type.emplace_back(std::move(tmp_map)); + } + + xpu_input_type_.emplace(var_name, kernel_input_type); + } +} + +void XPUStaticKernelPickPass::NodeInputPrecision( + lite::mir::Node* node, const std::unique_ptr& graph) { + auto& inst = node->AsStmt(); + if (inst.op_type() == "feed") { + return; + } + + const auto* op_info = inst.op_info(); + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument,current var name : " << var_name; + VLOG(6) << " input arg name:" << arg_name << " var name:" << var_name; + + std::vector> kernel_input_type{}; + std::map tmp_map; + PrecisionType precison; + Scope* scope = node->AsStmt().op()->scope(); + + auto* var_ptr = scope->FindVar(var_name); + if (var_ptr == nullptr) { + VLOG(6) << "Can't find input var_name: " << var_name + << "in current scope."; + continue; + } + + precison = var_ptr->GetMutable()->precision(); + tmp_map.emplace(inst.op_type(), precison); + kernel_input_type.emplace_back(std::move(tmp_map)); + xpu_input_type_.emplace(var_name, kernel_input_type); + } +} + +// Special for inplace op. +void XPUStaticKernelPickPass::InplaceNodeInputPrecision(lite::mir::Node* node) { + auto& inst = node->AsStmt(); + const auto* op_info = inst.op_info(); + // inplace op only has one inpute variable. 
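+  // The precisions expected by the consumers of the inplace output are copied
+  // onto the inplace op's single input variable in xpu_input_type_, so the
+  // producer of that input is scored against the real downstream requirement.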
+ std::string inplace_op_input_name{"none"}; + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument,current var name : " << var_name; + VLOG(6) << " input arg name:" << arg_name << " var name:" << var_name; + if (input_parameter_name_.count(arg_name)) { + inplace_op_input_name = var_name; + } + } + + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + int num = 0; + + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument,current var name : " << var_name; + VLOG(6) << " output arg name:" << arg_name << " var name:" << var_name; + // inplace op only have one output variable,but ic can connect input + // variables of multiple Ops + int output_match_num = xpu_input_type_.count(var_name); + if (output_parameter_name_.count(arg_name) == 0 || output_match_num == 0) { + continue; + } + + for (auto iter = xpu_input_type_.begin(); iter != xpu_input_type_.end(); + ++iter) { + if (num >= output_match_num) { + break; + } + + if (iter->first != var_name) { + continue; + } + + ++num; + xpu_input_type_.emplace(inplace_op_input_name, iter->second); + } + VLOG(6) << "inplace op :" << inst.op_type() << "input prision" + << "replace by the next op input prision "; + VLOG(6) << "inplace op :" << inst.op_type() + << ", inpute name:" << inplace_op_input_name + << ",the next op input input name : " << var_name; + } +} + +void XPUStaticKernelPickPass::InplaceOpScore( + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names, + bool* type_match, + size_t* score) { + PrecisionType pre_op_output_precision = PrecisionType::kUnk; + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + VLOG(6) << "current kernel input data variable name:" << in_names[i] + << "Parameter name:" << tmp; + if (input_parameter_name_.count(tmp) && + xpu_output_type_.count(in_names[i])) { + size_t score_tmp = 0; + pre_op_output_precision = xpu_output_type_[in_names[i]]; + if (kernel.GetInputDeclType(tmp)->precision() == PrecisionType::kAny) { + GetScore(PrecisionType::kAny, &score_tmp); + VLOG(6) << "current inplace kernel input data precision:kAny"; + } + + if (pre_op_output_precision == + kernel.GetInputDeclType(tmp)->precision() || + pre_op_output_precision == PrecisionType::kAny) { + GetScore(pre_op_output_precision, &score_tmp); + *type_match = true; + VLOG(6) << "inplace op match input data precision"; + } + + *score += score_tmp; + } + } + + // collect inplace op output data precision + if (pre_op_output_precision != PrecisionType::kUnk) { + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + if (output_parameter_name_.count(tmp)) { + xpu_output_type_.emplace(out_names[i], pre_op_output_precision); + } + } + } +} + +void XPUStaticKernelPickPass::SpecialOpScore( + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names, + bool* type_match, + size_t* score) { + size_t score_tmp_all = 0; + bool intput_match = true; + bool output_match = true; + bool consider_cpu = false; + // delete?? 
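+  // Ops in consider_cpu_op_ (currently only "cast") are still scored even
+  // when the candidate kernel does not target kXPU.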
+ if (consider_cpu_op_.count(instruct.op_type())) { + consider_cpu = true; + } + + if (!(kernel.place().target == TARGET(kXPU) || consider_cpu)) { + return; + } + + // input data precision score + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + if (input_parameter_name_.count(tmp) == 0) { + continue; + } + + if (xpu_output_type_.count(in_names[i]) == 0) { + continue; + } + + VLOG(6) << "current kernel input data variable name:" << in_names[i] + << ", Parameter name:" << tmp; + + size_t score_tmp = 0; + if (kernel.GetInputDeclType(tmp)->precision() == PrecisionType::kAny) { + GetScore(PrecisionType::kAny, &score_tmp); + VLOG(6) << "match input data precision:kAny"; + } + + if (xpu_output_type_[in_names[i]] == + kernel.GetInputDeclType(tmp)->precision() || + xpu_output_type_[in_names[i]] == PrecisionType::kAny) { + GetScore(xpu_output_type_[in_names[i]], &score_tmp); + VLOG(6) << "match input data precision"; + } + + if (score_tmp == 0) { + output_match = false; + } + + score_tmp_all += score_tmp; + } + + // output data precision score + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + int output_match_num = xpu_input_type_.count(out_names[i]); + if (output_parameter_name_.count(tmp) == 0) { + continue; + } + + if (output_match_num == 0) { + continue; + } + + VLOG(6) << "current kernel output data variable name:" << out_names[i] + << ", Parameter name:" << tmp; + int num = 0; + size_t score_tmp = 0; + for (auto iter = xpu_input_type_.begin(); iter != xpu_input_type_.end(); + ++iter) { + if (num >= output_match_num) { + break; + } + + if (iter->first != out_names[i]) { + continue; + } + + ++num; + for (auto& map_kernel : iter->second) { + // Special op fetch + if (map_kernel.begin()->first.substr(0, 5) == "fetch") { + if (map_kernel.begin()->second == + kernel.GetOutputDeclType(tmp)->precision()) { + score_tmp = 500; + } + continue; + } + + if (kernel.GetOutputDeclType(tmp)->precision() == PrecisionType::kAny) { + VLOG(6) << "match precision:kAny,the next kernel's name:" + << map_kernel.begin()->first; + GetScore(PrecisionType::kAny, &score_tmp); + } + + if (map_kernel.begin()->second == + kernel.GetOutputDeclType(tmp)->precision() || + map_kernel.begin()->second == PrecisionType::kAny) { + VLOG(6) << "match next kernel's input data precision,the " + "next kernel name:" + << map_kernel.begin()->first; + GetScore(map_kernel.begin()->second, &score_tmp); + } + } + } + + if (score_tmp == 0) { + output_match = false; + } + score_tmp_all += score_tmp; + } + + if (score_tmp_all > 0) { + *type_match = intput_match & output_match; + } + + *score += score_tmp_all; +} + +void XPUStaticKernelPickPass::GetXPUDeviceType() { + int cur_dev_idx = 0; + uint64_t cur_dev_attr = 0; + + XPU_CALL(xpu_current_device(&cur_dev_idx)); + XPU_CALL(xpu_device_get_attr(&cur_dev_attr, XPUATTR_MODEL, cur_dev_idx)); + if (cur_dev_attr <= 1) { + VLOG(4) << "Currents XPU device : XPU1"; + xpu_disable_flag_ = "DISABLE_XPU1"; + } else if (cur_dev_attr >= 2 && cur_dev_attr <= 299) { + VLOG(4) << "Currents XPU device : XPU2"; + xpu_disable_flag_ = "DISABLE_XPU2"; + } else if (cur_dev_attr >= 300 && cur_dev_attr <= 599) { + VLOG(4) << "Currents XPU device : XPU3"; + xpu_disable_flag_ = "DISABLE_XPU3"; + } else { + VLOG(4) << "invaid XPU device"; + xpu_disable_flag_ = "NONE"; + } +} + +#endif +} // namespace mir +} // namespace lite +} // namespace paddle + 
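+// Bound to the XPU target only; the generic static_kernel_pick_pass is
+// excluded on kXPU so that only one kernel-pick pass applies per target.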
+REGISTER_MIR_PASS(__xpu__static_kernel_pick_pass, + paddle::lite::mir::XPUStaticKernelPickPass) + .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h new file mode 100644 index 00000000000..cb3d25d3309 --- /dev/null +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h @@ -0,0 +1,332 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include +#include +#include +#include +#include +#include "lite/core/optimizer/mir/pass.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace mir { + +/* + * XPUStaticKernelPickPass is a simple strategy for picking the kernel for each + * Operator using operator developer defined rule, there are many other tactics + * such as considering IO or kernel execution latency and we will implement them + * latter. + * + * There are two argument for this pass: + * - place, the target place. + * - kernel_pick_factors, the factors to consider in picking kernels. + * Set them first before execute the pass. + */ +class XPUStaticKernelPickPass : public mir::StmtPass { + public: + void Apply(const std::unique_ptr& graph) override; + + const core::KernelPickFactor& kernel_pick_factors() const { + return kernel_pick_factors_; + } + core::KernelPickFactor* mutable_kernel_pick_factors() { + return &kernel_pick_factors_; + } + + private: + // Score the kernel. + size_t KernelGrade(lite::mir::Node* node, + const lite::KernelBase& kernel, + const std::vector& places, + const std::map& in_types, + const std::map& out_types, + const std::vector& in_names, + const std::vector& out_names) { + const auto& instruct = node->AsStmt(); + CHECK_GT(places.size(), static_cast(0)) << "valid_places is empty."; + float final_score{-1.}; + Place winner_place{places[0]}; + const int kMax = + (std::numeric_limits::max)(); + size_t place_size = places.size(); + + // NOTE: We compare kernel's place with place in valid_places to select the + // best match place + // The place's order in valid_places array decide the user's + // preference + // final_score = weight * socre + // weight: The weight is compute with (valid_places.size() - i) / + // valid_places.size() as default. + // where i is the place's index in valid_places array. 
+ // score: score is the weighted sum of target、percision and layout + for (size_t i = 0; i < place_size; ++i) { + const auto& place = places[i]; + float weight = static_cast(place_size - i) / place_size; + VLOG(4) << "current place is " << place.DebugString() << ", idx : " << i + << ", weight : " << weight; + size_t score{}; + + // The more important factor comes first + if (kernel_pick_factors_.IsTargetConsidered() && + (place.target == kernel.target() || kernel.target() == TARGET(kAny) || + place.target == TARGET(kAny))) { + size_t target_score = + kMax / + static_cast(core::KernelPickFactor::Factor::TargetFirst); + score += target_score; + VLOG(4) << "[TargetConsidered score]:" << target_score; + } + VLOG(4) << "[score s1]:" << score; + + if (kernel_pick_factors_.IsPrecisionConsidered() && + (place.precision == kernel.precision() || + kernel.precision() == PRECISION(kFloat) || + kernel.precision() == PRECISION(kAny) || + place.precision == PRECISION(kAny))) { + // score skipped, if kernel is int8, but op is not int8 + if (!(kernel.precision() == PRECISION(kInt8) && + !instruct.op_info()->HasAttr("enable_int8"))) { + size_t precision_score = + kMax / + static_cast(core::KernelPickFactor::Factor::PrecisionFirst); + score += precision_score; + VLOG(4) << "[PrecisionConsidered score]:" << precision_score; + } + } + VLOG(4) << "[score s2]:" << score; + + if (kernel_pick_factors_.IsDataLayoutConsidered() && + (place.layout == kernel.layout() || + kernel.layout() == DATALAYOUT(kAny) || + place.layout == DATALAYOUT(kAny))) { + size_t datalayout_score = + kMax / + static_cast(core::KernelPickFactor::Factor::DataLayoutFirst); + score += datalayout_score; + VLOG(4) << "[DataLayoutConsidered score]:" << datalayout_score; + } + VLOG(4) << "[score s3]:" << score; + + // add new rules for precision: When the input types are consistent with + // kernel's input types, select the kernel of the precision. However, if + // the op is feed, we should compare the output precision type. + // Note that this strategy is not compatible with quantization, so skip + // quantization op. 
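+      // A successful precision match doubles the score; for XPU special and
+      // inplace ops the FP16 optimizer instead calls SpecialOpScore() /
+      // InplaceOpScore(), which add points per matching input/output argument.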
+ if (!instruct.op_info()->HasAttr("enable_int8")) { + bool type_match = true; + if (instruct.op_type() == "feed") { + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + if (out_types.count(out_names[i]) && + out_types.at(out_names[i]) != + kernel.GetOutputDeclType(tmp)->precision()) { + type_match = false; + } + } + } else { + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + if (in_types.count(in_names[i]) && + !PrecTypeCompatible( + in_types.at(in_names[i]), + kernel.GetInputDeclType(tmp)->precision())) { + type_match = false; + } + } + } +#ifdef LITE_WITH_XPU + if (xpu_use_fp16_optimizer_ && + (xpu_special_op_.count(instruct.op_type()) || + xpu_inplace_op_.count(instruct.op_type()))) { + type_match = false; + if (kernel.summary().find(xpu_disable_flag_) != std::string::npos) { + score = 0; + VLOG(6) << " ignore pick current kernel:" << kernel.summary(); + } else if (xpu_inplace_op_.count(instruct.op_type())) { + InplaceOpScore( + kernel, instruct, in_names, out_names, &type_match, &score); + } else { + SpecialOpScore( + kernel, instruct, in_names, out_names, &type_match, &score); + } + } +#endif + + if (type_match) { + score *= 2; + VLOG(4) << "[Input/Output precision compatible]: *2"; + } + VLOG(4) << "[score s4]:" << score; + } +#ifdef LITE_WITH_XPU + ForceUseFP32Kernel(&score, kernel, instruct); + ForceUseInt8Kernel(&score, kernel, instruct); +#endif + + // add new rules for datatype: When the input types are consistent with + // kernel's input types, select the kernel of the datatype. + if (instruct.op_info()->Type() != "conditional_block" && + instruct.op_info()->Type() != "while" && + instruct.op_info()->Type() != "subgraph") { + bool datatype_match = true; + for (auto* in : node->inlinks) { + if (!in->IsArg()) continue; + if (in->AsArg().name == "feed" || in->AsArg().is_persist) continue; + std::string argname; + instruct.op_info()->GetInputArgname(in->AsArg().name, &argname); + VLOG(5) << "intput var name : " << in->AsArg().name; + // only when datatype is LOD_TENSOR, LOD_TENSOR_ARRAY, STEP_SCOPES, + // the type pointer is not null; + if (in->AsArg().type) { + VLOG(5) << "input datatype : " + << static_cast(in->AsArg().type->id()); + VLOG(5) << "kernel bind datatype : " + << static_cast(kernel.GetInputDeclType(argname)->id()); + if (static_cast(in->AsArg().type->id()) != + static_cast(kernel.GetInputDeclType(argname)->id())) + datatype_match = false; + } else { + datatype_match = false; + } + } + if (datatype_match) { + score *= 2; + VLOG(4) << "[Input datatype compatible]: *2"; + } + VLOG(4) << "[score s5]:" << score; + } + + if (weight * score > final_score) { + final_score = weight * score; + winner_place = place; + } + } + + VLOG(2) << "-------- score summary for candidate kernel : " + << kernel.summary() << " --------"; + VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + << " " << DataLayoutToStr(winner_place.layout) << " " + << TargetToStr(winner_place.target); + VLOG(2) << " ===> kernel.place():" + << PrecisionToStr(kernel.place().precision) << " " + << DataLayoutToStr(kernel.place().layout) << " " + << TargetToStr(kernel.place().target); + VLOG(4) << "kernel.op_type():" << kernel.op_type(); + VLOG(4) << "kernel picker factors:" << kernel_pick_factors_; + VLOG(4) << "winner_picker place:" << winner_place.DebugString(); + VLOG(4) << "[score(final)]:" << final_score; + VLOG(4) << 
"------------------------------"; + + // The data layout is not considered, for the input and output arguments + // might have different data layout. + // TODO(Superjomn) reconsider the idea of taking the data layout as a kernel + // specification. + return final_score; + } + + // Compatible for PrecisionType. + // For cuda, in the process of choosing kernel, fp16 and fp32 are compatiable. + // If kernel's declared type is kAny, it is matched. + bool PrecTypeCompatible(const PrecisionType& p1, const PrecisionType& p2) { + if (p1 == p2 || p2 == PRECISION(kAny)) { + return true; + } else if ((p1 == PRECISION(kFP16) || p1 == PRECISION(kFloat)) && + (p2 == PRECISION(kFP16) || p2 == PRECISION(kFloat))) { + return true; + } else { + return false; + } + } +#ifdef LITE_WITH_XPU + void DicideUseFP16Optimizer(const std::unique_ptr& graph); + void ForceUseFP32Kernel(size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct); + void ForceUseInt8Kernel(size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct); + void GetScore(PrecisionType precision, size_t* score_tmp); + + void NodeInputPrecision(lite::mir::Node* node, + const std::unique_ptr& graph); + void InplaceNodeInputPrecision(lite::mir::Node* node); + void SpecialNodeInputPrecision(lite::mir::Node* node); + + void NodeOutputPrecision(const std::unique_ptr& graph, + lite::mir::Node* node); + void InplaceNodeOutputPrecision(const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names); + void SpecialNodeOutputPrecision( + const std::unique_ptr& graph, + lite::mir::Node* node, + const std::unique_ptr& kernel); + + void SpecialOpScore(const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names, + bool* type_match, + size_t* score); + void GetXPUDeviceType(); + void InplaceOpScore(const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names, + bool* type_match, + size_t* score); +#endif + + private: + core::KernelPickFactor kernel_pick_factors_; +#ifdef LITE_WITH_XPU + bool xpu_use_fp16_optimizer_{false}; + // TODO(quwei:) addn more op + const std::set PRECISION_INT31_OP_{"__xpu__fc"}; + const std::set PRECISION_INT8_OP_{"__xpu__fc"}; + const std::set input_parameter_name_{ + "Input", "X", "Y", "Branch", "BBoxes", "Scores", "repeat_times_tensor"}; + const std::set output_parameter_name_{ + "Output", "Out", "Boxes", "Scores", "Y"}; + std::multimap>> + xpu_input_type_{}; + std::map xpu_output_type_{}; + std::string xpu_disable_flag_{}; + const std::set consider_cpu_op_{"cast"}; + const std::set xpu_special_op_{"__xpu__fc", + "conv3d", + "__xpu__conv2d", + "gather", + "pool2d", + "concat", + "calib"}; + const std::set xpu_inplace_op_{"reshape", + "reshape2", + "flatten", + "flatten2", + "squeeze", + "squeeze2", + "unsqueeze", + "unsqueeze2"}; +#endif +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc index 0e3f3b0335d..d31f3d8d2b0 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc @@ -82,27 +82,6 @@ class XPUFcFuser : public FuseBase { op_desc.SetInput("Input", {matched.at("x")->arg()->name}); op_desc.SetInput("Filter", 
{matched.at("W")->arg()->name}); - std::string precision = "int16"; -#ifdef LITE_WITH_XPU - if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || - lite::TargetWrapperXPU::multi_encoder_precision == "int31") { - precision = "int31"; - VLOG(3) << "Use int31 in XPUFcOp"; - } else if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int8" || - lite::TargetWrapperXPU::multi_encoder_precision == "int8") { - precision = "int8"; - if (op_desc.HasAttr("enable_int8") && - op_desc.GetAttr("enable_int8")) { - CHECK(op_desc.HasAttr("X0_scale")) << " quant model fc no X0_scale"; - CHECK(op_desc.HasAttr("Y0_scale")) << " quant model fc no Y0_scale"; - VLOG(3) << "Use int8 quant model in XPUFcOp, InputMax:" - << 127 * op_desc.GetAttr>("X0_scale")[0] - << ", WeightMax: " - << 127 * op_desc.GetAttr>("Y0_scale")[0]; - } - VLOG(3) << "Use int8 in XPUFcOp"; - } -#endif if (with_bias_) { op_desc.SetInput("Bias", {matched.at("bias")->arg()->name}); } @@ -119,7 +98,6 @@ class XPUFcFuser : public FuseBase { output_node_name = "mul_out"; } op_desc.SetOutput("Output", {output_name}); - op_desc.SetAttr("precision", precision); std::map act_map{{"linear", 0}, {"relu", 1}, {"sigmoid", 2}, diff --git a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc index 0b2516a3bf4..e323efbf7f5 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -1176,7 +1176,7 @@ class XPUMultiEncoderFusePass : public ProgramPass { std::vector matmul_types{"matmul", "matmul_v2"}; std::vector mul_types{"mul", "matmul"}; std::vector with_q_scales{true, false}; - std::vector norm_befores{false}; + std::vector norm_befores{true, false}; std::string fc_precision; bool adaptive_seqlen = false; diff --git a/lite/core/optimizer/mir/static_kernel_pick_pass.cc b/lite/core/optimizer/mir/static_kernel_pick_pass.cc index 92695aa9ed7..236173558d0 100644 --- a/lite/core/optimizer/mir/static_kernel_pick_pass.cc +++ b/lite/core/optimizer/mir/static_kernel_pick_pass.cc @@ -193,4 +193,5 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(static_kernel_pick_pass, paddle::lite::mir::StaticKernelPickPass) - .BindTargets({TARGET(kAny)}); + .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}); diff --git a/lite/core/optimizer/optimizer.cc b/lite/core/optimizer/optimizer.cc index a6790eeb15d..a971ee6e048 100644 --- a/lite/core/optimizer/optimizer.cc +++ b/lite/core/optimizer/optimizer.cc @@ -14,6 +14,9 @@ #include "lite/core/optimizer/optimizer.h" #include +#ifdef LITE_WITH_XPU +#include "lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h" +#endif #include "lite/core/optimizer/mir/static_kernel_pick_pass.h" #include "lite/core/optimizer/mir/type_target_cast_pass.h" #include "lite/model_parser/model_parser.h" @@ -49,7 +52,6 @@ std::unique_ptr Optimizer::Run(Program&& program) { graph->SetValidPlaces(valid_places_); graphs_.emplace_back(std::move(graph)); } - SpecifyKernelPickTactic(kernel_pick_factor_); InitTargetTypeTransformPass(); InitControlFlowOpUnusedInputsAndOutputsEliminatePass(); @@ -63,8 +65,12 @@ std::unique_ptr Optimizer::Run(Program&& program) { } void Optimizer::SpecifyKernelPickTactic(core::KernelPickFactor factor) { + std::string static_pick_name = "static_kernel_pick_pass"; +#ifdef LITE_WITH_XPU + static_pick_name = "__xpu__static_kernel_pick_pass"; +#endif auto* pass = mir::PassManager::Global().LookUp( - 
"static_kernel_pick_pass"); + static_pick_name); CHECK(pass); *pass->mutable_kernel_pick_factors() = factor; @@ -229,6 +235,9 @@ std::unique_ptr RunDefaultOptimizer( "fpga_concat_fuse_pass", "control_flow_op_unused_inputs_and_outputs_eliminate_pass", "static_kernel_pick_pass", // pick original kernel from graph +#ifdef LITE_WITH_XPU + "__xpu__static_kernel_pick_pass", // xpu pick original kernel from graph +#endif "remove_tf_redundant_ops_pass", "variable_place_inference_pass", // inference arg/var's diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.cc b/lite/kernels/xpu/__xpu__conv2d_compute.cc index cad9a4fd691..fb65dea06c5 100644 --- a/lite/kernels/xpu/__xpu__conv2d_compute.cc +++ b/lite/kernels/xpu/__xpu__conv2d_compute.cc @@ -22,36 +22,12 @@ namespace lite { namespace kernels { namespace xpu { -template -bool QuantFilter(const float* filter_on_host, - T* quant_res, - float max, - int64_t len) { - return false; -} - -template <> -bool QuantFilter(const float* filter_on_host, - int16_t* quant_res, - float max, - int64_t len) { - paddle::lite::xpu::math::ConvertFP32ToInt16( - filter_on_host, quant_res, max, len); - return true; -} - -template <> -bool QuantFilter(const float* filter_on_host, - int8_t* quant_res, - float max, - int64_t len) { - paddle::lite::xpu::math::ConvertFP32ToInt8( - filter_on_host, quant_res, max, len); - return true; -} - -template -void XPUConv2dCompute::PrepareForRun() { +template +void XPUConv2dCompute::PrepareForRun() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int max_ptr_size = ctx.GetRawContext()->max_ptr_size(); @@ -60,12 +36,16 @@ void XPUConv2dCompute::PrepareForRun() { auto filter_dims = param.filter->dims(); xpu_quant_filter_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( filter_ptr, filter_dims, false); } -template -void XPUConv2dCompute::Run() { +template +void XPUConv2dCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -86,8 +66,8 @@ void XPUConv2dCompute::Run() { param.output_max->template mutable_data(TARGET(kXPU)); const auto* bias = param.has_bias ? param.bias->template data() : nullptr; - const float* branch = - param.has_branch ? param.branch->template data() : nullptr; + const DY* branch = + param.has_branch ? param.branch->template data() : nullptr; const float* input_max = param.input_max ? 
param.input_max->template data() : nullptr; xdnn::Activation_t act((xdnn::Activation_t::act_enum)act_type); @@ -101,15 +81,15 @@ void XPUConv2dCompute::Run() { CHECK_EQ(act_type, 0); if (branch_broadcast_guard_.get() == nullptr) { branch_broadcast_guard_ = TargetWrapperXPU::MallocScratchPad( - param.output->numel() * sizeof(float)); + param.output->numel() * sizeof(DY)); } else { - branch_broadcast_guard_->Reserve(param.output->numel() * sizeof(float)); + branch_broadcast_guard_->Reserve(param.output->numel() * sizeof(DY)); } - int r = xdnn::conv2d_fusion( + int r = xdnn::conv2d_fusion( ctx.GetRawContext(), - param.input->template data(), - reinterpret_cast(xpu_quant_filter_.data_ptr_), - reinterpret_cast(branch_broadcast_guard_->addr_), + param.input->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), + reinterpret_cast(branch_broadcast_guard_->addr_), batch, img_c, img_h, @@ -139,21 +119,21 @@ void XPUConv2dCompute::Run() { if (branch_shape > conv_out_shape) { param.output->Resize(lite::DDim(branch_shape)); } - float* output = param.output->template mutable_data(TARGET(kXPU)); - r = xdnn::broadcast_add( + DY* output = param.output->template mutable_data(TARGET(kXPU)); + r = xdnn::broadcast_add( ctx.GetRawContext(), - reinterpret_cast(branch_broadcast_guard_->addr_), + reinterpret_cast(branch_broadcast_guard_->addr_), branch, output, xshape, yshape); CHECK_EQ(r, 0); } else { - float* output = param.output->template mutable_data(TARGET(kXPU)); - int r = xdnn::conv2d_fusion( + DY* output = param.output->template mutable_data(TARGET(kXPU)); + int r = xdnn::conv2d_fusion( ctx.GetRawContext(), - param.input->template data(), - reinterpret_cast(xpu_quant_filter_.data_ptr_), + param.input->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), output, batch, img_c, @@ -182,11 +162,27 @@ void XPUConv2dCompute::Run() { } // namespace paddle namespace xpu = paddle::lite::kernels::xpu; -using XPUConv2dFp32 = xpu::XPUConv2dCompute; -using XPUConv2dInt8 = xpu::XPUConv2dCompute; +using XPUConv2dFP32 = + xpu::XPUConv2dCompute; + +using XPUConv2d_FP16_FP32_FP32 = + xpu::XPUConv2dCompute; + +using XPUConv2dFp16 = + xpu::XPUConv2dCompute; + +using XPUConv2d_FP16_FP16_FP32 = + xpu::XPUConv2dCompute; -REGISTER_LITE_KERNEL(__xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2dFp32, def) +using XPUConv2d_FP16_FP32_FP16 = + xpu::XPUConv2dCompute; + +using XPUConv2dInt8_FP32_FP32 = + xpu::XPUConv2dCompute; + +REGISTER_LITE_KERNEL( + __xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2d_FP16_FP32_FP32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) @@ -196,7 +192,71 @@ REGISTER_LITE_KERNEL(__xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2dFp32, def) .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); -REGISTER_LITE_KERNEL(__xpu__conv2d, kXPU, kInt8, kNCHW, XPUConv2dInt8, def) +REGISTER_LITE_KERNEL( + __xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2dFP32, XPU_Real_kFloat) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Branch", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + 
__xpu__conv2d, kXPU, kFP16, kNCHW, XPUConv2dFp16, XPU_FP16_FP16__FP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Branch", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kFP16, + kNCHW, + XPUConv2d_FP16_FP16_FP32, + XPU_FP16_FP16__FP32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Branch", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kFP16, + kNCHW, + XPUConv2d_FP16_FP32_FP16, + XPU_FP16_FP32__FP16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Branch", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kInt8, + kNCHW, + XPUConv2dInt8_FP32_FP32, + XPU_Int8_FP32_FP32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.h b/lite/kernels/xpu/__xpu__conv2d_compute.h index 69a9aec69c8..c3c31d94743 100644 --- a/lite/kernels/xpu/__xpu__conv2d_compute.h +++ b/lite/kernels/xpu/__xpu__conv2d_compute.h @@ -21,8 +21,11 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -template +template class XPUConv2dCompute : public KernelLite { public: using param_t = operators::XPUBlockFuseParam; diff --git a/lite/kernels/xpu/__xpu__fc_compute.cc b/lite/kernels/xpu/__xpu__fc_compute.cc index c22f69354bf..024e5d6ed9e 100644 --- a/lite/kernels/xpu/__xpu__fc_compute.cc +++ b/lite/kernels/xpu/__xpu__fc_compute.cc @@ -24,10 +24,15 @@ namespace lite { namespace kernels { namespace xpu { -void XPUFcCompute::PrepareForRun() { +template +void XPUFcCompute::PrepareForRun() { auto& ctx = this->ctx_->template As(); auto& param = this->template Param(); - auto w_ptr = param.w->data(); + auto w_ptr = param.w->template data(); auto weight_dims = param.w->dims(); bool quant_int8 = false; if (param.quant_w_max > 0.f) { @@ -55,26 +60,25 @@ void XPUFcCompute::PrepareForRun() { sizeof(float) * max_ptr_size, IoDirection::HtoD); return; - } - - if (param.precision == "int31") { - xpu_quant_weight_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - w_ptr, weight_dims, true); - CHECK(xpu_quant_weight_.max_ptr_ 
== nullptr) - << "int31 weight max should be null"; - } else if (param.precision == "int16") { - xpu_quant_weight_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - w_ptr, weight_dims, true); - } else if (param.precision == "int8") { + } else { xpu_quant_weight_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( w_ptr, weight_dims, true); + if (std::is_same::value) { + VLOG(6) + << "If fc compute precision is int31,must check weight max should " + "be null "; + CHECK(xpu_quant_weight_.max_ptr_ == nullptr) + << "int31 weight max should be null"; + } } } - -void XPUFcCompute::Run() { +template +void XPUFcCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -85,13 +89,15 @@ void XPUFcCompute::Run() { int n = param.w->dims()[1]; bool quant_int8 = param.quant_w_max > 0.f; - float* output_max = quant_int8 - ? nullptr - : param.output_max->mutable_data(TARGET(kXPU)); - const auto* bias = param.has_bias ? param.bias->data() : nullptr; + float* output_max = + quant_int8 ? nullptr + : param.output_max->template mutable_data(TARGET(kXPU)); + const auto* bias = + param.has_bias ? param.bias->template data() : nullptr; const float* input_max = quant_int8 ? reinterpret_cast(input_max_guard_->addr_) - : (param.input_max ? param.input_max->data() : nullptr); + : (param.input_max ? param.input_max->template data() + : nullptr); xdnn::Activation_t act((xdnn::Activation_t::act_enum)param.act_type); if (param.act_type == 5) { act.leaky_alpha = param.act_param; @@ -100,82 +106,28 @@ void XPUFcCompute::Run() { act.hard_sigmoid_slope = param.act_param; } // TODO(weihaoji): remove fc_int31 and fc_int16 after xpu fc wrapper refactor - if (param.precision == "int31") { - int r = xdnn::fc_fusion( - ctx.GetRawContext(), // ctx - param.input->data(), // x - reinterpret_cast(xpu_quant_weight_.data_ptr_), // w - param.output->mutable_data(TARGET(kXPU)), // y - m, // m - n, // n - k, // k - false, // x_trans - true, // w_trans - input_max, // x_maxptr - reinterpret_cast(xpu_quant_weight_.max_ptr_), // w_maxptr - output_max, // y_maxptr - k, // ldx - k, // ldw - n, // ldy - 1.0f, // alpha - 0.0f, // beta - bias, // bias - act); - CHECK_EQ(r, 0); - } else if (param.precision == "int16") { - int r = 0; - r = xdnn::fc_fusion( - ctx.GetRawContext(), // ctx - param.input->data(), // x - reinterpret_cast(xpu_quant_weight_.data_ptr_), // w - param.output->mutable_data(TARGET(kXPU)), // y - m, // m - n, // n - k, // k - false, // x_trans - true, // w_trans - input_max, // x_maxptr - reinterpret_cast(xpu_quant_weight_.max_ptr_), // w_maxptr - output_max, // y_maxptr - k, // ldx - k, // ldw - n, // ldy - 1.0f, // alpha - 0.0f, // beta - bias, // bias - act); // act - - CHECK_EQ(r, 0); - } else if (param.precision == "int8") { - bool x_trans = false; - bool w_trans = true; - int ldx = (x_trans ? m : k); - int ldw = (w_trans ? 
k : n); - int ldy = n; - int r = xdnn::fc_fusion( - ctx.GetRawContext(), /* context */ - param.input->data(), /* x */ - reinterpret_cast(xpu_quant_weight_.data_ptr_), - param.output->mutable_data(TARGET(kXPU)), /* y */ - m, /* m */ - n, /* n */ - k, /* k */ - x_trans, /* x_trans */ - w_trans, /* w_trans */ - input_max, /* x_max */ - reinterpret_cast(xpu_quant_weight_.max_ptr_), /* w_max */ - output_max, /* y_max */ - ldx, /* ldx */ - ldw, /* ldw */ - ldy, /* ldy */ - 1.0f, /* alpha */ - 0.0f, /* beta */ - bias, /* bias */ - act); /* act_type */ - CHECK_EQ(r, 0); - } else { - LOG(FATAL) << "Unsupport XPUFC Precision: " << param.precision; - } + + int r = xdnn::fc_fusion( + ctx.GetRawContext(), // ctx + param.input->template data(), // x + reinterpret_cast(xpu_quant_weight_.data_ptr_), // w + param.output->template mutable_data(TARGET(kXPU)), // y + m, // m + n, // n + k, // k + false, // x_trans + true, // w_trans + input_max, // x_maxptr + reinterpret_cast(xpu_quant_weight_.max_ptr_), // w_maxptr + output_max, // y_maxptr + k, // ldx + k, // ldw + n, // ldy + 1.0f, // alpha + 0.0f, // beta + bias, // bias + act); + CHECK_EQ(r, 0); } } // namespace xpu @@ -183,12 +135,37 @@ void XPUFcCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(__xpu__fc, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::XPUFcCompute, - def) +namespace xpu = paddle::lite::kernels::xpu; + +using XPUFC_FP32 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP32_FP32 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP16_FP16 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP32_FP16 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP16_FP32 = + xpu::XPUFcCompute; + +using XPUFC_Int8_FP32_FP32 = + xpu::XPUFcCompute; + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFloat, kNCHW, XPUFC_FP32, XPU_Real_kFloat) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__fc, kXPU, kFloat, kNCHW, XPUFC_FP16_FP32_FP32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) @@ -196,3 +173,49 @@ REGISTER_LITE_KERNEL(__xpu__fc, .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFP16, kNCHW, XPUFC_FP16_FP16_FP16, XPUFC_FP16_FP16_FP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFP16, kNCHW, XPUFC_FP16_FP32_FP16, XPUFC_FP16_FP32_FP16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", 
{LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFP16, kNCHW, XPUFC_FP16_FP16_FP32, XPUFC_FP16_FP16_FP32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFloat, kNCHW, XPUFC_Int8_FP32_FP32, XPU_Int8_FP32_FP32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__fc_compute.h b/lite/kernels/xpu/__xpu__fc_compute.h index 687f8d5e9c1..6d6dba66faf 100644 --- a/lite/kernels/xpu/__xpu__fc_compute.h +++ b/lite/kernels/xpu/__xpu__fc_compute.h @@ -20,8 +20,12 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -class XPUFcCompute : public KernelLite { +template +class XPUFcCompute : public KernelLite { public: using param_t = operators::XPUFcParam; @@ -32,8 +36,6 @@ class XPUFcCompute : public KernelLite { virtual ~XPUFcCompute() = default; private: - // TODO(weihaoji): remove cpu w_max after xpu fc wrapper refactor - float w_max; XPUScratchPadGuard input_max_guard_; XPUQuantData xpu_quant_weight_; }; diff --git a/lite/kernels/xpu/calib_compute.cc b/lite/kernels/xpu/calib_compute.cc index 34a6fb53d72..dc134fde02a 100644 --- a/lite/kernels/xpu/calib_compute.cc +++ b/lite/kernels/xpu/calib_compute.cc @@ -29,6 +29,9 @@ void CalibCompute::Run() { int numel = param.input->numel(); const auto* in_data = param.input->template data(); auto* out_data = param.output->template mutable_data(TARGET(kXPU)); + if (numel == 0) { + return; + } int r = xdnn::cast_v2( ctx.GetRawContext(), in_data, out_data, numel); CHECK_EQ(r, 0); @@ -43,31 +46,69 @@ using xpu_calib_int64_to_int32 = paddle::lite::kernels::xpu::CalibCompute; using xpu_calib_int32_to_int64 = paddle::lite::kernels::xpu::CalibCompute; +using xpu_calib_fp32_to_fp16 = + paddle::lite::kernels::xpu::CalibCompute; +using xpu_calib_fp16_to_fp32 = + paddle::lite::kernels::xpu::CalibCompute; REGISTER_LITE_KERNEL( - calib, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, int64_to_int32) + calib, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, calib_int64_to_int32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); REGISTER_LITE_KERNEL( - calib, kXPU, kFloat, kNCHW, xpu_calib_int32_to_int64, int32_to_int64) + calib, kXPU, kFloat, kNCHW, xpu_calib_int32_to_int64, calib_int32_to_int64) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); REGISTER_LITE_KERNEL( - calib_once, kXPU, kFloat, 
kNCHW, xpu_calib_int64_to_int32, int64_to_int32) + calib, kXPU, kFloat, kNCHW, xpu_calib_fp32_to_fp16, calib_fp32_to_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib, kXPU, kFloat, kNCHW, xpu_calib_fp16_to_fp32, calib_fp16_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(calib_once, + kXPU, + kFloat, + kNCHW, + xpu_calib_int64_to_int32, + calib_int64_to_int32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); -REGISTER_LITE_KERNEL( - calib_once, kXPU, kFloat, kNCHW, xpu_calib_int32_to_int64, int32_to_int64) +REGISTER_LITE_KERNEL(calib_once, + kXPU, + kFloat, + kNCHW, + xpu_calib_int32_to_int64, + calib_int32_to_int64) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kXPU, kFloat, kNCHW, xpu_calib_fp32_to_fp16, calib_fp32_to_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kXPU, kFloat, kNCHW, xpu_calib_fp16_to_fp32, calib_fp16_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); diff --git a/lite/kernels/xpu/concat_compute.cc b/lite/kernels/xpu/concat_compute.cc index e3fc5ef554d..9eceace16f5 100644 --- a/lite/kernels/xpu/concat_compute.cc +++ b/lite/kernels/xpu/concat_compute.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "lite/kernels/xpu/concat_compute.h" + #include #include + #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -23,8 +25,8 @@ namespace lite { namespace kernels { namespace xpu { -template -void ConcatCompute::Run() { +template +void ConcatCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -34,7 +36,7 @@ void ConcatCompute::Run() { ? 
param.axis + static_cast(ins[0]->dims().size()) : param.axis; - std::vector x_list; + std::vector x_list; std::vector> xdims_list; for (int i = 0; i < ins.size(); i++) { if (ins[i]->numel() > 0) { @@ -46,14 +48,14 @@ void ConcatCompute::Run() { xdims_list[i].back() = xdims_list[i].back() * 2; } x_list.push_back( - reinterpret_cast(ins[i]->template data())); + reinterpret_cast(ins[i]->template data())); } } if (x_list.size() > 1) { - int r = xdnn::concat( + int r = xdnn::concat( ctx.GetRawContext(), x_list, - reinterpret_cast( + reinterpret_cast( out->template mutable_data(TARGET(kXPU))), xdims_list, axis); @@ -75,37 +77,45 @@ void ConcatCompute::Run() { } // namespace kernels } // namespace lite } // namespace paddle - -REGISTER_LITE_KERNEL(concat, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::ConcatCompute, - def) +using concatfp32 = + paddle::lite::kernels::xpu::ConcatCompute; +using concatfp16 = + paddle::lite::kernels::xpu::ConcatCompute; +using concati16 = + paddle::lite::kernels::xpu::ConcatCompute; +using concati32 = + paddle::lite::kernels::xpu::ConcatCompute; +using concati64 = + paddle::lite::kernels::xpu::ConcatCompute; +REGISTER_LITE_KERNEL(concat, kXPU, kFloat, kNCHW, concatfp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .BindInput("AxisTensor", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .Finalize(); -REGISTER_LITE_KERNEL(concat, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::ConcatCompute, - concat_i32) +REGISTER_LITE_KERNEL(concat, kXPU, kFP16, kNCHW, concatfp16, concat_FP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL(concat, kXPU, kInt16, kNCHW, concati16, concat_INT16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt16))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt16))}) + .Finalize(); + +REGISTER_LITE_KERNEL(concat, kXPU, kInt32, kNCHW, concati32, concat_INT32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("AxisTensor", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); -REGISTER_LITE_KERNEL(concat, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::ConcatCompute, - concat_i64) +REGISTER_LITE_KERNEL(concat, kXPU, kInt64, kNCHW, concati64, concat_INT64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindInput("AxisTensor", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) diff --git a/lite/kernels/xpu/concat_compute.h b/lite/kernels/xpu/concat_compute.h index 218c4704557..964f94f8194 100644 --- a/lite/kernels/xpu/concat_compute.h +++ b/lite/kernels/xpu/concat_compute.h @@ -21,8 +21,8 @@ namespace lite { namespace kernels { namespace xpu { -template -class ConcatCompute : public KernelLite { +template +class ConcatCompute : public KernelLite { public: using param_t = operators::ConcatParam; diff --git a/lite/kernels/xpu/conv3d_compute.cc b/lite/kernels/xpu/conv3d_compute.cc index cc3ad389679..8416d964448 100644 --- a/lite/kernels/xpu/conv3d_compute.cc +++ 
b/lite/kernels/xpu/conv3d_compute.cc @@ -22,8 +22,26 @@ namespace lite { namespace kernels { namespace xpu { -template <> -void Conv3DCompute::Run() { +template +void Conv3DCompute::PrepareForRun() { + auto& param = this->template Param(); + auto filter_ptr = param.filter->template data(); + auto filter_dims = param.filter->dims(); + xpu_quant_filter_ = + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + filter_ptr, filter_dims, false); +} + +template +void Conv3DCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -34,11 +52,11 @@ void Conv3DCompute::Run() { auto paddings = *param.paddings; auto dilations = *param.dilations; - int r = xdnn::conv3d( + int r = xdnn::conv3d( ctx.GetRawContext(), /* context */ - param.x->data(), - param.filter->data(), /* weight */ - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), /* weight */ + param.output->template mutable_data(TARGET(kXPU)), x_dims[0], /* input_n */ x_dims[1], /* input_c */ x_dims[2], /* input_d */ @@ -53,7 +71,7 @@ void Conv3DCompute::Run() { dilations, groups, nullptr, - nullptr, + reinterpret_cast(xpu_quant_filter_.max_ptr_), nullptr, true /*is_ncdhw*/); CHECK_EQ(r, 0); @@ -65,11 +83,61 @@ void Conv3DCompute::Run() { } // namespace paddle namespace xpu = paddle::lite::kernels::xpu; -using Conv3dFp32 = xpu::Conv3DCompute; -REGISTER_LITE_KERNEL(conv3d, kXPU, kFloat, kNCHW, Conv3dFp32, def) +using XPUConv3dFP32 = + xpu::Conv3DCompute; + +using XPUConv3d_FP16_FP32_FP32 = + xpu::Conv3DCompute; + +using XPUConv3dFp16 = + xpu::Conv3DCompute; + +using XPUConv3d_FP16_FP16_FP32 = + xpu::Conv3DCompute; + +using XPUConv3d_FP16_FP32_FP16 = + xpu::Conv3DCompute; + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFloat, kNCHW, XPUConv3dFP32, XPU_Real_kFloat) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); + +REGISTER_LITE_KERNEL(conv3d, kXPU, kFloat, kNCHW, XPUConv3d_FP16_FP32_FP32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFP16, kNCHW, XPUConv3dFp16, XPU_FP16_FP16_FP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFP16, kNCHW, XPUConv3d_FP16_FP16_FP32, XPU_FP16_FP16_FP32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFP16, kNCHW, XPUConv3d_FP16_FP32_FP16, XPU_FP16_FP32_FP16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + 
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/xpu/conv3d_compute.h b/lite/kernels/xpu/conv3d_compute.h index caadb82a1e8..4cd5fdaeca7 100644 --- a/lite/kernels/xpu/conv3d_compute.h +++ b/lite/kernels/xpu/conv3d_compute.h @@ -21,14 +21,22 @@ namespace lite { namespace kernels { namespace xpu { -template -class Conv3DCompute : public KernelLite { +template +class Conv3DCompute : public KernelLite { public: using param_t = operators::ConvParam; + void PrepareForRun() override; void Run() override; virtual ~Conv3DCompute() = default; + + private: + XPUQuantData xpu_quant_filter_; }; } // namespace xpu diff --git a/lite/kernels/xpu/gather_compute.cc b/lite/kernels/xpu/gather_compute.cc index f3eafc878fb..697204689d9 100644 --- a/lite/kernels/xpu/gather_compute.cc +++ b/lite/kernels/xpu/gather_compute.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "lite/kernels/xpu/gather_compute.h" + #include + #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -22,8 +24,8 @@ namespace lite { namespace kernels { namespace xpu { -template -void GatherCompute::Run() { +template +void GatherCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -46,88 +48,16 @@ void GatherCompute::Run() { axis += x_dims.size(); } - if (param.X->precision() == PrecisionType::kInt64 && - param.Index->precision() == PrecisionType::kInt64) { - auto* index_int64 = param.Index->template data(); - int size = param.Index->dims().production(); - XPUScratchPadGuard index_xpu_guard_ = - TargetWrapperXPU::MallocScratchPad(size * sizeof(int)); - int* index_int32_device = reinterpret_cast(index_xpu_guard_->addr_); - - int r0 = xdnn::cast_v2( - ctx.GetRawContext(), index_int64, index_int32_device, index->numel()); - CHECK_EQ(r0, 0); + int r = xdnn::gather( + ctx.GetRawContext(), + x->template data(), + index->template data(), + out->template mutable_data(TARGET(kXPU)), + x_dims, + index->numel(), + axis); - int r1 = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index_int32_device, - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r1, 0); - } else if (param.X->precision() == PrecisionType::kInt64 && - param.Index->precision() == PrecisionType::kInt32) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kInt32 && - param.Index->precision() == PrecisionType::kInt32) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kInt32 && - param.Index->precision() == PrecisionType::kInt64) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kFloat && - param.Index->precision() == PrecisionType::kInt32) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - 
} else if (param.X->precision() == PrecisionType::kFloat && - param.Index->precision() == PrecisionType::kInt64) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else { - LOG(FATAL) << "Unsupported gather op with x dtype: " - << lite_api::PrecisionToStr(param.X->precision()) - << " and index dtype: " - << lite_api::PrecisionToStr(param.Index->precision()); - } + CHECK_EQ(r, 0); } } // namespace xpu @@ -141,10 +71,21 @@ REGISTER_LITE_KERNEL(gather, kXPU, kFloat, kNCHW, GatherXPUFloatInt32, def) {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Axis", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .Finalize(); + REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUFloatInt64, gather_float_i64) + gather, kXPU, kFP16, kNCHW, GatherXPUkFP16Int32, gather_FP16_Int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindInput("Axis", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + gather, kXPU, kFloat, kNCHW, GatherXPUFloatInt64, gather_FP32_INT64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) @@ -153,7 +94,7 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt32Int32, gather_i32_i32) + gather, kXPU, kInt32, kNCHW, GatherXPUInt32Int32, gather_INT32_INT32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) @@ -162,7 +103,7 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt32Int64, gather_i32_i64) + gather, kXPU, kInt32, kNCHW, GatherXPUInt32Int64, gather_INT32_INT64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) @@ -171,7 +112,7 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt64Int32, gather_i64_i32) + gather, kXPU, kInt64, kNCHW, GatherXPUInt64Int32, gather_INT64_INT32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) @@ -179,12 +120,3 @@ REGISTER_LITE_KERNEL( {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); -REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt64Int64, gather_i64_i64) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) - .BindInput("Index", - {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) - .BindInput("Axis", - {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), 
PRECISION(kInt64))}) - .Finalize(); diff --git a/lite/kernels/xpu/gather_compute.h b/lite/kernels/xpu/gather_compute.h index a78be677d09..2363e8651ca 100644 --- a/lite/kernels/xpu/gather_compute.h +++ b/lite/kernels/xpu/gather_compute.h @@ -21,8 +21,8 @@ namespace lite { namespace kernels { namespace xpu { -template -class GatherCompute : public KernelLite { +template +class GatherCompute : public KernelLite { public: using param_t = operators::GatherParam; @@ -36,15 +36,27 @@ class GatherCompute : public KernelLite { } // namespace lite } // namespace paddle -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt32Int32; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt32Int64; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUFloatInt32; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute + GatherXPUkFP16Int32; +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUFloatInt64; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt64Int32; -typedef paddle::lite::kernels::xpu::GatherCompute - GatherXPUInt64Int64; diff --git a/lite/kernels/xpu/pool_compute.cc b/lite/kernels/xpu/pool_compute.cc index 9df03bc3c48..8211de7e438 100644 --- a/lite/kernels/xpu/pool_compute.cc +++ b/lite/kernels/xpu/pool_compute.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "lite/kernels/xpu/pool_compute.h" + #include #include + #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -22,8 +24,8 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -void Pool2DCompute::Run() { +template +void Pool2DCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -55,8 +57,8 @@ void Pool2DCompute::Run() { if (param.pooling_type == "avg") { int r = xdnn::adaptive_avg_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), x_dims[0], x_dims[1], x_dims[2], @@ -68,8 +70,8 @@ void Pool2DCompute::Run() { } else { int r = xdnn::adaptive_max_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), nullptr, x_dims[0], x_dims[1], @@ -82,10 +84,10 @@ void Pool2DCompute::Run() { } } else { if (param.pooling_type == "avg") { - int r = xdnn::avg_pool2d( + int r = xdnn::avg_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), x_dims[0], x_dims[1], x_dims[2], @@ -98,10 +100,10 @@ void Pool2DCompute::Run() { CHECK_EQ(r, 0); } else { if (param.pad_zero == true) { - int r = xdnn::max_pool2d( + int r = xdnn::max_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), nullptr, x_dims[0], x_dims[1], @@ -113,7 +115,7 @@ void Pool2DCompute::Run() { true); CHECK_EQ(r, 0); } else { - const float* xpu_x_padded = nullptr; + const InType* xpu_x_padded = nullptr; std::vector xpu_x_padded_dims{static_cast(x_dims[0]), static_cast(x_dims[1]), static_cast(x_dims[2]), @@ -121,7 +123,7 @@ void 
Pool2DCompute::Run() { XPUScratchPadGuard xpu_x_padded_guard_; if (paddings[0] == 0 && paddings[1] == 0 && paddings[2] == 0 && paddings[3] == 0) { - xpu_x_padded = param.x->data(); + xpu_x_padded = param.x->template data(); } else { std::vector pad_left{0, 0, paddings[0], paddings[2]}; std::vector pad_right{0, 0, paddings[1], paddings[3]}; @@ -130,25 +132,25 @@ void Pool2DCompute::Run() { xpu_x_padded_dims[3] = xpu_x_padded_dims[3] + paddings[2] + paddings[3]; xpu_x_padded_guard_ = TargetWrapperXPU::MallocScratchPad( - sizeof(float) * xpu_x_padded_dims[0] * xpu_x_padded_dims[1] * + sizeof(InType) * xpu_x_padded_dims[0] * xpu_x_padded_dims[1] * xpu_x_padded_dims[2] * xpu_x_padded_dims[3]); - xpu_x_padded = reinterpret_cast(xpu_x_padded_guard_->addr_); - int r = xdnn::pad(ctx.GetRawContext(), - param.x->data(), - const_cast(xpu_x_padded), - {static_cast(x_dims[0]), - static_cast(x_dims[1]), - static_cast(x_dims[2]), - static_cast(x_dims[3])}, - pad_left, - pad_right, - -9999999.0f); + xpu_x_padded = reinterpret_cast(xpu_x_padded_guard_->addr_); + int r = xdnn::pad(ctx.GetRawContext(), + param.x->template data(), + const_cast(xpu_x_padded), + {static_cast(x_dims[0]), + static_cast(x_dims[1]), + static_cast(x_dims[2]), + static_cast(x_dims[3])}, + pad_left, + pad_right, + -9999999.0f); CHECK_EQ(r, 0); } - int r = xdnn::max_pool2d( + int r = xdnn::max_pool2d( ctx.GetRawContext(), xpu_x_padded, - param.output->mutable_data(TARGET(kXPU)), + param.output->template mutable_data(TARGET(kXPU)), nullptr, xpu_x_padded_dims[0], xpu_x_padded_dims[1], @@ -168,19 +170,29 @@ void Pool2DCompute::Run() { } // namespace kernels } // namespace lite } // namespace paddle +// (TODO:quwei) refactor pool2d + +using pool2d_fp32 = + paddle::lite::kernels::xpu::Pool2DCompute; +using pool2d_fp16 = + paddle::lite::kernels::xpu::Pool2DCompute; + +using max_pool2d_with_index_fp32 = + paddle::lite::kernels::xpu::Pool2DCompute; + +REGISTER_LITE_KERNEL(pool2d, kXPU, kFloat, kNCHW, pool2d_fp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); REGISTER_LITE_KERNEL( - pool2d, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Pool2DCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + pool2d, kXPU, kFP16, kNCHW, pool2d_fp16, DISABLE_XPU1_pool2d_FP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) .Finalize(); -REGISTER_LITE_KERNEL(max_pool2d_with_index, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::Pool2DCompute, - def) +REGISTER_LITE_KERNEL( + max_pool2d_with_index, kXPU, kFloat, kNCHW, max_pool2d_with_index_fp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))}) diff --git a/lite/kernels/xpu/pool_compute.h b/lite/kernels/xpu/pool_compute.h index 39e14f04a8c..c107b2877b1 100644 --- a/lite/kernels/xpu/pool_compute.h +++ b/lite/kernels/xpu/pool_compute.h @@ -20,8 +20,8 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -class Pool2DCompute : public KernelLite { +template +class Pool2DCompute : public KernelLite { public: using param_t = operators::PoolParam; diff --git a/lite/operators/__xpu__fc_op.cc b/lite/operators/__xpu__fc_op.cc index 21f6faebcb5..a15d6c4fa88 
100644 --- a/lite/operators/__xpu__fc_op.cc +++ b/lite/operators/__xpu__fc_op.cc @@ -107,12 +107,8 @@ bool XPUFcOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { param_.input_max = scope->FindVar(op_desc.Input("InputMax").front())->GetMutable<lite::Tensor>(); } - if (op_desc.HasAttr("precision")) { - param_.precision = op_desc.GetAttr<std::string>("precision"); - } + if (op_desc.HasAttr("enable_int8") && op_desc.GetAttr<bool>("enable_int8")) { - CHECK(param_.precision == "int8") << "enable_int8 precison:" - << param_.precision; param_.quant_input_max = 127 * op_desc.GetAttr<std::vector<float>>("X0_scale")[0]; param_.quant_w_max =
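The hunk above drops the string "precision" attribute in favor of the standard enable_int8 flag and derives the quantized input's max directly from the first X0_scale entry. A minimal, self-contained sketch of why 127 * scale can serve as that max under the common symmetric int8 convention; the exact XPU quantization semantics are not shown in this diff, and QuantizeInt8 plus the sample values are illustrative only:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Symmetric per-tensor int8 quantization: q = round(x / scale), clamped to
// [-127, 127]. The largest representable float value is therefore 127 * scale.
std::vector<int8_t> QuantizeInt8(const std::vector<float>& x, float scale) {
  std::vector<int8_t> q(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    float v = std::round(x[i] / scale);
    q[i] = static_cast<int8_t>(std::max(-127.0f, std::min(127.0f, v)));
  }
  return q;
}

int main() {
  std::vector<float> x = {0.5f, -1.25f, 2.0f};
  float max_abs = 2.0f;                    // largest magnitude in the tensor
  float scale = max_abs / 127.0f;          // per-tensor scale (one quant step)
  float quant_input_max = 127.0f * scale;  // == max_abs, the "max" a kernel needs
  auto q = QuantizeInt8(x, scale);
  std::cout << quant_input_max << " : ";
  for (int v : q) std::cout << v << " ";   // prints: 32 -79 127
  std::cout << "\n";
}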
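Taken together, the kernel changes in this patch follow one pattern: each compute class (__xpu__fc, calib, concat, conv3d, gather, pool2d) becomes a class template parameterized on its element/precision types, a using alias is declared per supported combination, and every alias is registered under its own name so the static kernel pick pass can choose between the FP32 and FP16 variants at graph-optimization time. A minimal, self-contained sketch of that alias-plus-registration idea; KernelBase, Registry, RegisterKernel, and the use of uint16_t as an fp16 stand-in are illustrative assumptions, not PaddleLite APIs:

#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct KernelBase {
  virtual void Run() = 0;
  virtual ~KernelBase() = default;
};

// Element types stand in for the diff's template parameters on the compute class.
template <typename InType, typename OutType>
struct ConcatKernel : KernelBase {
  void Run() override {
    std::cout << "concat: " << sizeof(InType) << "-byte in, "
              << sizeof(OutType) << "-byte out\n";
  }
};

// Registry maps "op/alias" to a factory, roughly what the REGISTER_LITE_KERNEL
// macros accomplish through static registration in the real code base.
using Factory = std::function<std::unique_ptr<KernelBase>()>;
std::map<std::string, Factory>& Registry() {
  static std::map<std::string, Factory> r;
  return r;
}

template <typename Kernel>
void RegisterKernel(const std::string& op, const std::string& alias) {
  Registry()[op + "/" + alias] = [] { return std::make_unique<Kernel>(); };
}

// Aliases mirror the diff's `using concatfp32 = ...;` / `using concatfp16 = ...;`
// lines; uint16_t is only a stand-in for an fp16 storage type here.
using ConcatFP32 = ConcatKernel<float, float>;
using ConcatFP16 = ConcatKernel<uint16_t, uint16_t>;

int main() {
  RegisterKernel<ConcatFP32>("concat", "def");
  RegisterKernel<ConcatFP16>("concat", "concat_FP16");
  // A kernel pick pass would select the alias whose precisions match the graph.
  Registry()["concat/concat_FP16"]()->Run();
  Registry()["concat/def"]()->Run();
  return 0;
}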
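One detail worth calling out from the pool2d hunk: when pad_zero is false, the kernel first pads the input with -9999999.0f and only then calls max_pool2d with no padding of its own, so the filler cells can never win the max reduction. A small stand-alone sketch of that trick; PadBottomRight and MaxPool2x2 are illustrative helpers, not xdnn calls:

#include <algorithm>
#include <iostream>
#include <vector>

// Pad an h x w single-channel map with `value` on the bottom/right only,
// mirroring how extra rows/columns are appended before pooling.
std::vector<float> PadBottomRight(const std::vector<float>& x, int h, int w,
                                  int bottom, int right, float value) {
  int ph = h + bottom, pw = w + right;
  std::vector<float> y(static_cast<size_t>(ph) * pw, value);
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j) y[i * pw + j] = x[i * w + j];
  return y;
}

// 2x2 max pooling, stride 2, no padding of its own.
std::vector<float> MaxPool2x2(const std::vector<float>& x, int h, int w) {
  std::vector<float> y((h / 2) * (w / 2));
  for (int i = 0; i + 1 < h; i += 2)
    for (int j = 0; j + 1 < w; j += 2) {
      float m = std::max(std::max(x[i * w + j], x[i * w + j + 1]),
                         std::max(x[(i + 1) * w + j], x[(i + 1) * w + j + 1]));
      y[(i / 2) * (w / 2) + (j / 2)] = m;
    }
  return y;
}

int main() {
  std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8, 9};  // 3 x 3 map
  // Pad to 4 x 4 so every element is covered by a 2x2 window; the very
  // negative filler is never selected by the max.
  auto padded = PadBottomRight(x, 3, 3, /*bottom=*/1, /*right=*/1, -9999999.0f);
  auto out = MaxPool2x2(padded, 4, 4);
  for (float v : out) std::cout << v << " ";  // prints: 5 6 8 9
  std::cout << "\n";
}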