diff --git a/cmake/backends/xpu.cmake b/cmake/backends/xpu.cmake index 3dd9acb7f3d..8ef1de035e0 100644 --- a/cmake/backends/xpu.cmake +++ b/cmake/backends/xpu.cmake @@ -23,7 +23,7 @@ set (XPU_DOWNLOAD_DIR ${XPU_SOURCE_DIR}/download) set (XPU_INSTALL_DIR ${THIRD_PARTY_PATH}/install) if (NOT XPU_SDK_URL) - set (XPU_SDK_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20220519") + set (XPU_SDK_URL "https://baidu-kunlun-product.su.bcebos.com/klx-sdk/search/20220825") endif () if (NOT XPU_SDK_ENV) diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index e102917fb03..bc3a8bb185c 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -626,8 +626,12 @@ void CxxConfig::set_xpu_multi_encoder_method(const std::string &precision, void CxxConfig::set_xpu_conv_autotune(bool autotune, const std::string &autotune_file) { #ifdef LITE_WITH_XPU - lite::TargetWrapperXPU::conv_autotune = autotune; - lite::TargetWrapperXPU::conv_autotune_file = autotune_file; + LOG(WARNING) + << "This function " + "'set_xpu_conv_autotune' is deprecated, " + "if you want to use autotune, please refer to " + "http://agroup.baidu.com/share/md/f9233d84df11452488a1fdd4f859647f"; + #else LOG(WARNING) << "The invoking of the function " "'set_xpu_conv_autotune' is ignored, please " diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 077a812579d..1af29e74d21 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -472,6 +472,8 @@ class LITE_API CxxConfig : public ConfigBase { void set_xpu_gm_workspace_method(size_t gm_size); + // **DEPRECATED**, use environ variable to enable autotune + // check http://agroup.baidu.com/share/md/f9233d84df11452488a1fdd4f859647f void set_xpu_conv_autotune(bool autotune = true, const std::string& autotune_file = ""); diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index fef39c7c570..bc8687a767d 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -107,6 +107,7 @@ USE_MIR_PASS(__xpu__bigru_fuse_pass); USE_MIR_PASS(__xpu__dynamic_lstm_fuse_pass); USE_MIR_PASS(__xpu__multi_softmax_fuse_pass); USE_MIR_PASS(__xpu__max_pooling_pad_zero_detect_fuse_pass); +USE_MIR_PASS(__xpu__static_kernel_pick_pass); USE_MIR_PASS(x86_int8_attribute_pass); USE_MIR_PASS(fill_range_fuse_pass); USE_MIR_PASS(range_calc_offline_pass); diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc index 3cf33ea02f6..29e26e392a7 100644 --- a/lite/backends/xpu/target_wrapper.cc +++ b/lite/backends/xpu/target_wrapper.cc @@ -43,9 +43,13 @@ void TargetWrapperXPU::MemcpySync(void* dst, template XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose) { + const Tcpu* cpu_data, + const DDimLite& dims, + bool data_transpose, + size_t max_ptr_len) { CHECK(quantizer_.get()); - return quantizer_->quant(cpu_data, dims, data_transpose); + return quantizer_->quant( + cpu_data, dims, data_transpose, max_ptr_len); } void TargetWrapperXPU::ScatterL3Cache( @@ -145,16 +149,16 @@ void TargetWrapperXPU::FreeL3Cache() { template XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - const float*, const DDimLite&, bool); + const float*, const DDimLite&, bool, size_t); template XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - const float*, const DDimLite&, bool); + const float*, const DDimLite&, bool, size_t); template XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - const float*, const DDimLite&, 
bool); + const float*, const DDimLite&, bool, size_t); template XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - const int8_t*, const DDimLite&, bool); + const int8_t*, const DDimLite&, bool, size_t); // xpu context LITE_THREAD_LOCAL std::shared_ptr TargetWrapperXPU::tls_raw_ctx_{ @@ -165,9 +169,6 @@ LITE_THREAD_LOCAL std::shared_ptr TargetWrapperXPU::xpu_stream_{nullptr}; LITE_THREAD_LOCAL std::string TargetWrapperXPU::multi_encoder_precision; // NOLINT LITE_THREAD_LOCAL bool TargetWrapperXPU::multi_encoder_adaptive_seqlen{false}; -// conv autotune config -LITE_THREAD_LOCAL bool TargetWrapperXPU::conv_autotune{false}; -LITE_THREAD_LOCAL std::string TargetWrapperXPU::conv_autotune_file; // l3 cache config LITE_THREAD_LOCAL bool TargetWrapperXPU::need_l3_mutex{false}; LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_l3_size{ diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h index ffc45305682..f02a55e0e2b 100644 --- a/lite/backends/xpu/target_wrapper.h +++ b/lite/backends/xpu/target_wrapper.h @@ -69,7 +69,8 @@ class TargetWrapper { template static XPUQuantData ConvertCPUWeightToXPUQuantWeight(const Tcpu* cpu_data, const DDimLite& dims, - bool data_transpose); + bool data_transpose, + size_t max_ptr_len); static xdnn::Context* GetRawContext() { if (tls_raw_ctx_.get() == nullptr) { @@ -111,14 +112,6 @@ class TargetWrapper { quantizer_.reset(new XPUQuantizer()); } CHECK(quantizer_.get()); - if (conv_autotune) { - tls_raw_ctx_->_xpu1_conv_selector.set_autotune_loop(true); - tls_raw_ctx_->_xpu1_conv_selector.set_inference_mode(true); - } - if (!conv_autotune_file.empty()) { - tls_raw_ctx_->_xpu1_conv_selector.set_autotune_file( - conv_autotune_file.c_str()); - } int devid = -1; uint64_t max_l3_size = 0; XPU_CALL(xpu_current_device(&devid)); @@ -173,9 +166,6 @@ class TargetWrapper { // multi encoder config static LITE_THREAD_LOCAL std::string multi_encoder_precision; // NOLINT static LITE_THREAD_LOCAL bool multi_encoder_adaptive_seqlen; - // conv autotune config - static LITE_THREAD_LOCAL bool conv_autotune; - static LITE_THREAD_LOCAL std::string conv_autotune_file; // NOLINT // l3 cache config static LITE_THREAD_LOCAL bool need_l3_mutex; // model level l3 size static LITE_THREAD_LOCAL size_t local_l3_size; // model level l3 size diff --git a/lite/backends/xpu/xpu_quantizer.cc b/lite/backends/xpu/xpu_quantizer.cc index dd1c24a3869..5e921cf0458 100644 --- a/lite/backends/xpu/xpu_quantizer.cc +++ b/lite/backends/xpu/xpu_quantizer.cc @@ -112,7 +112,8 @@ template < void XPUQuantizer::ConvertWithQuant(const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose, - size_t hashed_key) { + size_t hashed_key, + size_t max_ptr_len) { LOG(FATAL) << "Not support for Tcpu is " << CppTypeToString(); } @@ -123,7 +124,8 @@ template < void XPUQuantizer::ConvertWithQuant(const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose, - size_t hashed_key) { + size_t hashed_key, + size_t max_ptr_len) { // transpose const Tcpu* cpu_ptr = nullptr; int numel = dims.production(); @@ -140,7 +142,7 @@ void XPUQuantizer::ConvertWithQuant(const Tcpu* cpu_data, XPUScratchPadGuard weight_max_guard; XPUScratchPadGuard quant_weight_guard; float max_val = paddle::lite::xpu::math::FindMaxAbs(cpu_ptr, numel); - int max_ptr_size = XPUMemory::get_max_ptr_size(); + size_t max_ptr_size = max_ptr_len; std::vector max_vec(max_ptr_size, max_val); weight_max_guard = std::move(XPUMemory::MallocScratchPad(max_ptr_size * sizeof(float))); @@ -162,11 +164,12 @@ template void 
XPUQuantizer::ConvertWithoutQuant(const T* cpu_data, const DDimLite& dims, bool data_transpose, - size_t hashed_key) { + size_t hashed_key, + size_t max_ptr_len) { // transpose const T* cpu_ptr = nullptr; int numel = dims.production(); - int max_ptr_size = XPUMemory::get_max_ptr_size(); + size_t max_ptr_size = max_ptr_len; std::vector transpose_data(numel, 0); if (data_transpose) { CHECK(dims.size() == 2) << "Not support: dims.size = " << dims.size(); @@ -178,8 +181,9 @@ void XPUQuantizer::ConvertWithoutQuant(const T* cpu_data, } // copy to XPU XPUScratchPadGuard weight_max_guard(new XPUScratchPad(nullptr, 0)); - if (std::is_same::value) { + if (std::is_same::value || std::is_same::value) { // prepare max_w space for slim int8 quant + // just allocate buffer, set max value in kernel weight_max_guard = std::move(XPUMemory::MallocScratchPad(max_ptr_size * sizeof(float))); } @@ -196,7 +200,8 @@ void XPUQuantizer::ConvertWithoutQuant(const T* cpu_data, template XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data, const DDimLite& dims, - bool data_transpose) { + bool data_transpose, + size_t max_ptr_len) { int numel = dims.production(); const std::string cpu_dtype = CppTypeToString(); const std::string xpu_dtype = CppTypeToString(); @@ -206,7 +211,8 @@ XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data, << ", precision=" << precision << ", transpose=" << data_transpose << ", hashed_key=" << hashed_key; if (weight_cache_.find(hashed_key) == weight_cache_.end()) { - ConvertWrapper(cpu_data, dims, data_transpose, hashed_key); + ConvertWrapper( + cpu_data, dims, data_transpose, hashed_key, max_ptr_len); } float* max_ptr = @@ -218,15 +224,19 @@ XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data, template XPUQuantData XPUQuantizer::quant(const float*, const DDimLite&, - bool); + bool, + size_t); template XPUQuantData XPUQuantizer::quant(const float*, const DDimLite&, - bool); + bool, + size_t); template XPUQuantData XPUQuantizer::quant(const float*, const DDimLite&, - bool); + bool, + size_t); template XPUQuantData XPUQuantizer::quant(const int8_t*, const DDimLite&, - bool); + bool, + size_t); } // namespace lite } // namespace paddle diff --git a/lite/backends/xpu/xpu_quantizer.h b/lite/backends/xpu/xpu_quantizer.h index e34a2dbec1d..1f8e21ca390 100644 --- a/lite/backends/xpu/xpu_quantizer.h +++ b/lite/backends/xpu/xpu_quantizer.h @@ -36,7 +36,8 @@ class XPUQuantizer { template XPUQuantData quant(const Tcpu* cpu_data, const DDimLite& dims, - bool data_transpose); + bool data_transpose, + size_t max_ptr_len); private: template @@ -49,7 +50,8 @@ class XPUQuantizer { void ConvertWithoutQuant(const T* cpu_data, const DDimLite& dims, bool data_transpose, - size_t hashed_key); + size_t hashed_key, + size_t max_ptr_len); template (cpu_data, dims, data_transpose, hashed_key); + size_t hashed_key, + size_t max_ptr_len) { + ConvertWithQuant( + cpu_data, dims, data_transpose, hashed_key, max_ptr_len); } template (cpu_data, dims, data_transpose, hashed_key); + size_t hashed_key, + size_t max_ptr_len) { + ConvertWithoutQuant( + cpu_data, dims, data_transpose, hashed_key, max_ptr_len); } // cpu data to xpu quant data diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc new file mode 100644 index 00000000000..d55b9aad45c --- /dev/null +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc @@ -0,0 +1,742 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h" +#include +#include +#include +#include +#include +#include +#include +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif +#include "lite/core/optimizer/mir/graph_visualize_pass.h" +#include "lite/core/optimizer/mir/pass_registry.h" +namespace paddle { +namespace lite { +namespace mir { + +bool XPUKernelScoreCmp(const std::pair>& a, + const std::pair>& b) { + return a.first > b.first; +} + +void XPUStaticKernelPickPass::Apply(const std::unique_ptr& graph) { + kernel_pick_factors_.ConsiderTarget(); + kernel_pick_factors_.ConsiderPrecision(); + kernel_pick_factors_.ConsiderDataLayout(); + CHECK(kernel_pick_factors_.any_factor_considered()) + << "kernel_pick_factors should be specified first"; + CHECK(graph) << "graph not valid"; + +// Collect input data precision for each node in the graph +#ifdef LITE_WITH_XPU + DicideUseFP16Optimizer(graph); + GetXPUDeviceType(); + if (xpu_use_fp16_optimizer_) { + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + if (xpu_special_op_.count(node->AsStmt().op_type())) { + SpecialNodeInputPrecision(node); + continue; + } + + if (xpu_inplace_op_.count(node->AsStmt().op_type())) { + continue; + } + + NodeInputPrecision(node, graph); + } + + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + if (xpu_inplace_op_.count(node->AsStmt().op_type()) == 0) { + continue; + } + + InplaceNodeInputPrecision(node); + } + } +#endif + + // sort kernels by the factors. 
+ VLOG(2) << "graph block_idx: " << graph->blockIdx(); + VLOG(2) << "graph->mutable_nodes().size(): " << graph->mutable_nodes().size(); + size_t idx = 0; + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + auto& instruct = node->AsStmt(); + VLOG(2) << "pick kernel for op : " << instruct.op_type() << ", in block " + << graph->blockIdx() << ", idx : " << idx++; + + std::map in_types; + std::map out_types; + // threse precision info store in __model__ file, if selected fp16 kernel, + // the output precision should be changed + for (std::list::iterator i = node->inlinks.begin(); + i != node->inlinks.end(); + ++i) { + if ((*i)->arg()->type) + in_types[(*i)->arg()->name] = (*i)->arg()->type->precision(); + } + for (std::list::iterator i = node->outlinks.begin(); + i != node->outlinks.end(); + ++i) { + if ((*i)->arg()->type) + out_types[(*i)->arg()->name] = (*i)->arg()->type->precision(); + } + // Get candidate kernels + std::vector>> scored; + CHECK(!instruct.kernels().empty()) << "No kernels found for " + << instruct.op_type(); + + VLOG(2) << "candidate kernels size:" << instruct.kernels().size(); + + for (auto&& kernel : instruct.kernels()) { + VLOG(2) << "current candidate kernel is: " << kernel->summary(); + VLOG(2) << "valid_places size is: " << graph->valid_places().size(); + + float score = KernelGrade(node, + *kernel, + graph->valid_places(), + in_types, + out_types, + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); + + scored.emplace_back(score, std::move(kernel)); + } + std::stable_sort(scored.begin(), scored.end(), XPUKernelScoreCmp); + instruct.kernels().clear(); + + if (!instruct.op_info()->HasAttr("enable_int8")) { +#ifdef LITE_WITH_XPU + if (xpu_use_fp16_optimizer_) { + if (xpu_special_op_.count(node->AsStmt().op_type())) { + SpecialNodeOutputPrecision(graph, node, scored.front().second); + } else if (xpu_inplace_op_.count(node->AsStmt().op_type())) { + InplaceNodeOutputPrecision(node->AsStmt(), + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); + } else { + NodeOutputPrecision(graph, node); + } + } +#endif + + instruct.kernels().emplace_back(std::move(scored.front().second)); + VLOG(2) << "the final pick kernel is " + << instruct.kernels().front()->summary() << "\n\n"; + } else { + // TODO(quwei): consider XPU int8 data precision + bool out_type_int8 = true; + // Quantized lstm has fp32 output + if (instruct.op_type() == "lstm" || instruct.op_type() == "gru" || + instruct.op_type() == "__xpu__multi_encoder" || + instruct.op_type() == "__xpu__fc") { + out_type_int8 = false; + } + // Only if all ops linked to this op output has enable_int8 attr, + // then the op output type is int8, or fp32. + // Note, the quantized op linked to lstm and gru should output fp32 + // tensor. + for (auto* out_n : node->outlinks) { + CHECK(out_n->IsArg()); + for (auto* tmp_op : out_n->outlinks) { + CHECK(tmp_op->IsStmt()); + auto* tmp_op_info = tmp_op->AsStmt().op_info(); + if (!tmp_op_info->HasAttr("enable_int8") || + tmp_op_info->Type() == "lstm" || tmp_op_info->Type() == "gru" || + instruct.op_type() == "__xpu__multi_encoder" || + instruct.op_type() == "__xpu__fc") { + out_type_int8 = false; + break; + } + } + if (!out_type_int8) break; + } + // If the out_type_int8 is true, it turns out that the output type of + // this + // op can be int8. + // So we need to specify output scale for this op. 
+ if (out_type_int8) { + auto out_node = node->outlinks.front(); + CHECK(out_node->IsArg()); + auto out_node_name = out_node->arg()->name; + auto one_adj_op_node = out_node->outlinks.front(); + CHECK(one_adj_op_node->IsStmt()); + auto& one_adj_instruct = one_adj_op_node->AsStmt(); + CHECK(one_adj_instruct.op_info()->HasAttr("enable_int8")); + CHECK(one_adj_instruct.op_info()->HasInputScale(out_node_name)); + + instruct.mutable_op_info()->SetOutputScale( + out_node_name, + one_adj_instruct.op_info()->GetInputScale(out_node_name)); + + auto update_desc = *instruct.mutable_op_info(); + instruct.ResetOp(update_desc, graph->valid_places()); + scored.clear(); + for (auto&& kernel : instruct.kernels()) { + float score = KernelGrade(node, + *kernel, + graph->valid_places(), + in_types, + out_types, + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); + scored.emplace_back(score, std::move(kernel)); + } + std::stable_sort(scored.begin(), scored.end(), XPUKernelScoreCmp); + instruct.kernels().clear(); + } + // If the out_type_int8 is true, we should pick the kernel with the + // int8 input and int8 output. + // If the out_type_int8 is false, we should pick the kernel with the + // int8 input and fp32 output. + auto output_arguments = instruct.op_info()->OutputArgumentNames(); + for (auto& candidate : scored) { + bool all_output_type_match = true; + auto expect_output_type = + out_type_int8 ? PRECISION(kInt8) : PRECISION(kFloat); + + for (auto& arg_name : output_arguments) { + const Type* out_arg_ty = + candidate.second->GetOutputDeclType(arg_name); + if (out_arg_ty->precision() != expect_output_type) { + all_output_type_match = false; + } + } + + if (all_output_type_match) { + instruct.kernels().emplace_back(std::move(candidate.second)); + VLOG(2) << "instruct.kernels.emplace_back " + << instruct.kernels().front()->name(); + break; + } + } + CHECK(!instruct.kernels().empty()) << "No kernels found for " + << instruct.op_type(); + } + } +} + +#ifdef LITE_WITH_XPU +void XPUStaticKernelPickPass::DicideUseFP16Optimizer( + const std::unique_ptr& graph) { + if (GetStringFromEnv("XPUForceUseFP16", "false") == "true") { + xpu_use_fp16_optimizer_ = false; + VLOG(2) << "XPU force use data precision: FP16 "; + return; + } + + if (graph->valid_places()[0].precision == PrecisionType::kFP16) { + xpu_use_fp16_optimizer_ = true; + VLOG(2) << "XPU auto use data precision: FP16/FP32/INT16 "; + } +} + +void XPUStaticKernelPickPass::ForceUseFP32Kernel( + size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct) { + if (kernel.place().target != TARGET(kXPU)) { + return; + } + + // only use in FC,it will not use in future. 
+ if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || + lite::TargetWrapperXPU::multi_encoder_precision == "int31") { + if (kernel.alias() == "XPU_Real_kFloat" && + instruct.op_type() == "__xpu__fc") { + *score *= 2; + VLOG(6) << "__xpu__fc: force use PRECISON INT31: *2"; + } + return; + } + + if (GetStringFromEnv("XPU_COMPUTE_PRECISION", "int16") == "int31") { + if (kernel.alias() == "XPU_Real_kFloat" && + PRECISION_INT31_OP_.count(instruct.op_type())) { + *score *= 2; + VLOG(6) << instruct.op_type() << ": force use PRECISON INT31: *2"; + } + return; + } + + if (kernel.alias() == "XPU_Real_kFloat") { + *score = 0; + VLOG(6) << "By default,XPU not use PRECISION INT31, so not pick " + "current kernel: " + << kernel.summary(); + } +} + +void XPUStaticKernelPickPass::ForceUseInt8Kernel( + size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct) { + if (kernel.place().target != TARGET(kXPU)) { + return; + } + + // only use in FC,it will not use in future. + if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int8" || + lite::TargetWrapperXPU::multi_encoder_precision == "int8") { + if (kernel.alias() == "XPU_Int8_FP32_FP32" && + instruct.op_type() == "__xpu__fc") { + *score *= 2; + VLOG(6) << "__xpu__fc: force use PRECISON INT8: *2"; + } + return; + } + + if (GetStringFromEnv("XPU_COMPUTE_PRECISION", "int16") == "int8") { + if (kernel.alias() == "XPU_Int8_FP32_FP32" && + PRECISION_INT8_OP_.count(instruct.op_type())) { + *score *= 2; + VLOG(6) << instruct.op_type() << ": force use PRECISON INT8: *2"; + } + return; + } + + if (kernel.alias() == "XPU_Int8_FP32_FP32") { + *score = 0; + VLOG(6) << "By default,XPU not use PRECISION INT8, so not pick " + "current kernel: " + << kernel.summary(); + } +} + +void XPUStaticKernelPickPass::GetScore(PrecisionType precision, + size_t* score_tmp) { + if (precision == PrecisionType::kInt16) { + *score_tmp = *score_tmp > 9 ? *score_tmp : 9; + } else if (precision == PrecisionType::kFP16) { + *score_tmp = *score_tmp > 7 ? *score_tmp : 7; + } else if (precision == PrecisionType::kAny) { + *score_tmp = *score_tmp > 1 ? *score_tmp : 1; + } else { + *score_tmp = *score_tmp > 5 ? 
*score_tmp : 5; + } +} + +void XPUStaticKernelPickPass::NodeOutputPrecision( + const std::unique_ptr& graph, lite::mir::Node* node) { + auto& inst = node->AsStmt(); + if (inst.op_type() == "fetch") { + return; + } + + const auto* op_info = inst.op_info(); + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument,current var name : " << var_name; + VLOG(6) << " output arg name:" << arg_name << " var name:" << var_name; + Scope* scope = node->AsStmt().op()->scope(); + auto* var_ptr = scope->FindVar(var_name); + if (var_ptr == nullptr) { + VLOG(6) << "Can't find ouput var_name: " << var_name + << "in current scope."; + continue; + } + + PrecisionType precison = var_ptr->GetMutable()->precision(); + xpu_output_type_.emplace(var_name, precison); + } +} + +void XPUStaticKernelPickPass::SpecialNodeOutputPrecision( + const std::unique_ptr& graph, + lite::mir::Node* node, + const std::unique_ptr& kernel) { + auto& inst = node->AsStmt(); + + std::vector out_var_names; + const auto* op_info = inst.op_info(); + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument, current var name : " << var_name; + VLOG(6) << " output arg name:" << arg_name << " var name:" << var_name; + if (output_parameter_name_.count(arg_name) == 0) { + continue; + } + + const auto* decl_type = kernel->GetOutputDeclType(arg_name); + CHECK(decl_type); + PrecisionType precison = decl_type->precision(); + xpu_output_type_.emplace(var_name, precison); + } +} + +void XPUStaticKernelPickPass::InplaceNodeOutputPrecision( + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names) { + PrecisionType pre_op_output_precision = PrecisionType::kUnk; + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + VLOG(6) << "current kernel input data variable name:" << in_names[i] + << "Parameter name:" << tmp; + if (input_parameter_name_.count(tmp) && + xpu_output_type_.count(in_names[i])) { + pre_op_output_precision = xpu_output_type_[in_names[i]]; + } + } + + // collect inplace op output data precision + if (pre_op_output_precision != PrecisionType::kUnk) { + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + if (output_parameter_name_.count(tmp)) { + xpu_output_type_.emplace(out_names[i], pre_op_output_precision); + } + } + } +} + +// Special nodes like conv2d, matmul ; collect input data precision for eatch +// registry kernel as a candidate set. 
+void XPUStaticKernelPickPass::SpecialNodeInputPrecision(lite::mir::Node* node) { + auto& inst = node->AsStmt(); + const auto* op_info = inst.op_info(); + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument,current var name : " << var_name; + VLOG(6) << " input arg name:" << arg_name << " var name:" << var_name; + if (input_parameter_name_.count(arg_name) == 0) { + continue; + } + + std::vector> kernel_input_type{}; + for (auto&& kernel : inst.kernels()) { + if (kernel->summary().find(xpu_disable_flag_) != std::string::npos) { + VLOG(6) << " ignore collect current kernel:" << kernel->summary(); + continue; + } + + std::map tmp_map; + PrecisionType precison; + + const auto* decl_type = kernel->GetInputDeclType(arg_name); + CHECK(decl_type); + precison = decl_type->precision(); + tmp_map.emplace(kernel->summary(), precison); + kernel_input_type.emplace_back(std::move(tmp_map)); + } + + xpu_input_type_.emplace(var_name, kernel_input_type); + } +} + +void XPUStaticKernelPickPass::NodeInputPrecision( + lite::mir::Node* node, const std::unique_ptr& graph) { + auto& inst = node->AsStmt(); + if (inst.op_type() == "feed") { + return; + } + + const auto* op_info = inst.op_info(); + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument,current var name : " << var_name; + VLOG(6) << " input arg name:" << arg_name << " var name:" << var_name; + + std::vector> kernel_input_type{}; + std::map tmp_map; + PrecisionType precison; + Scope* scope = node->AsStmt().op()->scope(); + + auto* var_ptr = scope->FindVar(var_name); + if (var_ptr == nullptr) { + VLOG(6) << "Can't find input var_name: " << var_name + << "in current scope."; + continue; + } + + precison = var_ptr->GetMutable()->precision(); + tmp_map.emplace(inst.op_type(), precison); + kernel_input_type.emplace_back(std::move(tmp_map)); + xpu_input_type_.emplace(var_name, kernel_input_type); + } +} + +// Special for inplace op. +void XPUStaticKernelPickPass::InplaceNodeInputPrecision(lite::mir::Node* node) { + auto& inst = node->AsStmt(); + const auto* op_info = inst.op_info(); + // inplace op only has one inpute variable. 
+  std::string inplace_op_input_name{"none"};
+  for (auto* in_node : node->inlinks) {
+    auto& var = in_node->AsArg();
+    const auto& var_name = var.name;
+    std::string arg_name;
+    CHECK(op_info->GetInputArgname(var_name, &arg_name))
+        << "Can not find the input argument,current var name : " << var_name;
+    VLOG(6) << " input arg name:" << arg_name << " var name:" << var_name;
+    if (input_parameter_name_.count(arg_name)) {
+      inplace_op_input_name = var_name;
+    }
+  }
+
+  for (auto* out_node : node->outlinks) {
+    auto& var = out_node->AsArg();
+    const auto& var_name = var.name;
+    std::string arg_name;
+    int num = 0;
+
+    CHECK(op_info->GetOutputArgname(var_name, &arg_name))
+        << "Can not find the output argument,current var name : " << var_name;
+    VLOG(6) << " output arg name:" << arg_name << " var name:" << var_name;
+    // An inplace op has only one output variable, but it can connect to the
+    // input variables of multiple ops.
+    int output_match_num = xpu_input_type_.count(var_name);
+    if (output_parameter_name_.count(arg_name) == 0 || output_match_num == 0) {
+      continue;
+    }
+
+    for (auto iter = xpu_input_type_.begin(); iter != xpu_input_type_.end();
+         ++iter) {
+      if (num >= output_match_num) {
+        break;
+      }
+
+      if (iter->first != var_name) {
+        continue;
+      }
+
+      ++num;
+      xpu_input_type_.emplace(inplace_op_input_name, iter->second);
+    }
+    VLOG(6) << "inplace op :" << inst.op_type() << " input precision"
+            << " replaced by the next op's input precision";
+    VLOG(6) << "inplace op :" << inst.op_type()
+            << ", input name:" << inplace_op_input_name
+            << ", the next op's input name: " << var_name;
+  }
+}
+
+void XPUStaticKernelPickPass::InplaceOpScore(
+    const lite::KernelBase& kernel,
+    const paddle::lite::mir::Node::Stmt& instruct,
+    const std::vector<std::string>& in_names,
+    const std::vector<std::string>& out_names,
+    bool* type_match,
+    size_t* score) {
+  PrecisionType pre_op_output_precision = PrecisionType::kUnk;
+  for (size_t i = 0; i < in_names.size(); ++i) {
+    std::string tmp;
+    CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp));
+    VLOG(6) << "current kernel input data variable name:" << in_names[i]
+            << "Parameter name:" << tmp;
+    if (input_parameter_name_.count(tmp) &&
+        xpu_output_type_.count(in_names[i])) {
+      size_t score_tmp = 0;
+      pre_op_output_precision = xpu_output_type_[in_names[i]];
+      if (kernel.GetInputDeclType(tmp)->precision() == PrecisionType::kAny) {
+        GetScore(PrecisionType::kAny, &score_tmp);
+        VLOG(6) << "current inplace kernel input data precision:kAny";
+      }
+
+      if (pre_op_output_precision ==
+              kernel.GetInputDeclType(tmp)->precision() ||
+          pre_op_output_precision == PrecisionType::kAny) {
+        GetScore(pre_op_output_precision, &score_tmp);
+        *type_match = true;
+        VLOG(6) << "inplace op match input data precision";
+      }
+
+      *score += score_tmp;
+    }
+  }
+
+  // collect inplace op output data precision
+  if (pre_op_output_precision != PrecisionType::kUnk) {
+    for (size_t i = 0; i < out_names.size(); ++i) {
+      std::string tmp;
+      CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp));
+      if (output_parameter_name_.count(tmp)) {
+        xpu_output_type_.emplace(out_names[i], pre_op_output_precision);
+      }
+    }
+  }
+}
+
+void XPUStaticKernelPickPass::SpecialOpScore(
+    const lite::KernelBase& kernel,
+    const paddle::lite::mir::Node::Stmt& instruct,
+    const std::vector<std::string>& in_names,
+    const std::vector<std::string>& out_names,
+    bool* type_match,
+    size_t* score) {
+  size_t score_tmp_all = 0;
+  bool input_match = true;
+  bool output_match = true;
+  bool consider_cpu = false;
+  // delete??
+  if (consider_cpu_op_.count(instruct.op_type())) {
+    consider_cpu = true;
+  }
+
+  if (!(kernel.place().target == TARGET(kXPU) || consider_cpu)) {
+    return;
+  }
+
+  // input data precision score
+  for (size_t i = 0; i < in_names.size(); ++i) {
+    std::string tmp;
+    CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp));
+    if (input_parameter_name_.count(tmp) == 0) {
+      continue;
+    }
+
+    if (xpu_output_type_.count(in_names[i]) == 0) {
+      continue;
+    }
+
+    VLOG(6) << "current kernel input data variable name:" << in_names[i]
+            << ", Parameter name:" << tmp;
+
+    size_t score_tmp = 0;
+    if (kernel.GetInputDeclType(tmp)->precision() == PrecisionType::kAny) {
+      GetScore(PrecisionType::kAny, &score_tmp);
+      VLOG(6) << "match input data precision:kAny";
+    }
+
+    if (xpu_output_type_[in_names[i]] ==
+            kernel.GetInputDeclType(tmp)->precision() ||
+        xpu_output_type_[in_names[i]] == PrecisionType::kAny) {
+      GetScore(xpu_output_type_[in_names[i]], &score_tmp);
+      VLOG(6) << "match input data precision";
+    }
+
+    if (score_tmp == 0) {
+      output_match = false;
+    }
+
+    score_tmp_all += score_tmp;
+  }
+
+  // output data precision score
+  for (size_t i = 0; i < out_names.size(); ++i) {
+    std::string tmp;
+    CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp));
+    int output_match_num = xpu_input_type_.count(out_names[i]);
+    if (output_parameter_name_.count(tmp) == 0) {
+      continue;
+    }
+
+    if (output_match_num == 0) {
+      continue;
+    }
+
+    VLOG(6) << "current kernel output data variable name:" << out_names[i]
+            << ", Parameter name:" << tmp;
+    int num = 0;
+    size_t score_tmp = 0;
+    for (auto iter = xpu_input_type_.begin(); iter != xpu_input_type_.end();
+         ++iter) {
+      if (num >= output_match_num) {
+        break;
+      }
+
+      if (iter->first != out_names[i]) {
+        continue;
+      }
+
+      ++num;
+      for (auto& map_kernel : iter->second) {
+        // Special op fetch
+        if (map_kernel.begin()->first.substr(0, 5) == "fetch") {
+          if (map_kernel.begin()->second ==
+              kernel.GetOutputDeclType(tmp)->precision()) {
+            score_tmp = 500;
+          }
+          continue;
+        }
+
+        if (kernel.GetOutputDeclType(tmp)->precision() ==
+            PrecisionType::kAny) {
+          VLOG(6) << "match precision:kAny,the next kernel's name:"
+                  << map_kernel.begin()->first;
+          GetScore(PrecisionType::kAny, &score_tmp);
+        }
+
+        if (map_kernel.begin()->second ==
+                kernel.GetOutputDeclType(tmp)->precision() ||
+            map_kernel.begin()->second == PrecisionType::kAny) {
+          VLOG(6) << "match next kernel's input data precision,the "
+                     "next kernel name:"
+                  << map_kernel.begin()->first;
+          GetScore(map_kernel.begin()->second, &score_tmp);
+        }
+      }
+    }
+
+    if (score_tmp == 0) {
+      output_match = false;
+    }
+    score_tmp_all += score_tmp;
+  }
+
+  if (score_tmp_all > 0) {
+    *type_match = input_match & output_match;
+  }
+
+  *score += score_tmp_all;
+}
+
+void XPUStaticKernelPickPass::GetXPUDeviceType() {
+  int cur_dev_idx = 0;
+  uint64_t cur_dev_attr = 0;
+
+  XPU_CALL(xpu_current_device(&cur_dev_idx));
+  XPU_CALL(xpu_device_get_attr(&cur_dev_attr, XPUATTR_MODEL, cur_dev_idx));
+  if (cur_dev_attr <= 1) {
+    VLOG(4) << "Current XPU device: XPU1";
+    xpu_disable_flag_ = "DISABLE_XPU1";
+  } else if (cur_dev_attr >= 2 && cur_dev_attr <= 299) {
+    VLOG(4) << "Current XPU device: XPU2";
+    xpu_disable_flag_ = "DISABLE_XPU2";
+  } else if (cur_dev_attr >= 300 && cur_dev_attr <= 599) {
+    VLOG(4) << "Current XPU device: XPU3";
+    xpu_disable_flag_ = "DISABLE_XPU3";
+  } else {
+    VLOG(4) << "invalid XPU device";
+    xpu_disable_flag_ = "NONE";
+  }
+}
+
+#endif
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__static_kernel_pick_pass, + paddle::lite::mir::XPUStaticKernelPickPass) + .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h new file mode 100644 index 00000000000..38f786b5216 --- /dev/null +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h @@ -0,0 +1,344 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include +#include +#include +#include +#include +#include "lite/core/optimizer/mir/pass.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace mir { + +/* + * XPUStaticKernelPickPass is a simple strategy for picking the kernel for each + * Operator using operator developer defined rule, there are many other tactics + * such as considering IO or kernel execution latency and we will implement them + * latter. + * + * There are two argument for this pass: + * - place, the target place. + * - kernel_pick_factors, the factors to consider in picking kernels. + * Set them first before execute the pass. + */ +class XPUStaticKernelPickPass : public mir::StmtPass { + public: + void Apply(const std::unique_ptr& graph) override; + + const core::KernelPickFactor& kernel_pick_factors() const { + return kernel_pick_factors_; + } + core::KernelPickFactor* mutable_kernel_pick_factors() { + return &kernel_pick_factors_; + } + + private: + // Score the kernel. + size_t KernelGrade(lite::mir::Node* node, + const lite::KernelBase& kernel, + const std::vector& places, + const std::map& in_types, + const std::map& out_types, + const std::vector& in_names, + const std::vector& out_names) { + const auto& instruct = node->AsStmt(); + CHECK_GT(places.size(), static_cast(0)) << "valid_places is empty."; + float final_score{-1.}; + Place winner_place{places[0]}; + const int kMax = + (std::numeric_limits::max)(); + size_t place_size = places.size(); + + // NOTE: We compare kernel's place with place in valid_places to select the + // best match place + // The place's order in valid_places array decide the user's + // preference + // final_score = weight * socre + // weight: The weight is compute with (valid_places.size() - i) / + // valid_places.size() as default. + // where i is the place's index in valid_places array. 
+ // score: score is the weighted sum of target、percision and layout + for (size_t i = 0; i < place_size; ++i) { + const auto& place = places[i]; + float weight = static_cast(place_size - i) / place_size; + VLOG(4) << "current place is " << place.DebugString() << ", idx : " << i + << ", weight : " << weight; + size_t score{}; + + // The more important factor comes first + if (kernel_pick_factors_.IsTargetConsidered() && + (place.target == kernel.target() || kernel.target() == TARGET(kAny) || + place.target == TARGET(kAny))) { + size_t target_score = + kMax / + static_cast(core::KernelPickFactor::Factor::TargetFirst); + score += target_score; + VLOG(4) << "[TargetConsidered score]:" << target_score; + } + VLOG(4) << "[score s1]:" << score; + + if (kernel_pick_factors_.IsPrecisionConsidered() && + (place.precision == kernel.precision() || + kernel.precision() == PRECISION(kAny) || + place.precision == PRECISION(kAny) || + // fp16 may also pick FP32 kernel preciison + (xpu_use_fp16_optimizer_ && + kernel.precision() == PRECISION(kFloat) && + place.precision == PRECISION(kFP16)))) { + // score skipped, if kernel is int8, but op is not int8 + if (!(kernel.precision() == PRECISION(kInt8) && + !instruct.op_info()->HasAttr("enable_int8"))) { + size_t precision_score = + kMax / + static_cast(core::KernelPickFactor::Factor::PrecisionFirst); + score += precision_score; + VLOG(4) << "[PrecisionConsidered score]:" << precision_score; + } + } + VLOG(4) << "[score s2]:" << score; + + if (kernel_pick_factors_.IsDataLayoutConsidered() && + (place.layout == kernel.layout() || + kernel.layout() == DATALAYOUT(kAny) || + place.layout == DATALAYOUT(kAny))) { + size_t datalayout_score = + kMax / + static_cast(core::KernelPickFactor::Factor::DataLayoutFirst); + score += datalayout_score; + VLOG(4) << "[DataLayoutConsidered score]:" << datalayout_score; + } + VLOG(4) << "[score s3]:" << score; + + // add new rules for precision: When the input types are consistent with + // kernel's input types, select the kernel of the precision. However, if + // the op is feed, we should compare the output precision type. + // Note that this strategy is not compatible with quantization, so skip + // quantization op. 
+ if (!instruct.op_info()->HasAttr("enable_int8")) { + bool type_match = true; + if (instruct.op_type() == "feed") { + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + if (out_types.count(out_names[i]) && + out_types.at(out_names[i]) != + kernel.GetOutputDeclType(tmp)->precision()) { + type_match = false; + } + } + } else { + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + if (in_types.count(in_names[i]) && + !PrecTypeCompatible( + in_types.at(in_names[i]), + kernel.GetInputDeclType(tmp)->precision())) { + type_match = false; + } + } + } +#ifdef LITE_WITH_XPU + if (xpu_use_fp16_optimizer_ && + (xpu_special_op_.count(instruct.op_type()) || + xpu_inplace_op_.count(instruct.op_type()))) { + type_match = false; + if (kernel.summary().find(xpu_disable_flag_) != std::string::npos) { + score = 0; + VLOG(6) << " ignore pick current kernel:" << kernel.summary(); + } else if (xpu_inplace_op_.count(instruct.op_type())) { + InplaceOpScore( + kernel, instruct, in_names, out_names, &type_match, &score); + } else { + SpecialOpScore( + kernel, instruct, in_names, out_names, &type_match, &score); + } + } +#endif + + if (type_match) { + score *= 2; + VLOG(4) << "[Input/Output precision compatible]: *2"; + } + VLOG(4) << "[score s4]:" << score; + } +#ifdef LITE_WITH_XPU + ForceUseFP32Kernel(&score, kernel, instruct); + ForceUseInt8Kernel(&score, kernel, instruct); +#endif + + // add new rules for datatype: When the input types are consistent with + // kernel's input types, select the kernel of the datatype. + if (instruct.op_info()->Type() != "conditional_block" && + instruct.op_info()->Type() != "while" && + instruct.op_info()->Type() != "subgraph") { + bool datatype_match = true; + for (auto* in : node->inlinks) { + if (!in->IsArg()) continue; + if (in->AsArg().name == "feed" || in->AsArg().is_persist) continue; + std::string argname; + instruct.op_info()->GetInputArgname(in->AsArg().name, &argname); + VLOG(5) << "intput var name : " << in->AsArg().name; + // only when datatype is LOD_TENSOR, LOD_TENSOR_ARRAY, STEP_SCOPES, + // the type pointer is not null; + if (in->AsArg().type) { + VLOG(5) << "input datatype : " + << static_cast(in->AsArg().type->id()); + VLOG(5) << "kernel bind datatype : " + << static_cast(kernel.GetInputDeclType(argname)->id()); + if (static_cast(in->AsArg().type->id()) != + static_cast(kernel.GetInputDeclType(argname)->id())) + datatype_match = false; + } else { + datatype_match = false; + } + } + if (datatype_match) { + score *= 2; + VLOG(4) << "[Input datatype compatible]: *2"; + } + VLOG(4) << "[score s5]:" << score; + } + + if (weight * score > final_score) { + final_score = weight * score; + winner_place = place; + } + } + + VLOG(2) << "-------- score summary for candidate kernel : " + << kernel.summary() << " --------"; + VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + << " " << DataLayoutToStr(winner_place.layout) << " " + << TargetToStr(winner_place.target); + VLOG(2) << " ===> kernel.place():" + << PrecisionToStr(kernel.place().precision) << " " + << DataLayoutToStr(kernel.place().layout) << " " + << TargetToStr(kernel.place().target); + VLOG(4) << "kernel.op_type():" << kernel.op_type(); + VLOG(4) << "kernel picker factors:" << kernel_pick_factors_; + VLOG(4) << "winner_picker place:" << winner_place.DebugString(); + VLOG(4) << "[score(final)]:" << final_score; + VLOG(4) << 
"------------------------------"; + + // The data layout is not considered, for the input and output arguments + // might have different data layout. + // TODO(Superjomn) reconsider the idea of taking the data layout as a kernel + // specification. + return final_score; + } + + // Compatible for PrecisionType. + // For cuda, in the process of choosing kernel, fp16 and fp32 are compatiable. + // If kernel's declared type is kAny, it is matched. + bool PrecTypeCompatible(const PrecisionType& p1, const PrecisionType& p2) { + if (p1 == p2 || p2 == PRECISION(kAny)) { + return true; + } else if ((p1 == PRECISION(kFP16) || p1 == PRECISION(kFloat)) && + (p2 == PRECISION(kFP16) || p2 == PRECISION(kFloat))) { + return true; + } else { + return false; + } + } +#ifdef LITE_WITH_XPU + void DicideUseFP16Optimizer(const std::unique_ptr& graph); + void ForceUseFP32Kernel(size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct); + void ForceUseInt8Kernel(size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct); + void GetScore(PrecisionType precision, size_t* score_tmp); + + void NodeInputPrecision(lite::mir::Node* node, + const std::unique_ptr& graph); + void InplaceNodeInputPrecision(lite::mir::Node* node); + void SpecialNodeInputPrecision(lite::mir::Node* node); + + void NodeOutputPrecision(const std::unique_ptr& graph, + lite::mir::Node* node); + void InplaceNodeOutputPrecision(const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names); + void SpecialNodeOutputPrecision( + const std::unique_ptr& graph, + lite::mir::Node* node, + const std::unique_ptr& kernel); + + void SpecialOpScore(const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names, + bool* type_match, + size_t* score); + void GetXPUDeviceType(); + void InplaceOpScore(const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names, + bool* type_match, + size_t* score); +#endif + + private: + core::KernelPickFactor kernel_pick_factors_; + + bool xpu_use_fp16_optimizer_{false}; +#ifdef LITE_WITH_XPU + // TODO(quwei:) addn more op + const std::set PRECISION_INT31_OP_{"__xpu__fc"}; + const std::set PRECISION_INT8_OP_{"__xpu__fc"}; + const std::set input_parameter_name_{ + "Input", "X", "Y", "Branch", "BBoxes", "Scores", "repeat_times_tensor"}; + const std::set output_parameter_name_{ + "Output", "Out", "Boxes", "Scores", "Y"}; + std::multimap>> + xpu_input_type_{}; + std::map xpu_output_type_{}; + std::string xpu_disable_flag_{}; + const std::set consider_cpu_op_{"cast"}; + const std::set xpu_special_op_{"__xpu__fc", + "conv3d", + "__xpu__conv2d", + "gather", + "pool2d", + "concat", + "calib", + "relu", + "tanh", + "sigmoid", + "leaky_relu", + "conv2d_transpose", + "elementwise_mul", + "elementwise_add", + "reduce_mean"}; + const std::set xpu_inplace_op_{"reshape", + "reshape2", + "flatten", + "flatten2", + "squeeze", + "squeeze2", + "unsqueeze", + "unsqueeze2"}; +#endif +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc index 0e3f3b0335d..74b1e72a974 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc @@ -82,27 +82,6 @@ class XPUFcFuser : 
public FuseBase { op_desc.SetInput("Input", {matched.at("x")->arg()->name}); op_desc.SetInput("Filter", {matched.at("W")->arg()->name}); - std::string precision = "int16"; -#ifdef LITE_WITH_XPU - if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || - lite::TargetWrapperXPU::multi_encoder_precision == "int31") { - precision = "int31"; - VLOG(3) << "Use int31 in XPUFcOp"; - } else if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int8" || - lite::TargetWrapperXPU::multi_encoder_precision == "int8") { - precision = "int8"; - if (op_desc.HasAttr("enable_int8") && - op_desc.GetAttr("enable_int8")) { - CHECK(op_desc.HasAttr("X0_scale")) << " quant model fc no X0_scale"; - CHECK(op_desc.HasAttr("Y0_scale")) << " quant model fc no Y0_scale"; - VLOG(3) << "Use int8 quant model in XPUFcOp, InputMax:" - << 127 * op_desc.GetAttr>("X0_scale")[0] - << ", WeightMax: " - << 127 * op_desc.GetAttr>("Y0_scale")[0]; - } - VLOG(3) << "Use int8 in XPUFcOp"; - } -#endif if (with_bias_) { op_desc.SetInput("Bias", {matched.at("bias")->arg()->name}); } @@ -118,8 +97,48 @@ class XPUFcFuser : public FuseBase { output_name = matched.at("mul_out")->arg()->name; output_node_name = "mul_out"; } + bool per_channel = false; + int weight_scale_size = 1; + auto* op_info = matched.at("mul")->stmt()->op_info(); + auto mul_input_y_name = op_info->Input("Y").front(); + auto mul_y_shape = scope->FindMutableTensor(mul_input_y_name)->dims(); + CHECK_EQ(mul_y_shape.size(), 2) << "mul_y_shape.size: " + << mul_y_shape.size(); + const bool quant = op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8"); + op_desc.SetAttr("enable_int8", quant); + // X0_scale is already in op_desc when copy from mul + if (quant) { + CHECK(op_info->HasAttr("Y0_scale")) << "quant model no Y0_scale"; + weight_scale_size = + op_info->GetAttr>("Y0_scale").size(); + CHECK_EQ(weight_scale_size, mul_y_shape[1]) + << "weight_scale_size: " << weight_scale_size + << ", mul_y_shape:" << mul_y_shape; + CHECK_GE(weight_scale_size, 1) << weight_scale_size; + std::vector weight_max; + if (is_per_tensor(op_info->GetAttr>("Y0_scale"))) { + per_channel = false; + VLOG(3) << "xpu fc per tensor"; + weight_max.push_back( + op_info->GetAttr>("Y0_scale")[0] * 127); + } else { + per_channel = true; + VLOG(3) << "xpu fc per channel, first channel max:" + << op_info->GetAttr>("Y0_scale")[0] * 127 + << ", last channel max: " + << op_info->GetAttr>( + "Y0_scale")[weight_scale_size - 1] * + 127; + for (auto wm : op_info->GetAttr>("Y0_scale")) { + weight_max.push_back(wm * 127); + } + } + VLOG(3) << "weight_max size:" << weight_max.size(); + op_desc.SetAttr>("Y0_max", weight_max); + op_desc.SetAttr("per_channel", per_channel); + } op_desc.SetOutput("Output", {output_name}); - op_desc.SetAttr("precision", precision); std::map act_map{{"linear", 0}, {"relu", 1}, {"sigmoid", 2}, @@ -169,6 +188,19 @@ class XPUFcFuser : public FuseBase { private: bool with_bias_; std::string act_type_; + std::string mul_type_; + bool is_per_tensor(const std::vector& weight_max) { + bool per_tensor = true; + CHECK_GT(weight_max.size(), 0) << "fc channel size: " << weight_max.size(); + auto first = weight_max[0]; + for (int i = 1; i < weight_max.size(); ++i) { + if (std::abs(first - weight_max[i]) > 1e-6) { + per_tensor = false; + break; + } + } + return per_tensor; + } }; } // namespace fusion diff --git a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc 
index f15667422bb..95bc14151e5 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc @@ -61,14 +61,14 @@ namespace fusion { class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase { public: explicit XPUMultiEncoderAdaptiveSeqlenFuser( - const std::string& matmul_type = "matmul") - : matmul_type_(matmul_type) {} + const std::string& matmul_type = "matmul", bool pre_ln = false) + : matmul_type_(matmul_type), pre_ln_(pre_ln) {} void BuildPattern() override { auto* mask = VarNode("mask") ->assert_is_op_input(matmul_type_, "X") ->assert_is_op_input(matmul_type_, "Y"); - auto* matmul = OpNode("matmul", matmul_type_)->AsIntermediate(); + auto* matmul = OpNode(matmul_type_, matmul_type_)->AsIntermediate(); auto* matmul_out = VarNode("matmul_out") ->assert_is_op_input("scale", "X") ->assert_is_op_output(matmul_type_, "Out") @@ -85,20 +85,37 @@ class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase { ->AsIntermediate(); auto* xpu_embedding = OpNode("xpu_embedding", "__xpu__embedding_with_eltwise_add"); - auto* embedding_out = - VarNode("embedding_out") - ->assert_is_op_output("__xpu__embedding_with_eltwise_add", "Output") - ->assert_is_op_input("layer_norm", "X"); - auto* layer_norm = OpNode("layer_norm", "layer_norm"); - auto* layer_norm_out = - VarNode("layer_norm_out") - ->assert_is_op_output("layer_norm", "Y") - ->assert_is_op_input("__xpu__multi_encoder", "Input"); + + PMNode* embedding_out = nullptr; + PMNode* layer_norm = nullptr; + PMNode* layer_norm_out = nullptr; + + if (pre_ln_) { + embedding_out = VarNode("embedding_out") + ->assert_is_op_output( + "__xpu__embedding_with_eltwise_add", "Output") + ->assert_is_op_input("__xpu__multi_encoder", "Input"); + } else { + embedding_out = VarNode("embedding_out") + ->assert_is_op_output( + "__xpu__embedding_with_eltwise_add", "Output") + ->assert_is_op_input("layer_norm", "X"); + layer_norm = OpNode("layer_norm", "layer_norm"); + layer_norm_out = + VarNode("layer_norm_out") + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("__xpu__multi_encoder", "Input"); + } auto* xpu_encoder = OpNode("xpu_encoder", "__xpu__multi_encoder") ->assert_op_attr("adaptive_seqlen", true); + if (pre_ln_) { + xpu_encoder->assert_op_attr("norm_before", true); + *xpu_embedding >> *embedding_out >> *xpu_encoder; + } else { + *xpu_embedding >> *embedding_out >> *layer_norm >> *layer_norm_out >> + *xpu_encoder; + } - *xpu_embedding >> *embedding_out >> *layer_norm >> *layer_norm_out >> - *xpu_encoder; *mask >> *matmul >> *matmul_out >> *scale >> *scale_out >> *stack >> *stack_out >> *xpu_encoder; } @@ -147,6 +164,7 @@ class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase { private: std::string matmul_type_; + bool pre_ln_; }; } // namespace fusion @@ -155,9 +173,12 @@ class XPUMultiEncoderAdaptiveSeqlenFusePass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override { std::vector matmul_types{"matmul", "matmul_v2"}; + std::vector pre_lns{true, false}; for (auto& matmul_type : matmul_types) { - fusion::XPUMultiEncoderAdaptiveSeqlenFuser fuser(matmul_type); - fuser(graph.get()); + for (auto pre_ln : pre_lns) { + fusion::XPUMultiEncoderAdaptiveSeqlenFuser fuser(matmul_type, pre_ln); + fuser(graph.get()); + } } } }; diff --git a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc index e47e12270ba..01c091ffe71 100644 --- 
a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -101,7 +101,7 @@ class XPUSingleEncoderFuser : public FuseBase { auto* q_reshape2_xshape = VarNode("q_reshape2_xshape") ->assert_is_op_output("reshape2", "XShape") ->AsIntermediate(); - std::string target_op_type = "matmul"; + std::string target_op_type = matmul_type_; if (with_q_scale_) { target_op_type = "scale"; } @@ -121,7 +121,7 @@ class XPUSingleEncoderFuser : public FuseBase { q_scale = OpNode("q_scale", "scale")->AsIntermediate(); q_scale_out = VarNode("q_scale_out") ->assert_is_op_output("scale", "Out") - ->assert_is_op_input("matmul", "X") + ->assert_is_op_input(matmul_type_, "X") ->AsIntermediate(); } @@ -151,16 +151,16 @@ class XPUSingleEncoderFuser : public FuseBase { auto* k_transpose2 = OpNode("k_transpose2", "transpose2")->AsIntermediate(); auto* k_transpose2_out = VarNode("k_transpose2_out") ->assert_is_op_output("transpose2", "Out") - ->assert_is_op_input("matmul", "Y") + ->assert_is_op_input(matmul_type_, "Y") ->AsIntermediate(); auto* k_transpose2_xshape = VarNode("k_transpose2_xshape") ->assert_is_op_output("transpose2", "XShape") ->AsIntermediate(); - auto* qk_matmul = OpNode("qk_matmul", "matmul")->AsIntermediate(); + auto* qk_matmul = OpNode("qk_matmul", matmul_type_)->AsIntermediate(); auto* qk_matmul_out = VarNode("qk_matmul_out") - ->assert_is_op_output("matmul", "Out") + ->assert_is_op_output(matmul_type_, "Out") ->assert_is_op_input("elementwise_add", "X") ->AsIntermediate(); auto* qk_mask = VarNode("qk_mask") @@ -508,67 +508,23 @@ class XPUSingleEncoderFuser : public FuseBase { CHECK_EQ(q_mul_y_shape[0], qkv_mul_y_shape[1]); CHECK_EQ(q_mul_y_shape[1], qkv_mul_y_shape[0]); CHECK_GT(hidden_dim, 0) << "invalid hidden_dim: " << hidden_dim; - // mul input_max, output_max * 6 + matmul x_max,y_max,output_max * 2 - std::vector fc_input_max; - set_quant_info(matched, &fc_input_max); - // mul & matmul input/output max - op_desc.SetAttr>("fc_input_max", fc_input_max); if (q_mul_op_info->HasAttr("enable_int8") && q_mul_op_info->GetAttr("enable_int8")) { op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr>( - "Y0_max", - { - 127 * - matched.at("q_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0], - 127 * - matched.at("k_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0], - 127 * - matched.at("v_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0], - 127 * - matched.at("qkv_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0], - 127 * - matched.at("qkv_mul_3") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0], - 127 * - matched.at("qkv_mul_4") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0], - }); - VLOG(3) << "q/k/v weight_max: " - << 127 * - matched.at("q_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0] - << ", " - << 127 * - matched.at("k_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0] - << ", " - << 127 * - matched.at("v_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0]; + // mul input_max, output_max * 6 + matmul x_max,y_max,output_max * 2 + std::vector fc_input_max; + std::vector fc_weight_max; + std::vector fc_channels; + set_quant_info(matched, &fc_input_max); // set input/output scale + bool per_channel = false; + set_weight_info( + scope, matched, &fc_weight_max, &per_channel, &fc_channels); + op_desc.SetAttr("per_channel", per_channel); + op_desc.SetAttr>("fc_channels", fc_channels); + // mul & matmul input/output max + op_desc.SetAttr>("fc_input_max", 
fc_input_max); + op_desc.SetAttr>("Y0_max", fc_weight_max); } // extra traits to distill auto* reshape_op_info = matched.at("q_reshape2")->stmt()->op_info(); @@ -675,6 +631,8 @@ class XPUSingleEncoderFuser : public FuseBase { "X0_scale")[0]); // ew_add out_threshold for output quant auto& quant_ew = mul_add_ops[i]; + CHECK(matched.at(quant_ew)->stmt()->op_info()->HasAttr("out_threshold")) + << "act after quant mul has no out_threshold"; quant_info->push_back( matched.at(quant_ew)->stmt()->op_info()->GetAttr( "out_threshold")); @@ -697,7 +655,7 @@ class XPUSingleEncoderFuser : public FuseBase { if (matmul_quant) { auto* qkv_matmul_op_info = matched.at("qkv_matmul")->stmt()->op_info(); - CHECK(qkv_matmul_op_info->HasAttr("X0_scale") == true); + CHECK(qkv_matmul_op_info->HasAttr("X0_scale")); float softmax_out_threshold = matched.at("qk_softmax") ->stmt() ->op_info() @@ -709,7 +667,7 @@ class XPUSingleEncoderFuser : public FuseBase { "X0_scale")[0] * 127), 1e-5); - CHECK(qk_matmul_op_info->HasAttr("X0_scale") == true); + CHECK(qk_matmul_op_info->HasAttr("X0_scale")); quant_info->push_back(max_qkv_output); quant_info->push_back(max_qkv_output); quant_info->push_back(softmax_out_threshold); @@ -720,6 +678,68 @@ class XPUSingleEncoderFuser : public FuseBase { CHECK_EQ(quant_info->size(), 18); } } + bool is_per_tensor(const std::vector& weight_max) { + bool per_tensor = true; + CHECK_GT(weight_max.size(), 0) << "fc channel size: " << weight_max.size(); + auto first = weight_max[0]; + for (int i = 1; i < weight_max.size(); ++i) { + if (std::abs(first - weight_max[i]) > 1e-6) { + per_tensor = false; + break; + } + } + return per_tensor; + } + void set_weight_info(Scope* scope, + const key2nodes_t& matched, + std::vector* weight_info, + bool* per_channel, + std::vector* fc_channels) { + const std::vector quant_mul_ops = { + "q_mul", "k_mul", "v_mul", "qkv_mul", "qkv_mul_3", "qkv_mul_4"}; + bool tmp_pc = false; + for (int i = 0; i < quant_mul_ops.size(); ++i) { + auto& mul_op = quant_mul_ops[i]; + auto op_info = matched.at(mul_op)->stmt()->op_info(); + auto weight_name = op_info->Input("Y").front(); + auto weight_shape = scope->FindMutableTensor(weight_name)->dims(); + CHECK_EQ(weight_shape.size(), 2) << "weight_shape: " << weight_shape; + CHECK(op_info->HasAttr("Y0_scale")) << " quant op has no Y0_scale"; + int weight_scale_size = + op_info->GetAttr>("Y0_scale").size(); + CHECK_EQ(weight_scale_size, weight_shape[1]) + << "weight_scale_size: " << weight_scale_size + << ", weight_shape: " << weight_shape; + CHECK_GT(weight_scale_size, 3) + << mul_op << ", weight_scale_size: " << weight_scale_size; + fc_channels->push_back(weight_scale_size); + if (i == 0) { + if (is_per_tensor(op_info->GetAttr>("Y0_scale"))) { + tmp_pc = false; + VLOG(3) << "mul quant using weight_max per tensor"; + } else { + tmp_pc = true; + VLOG(3) << "mul quant using weight_max per channel"; + } + } + for (int j = 0; j < weight_scale_size; ++j) { + weight_info->push_back( + 127 * op_info->GetAttr>("Y0_scale")[j]); + if (!tmp_pc) break; + } + if (i < 3) { + if (tmp_pc) { + VLOG(3) + << mul_op << " weight max first channel: " + << (*weight_info)[i * weight_scale_size] << ", last channel:" + << (*weight_info)[i * weight_scale_size + weight_scale_size - 1]; + } else { + VLOG(3) << mul_op << " weight max per tensor: " << (*weight_info)[i]; + } + } + } + *per_channel = tmp_pc; + } }; class XPUMultiEncoderFuser { @@ -783,25 +803,60 @@ class XPUMultiEncoderFuser { std::set to_remove; Node* first_encoder = all_encoders[0]; + auto* 
multi_encoder_stmt = first_encoder->stmt(); + auto* first_encoder_op_info = multi_encoder_stmt->op_info(); + bool per_channel = false; + if (first_encoder_op_info->HasAttr("per_channel")) { + per_channel = first_encoder_op_info->GetAttr("per_channel"); + } + const int hidden_dim = first_encoder_op_info->GetAttr("hidden_dim"); std::string in_name, out_name; std::vector arg_names{ "FCWeight", "FCBias", "LNScale", "LNBias"}; std::map> arg_map; std::vector fc_weight_max; std::vector fc_input_max; + + std::vector fc_channels; + int single_encoder_weight_scale_size = 0; + if (per_channel) { + for (auto channel : + first_encoder_op_info->GetAttr>("fc_channels")) { + single_encoder_weight_scale_size += channel; + } + } else { + // non-quant or per tensor quant + single_encoder_weight_scale_size = 6; + } + fc_weight_max.resize(all_encoders.size() * + single_encoder_weight_scale_size); + for (size_t i = 0; i < all_encoders.size(); ++i) { Node* cur_encoder = all_encoders[i]; auto* op_info = cur_encoder->stmt()->op_info(); if (enable_int8) { CHECK(op_info->HasAttr("enable_int8")) << "no enable_int8 attr"; CHECK(op_info->HasAttr("Y0_max")) << "no Y0_max attr"; + CHECK(op_info->HasAttr("per_channel")) << "no per_channel attr"; CHECK(op_info->HasAttr("fc_input_max")) << "no fc_input_max attr"; - for (auto y0 : op_info->GetAttr>("Y0_max")) { - fc_weight_max.push_back(y0); + CHECK_EQ(op_info->GetAttr>("Y0_max").size(), + single_encoder_weight_scale_size) + << "invalid weight scale size: " + << op_info->GetAttr>("Y0_max").size() << ", " + << single_encoder_weight_scale_size; + for (int j = 0; j < single_encoder_weight_scale_size; ++j) { + fc_weight_max[i * single_encoder_weight_scale_size + j] = + op_info->GetAttr>("Y0_max")[j]; } for (auto x0 : op_info->GetAttr>("fc_input_max")) { fc_input_max.push_back(x0); } + if (per_channel) { + for (auto channel : + op_info->GetAttr>("fc_channels")) { + fc_channels.push_back(channel); + } + } } for (auto arg_name : arg_names) { auto real_names = op_info->Input(arg_name); @@ -836,7 +891,6 @@ class XPUMultiEncoderFuser { } GraphSafeRemoveNodes(graph, to_remove); - auto* multi_encoder_stmt = first_encoder->stmt(); cpp::OpDesc op_desc; op_desc.SetType("__xpu__multi_encoder"); op_desc.SetInput("Input", {in_name}); @@ -850,30 +904,43 @@ class XPUMultiEncoderFuser { op_desc.SetAttr("enable_int8", enable_int8); if (enable_int8) { CHECK_EQ(fc_precision_, "int8"); - CHECK_EQ(fc_weight_max.size(), all_encoders.size() * 6); + if (per_channel) { + CHECK_EQ(fc_weight_max.size(), + all_encoders.size() * single_encoder_weight_scale_size) + << " fc_weight_max.size:" << fc_weight_max.size() + << ", all_encoders.size():" << all_encoders.size() + << ", single_encoder_weight_scale_size: " + << single_encoder_weight_scale_size; + CHECK_EQ(fc_channels.size(), all_encoders.size() * 6) + << "fc_channels.size:" << fc_channels.size(); + } else { + CHECK_EQ(fc_weight_max.size(), all_encoders.size() * 6) + << " fc_weight_max.size:" << fc_weight_max.size() + << ", all_encoders.size():" << all_encoders.size(); + CHECK_EQ(fc_channels.size(), 0) << fc_channels.size(); + } CHECK((fc_input_max.size() == all_encoders.size() * 12) || (fc_input_max.size() == all_encoders.size() * 18)) << fc_input_max.size() << ", all_encoders.size:" << all_encoders.size(); - for (int i = 0; i < fc_weight_max.size(); i += 6) { - CHECK_LT(std::abs(fc_weight_max[i] - fc_weight_max[i + 1]), 1e-5) - << " quanted ernie's q/k weight scale should be euqal: " - << fc_weight_max[i] << ", " << fc_weight_max[i + 1]; - 
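For reference, the loop above packs each single encoder's Y0_max values into one flat fc_weight_max vector with a fixed per-encoder stride: 6 entries for per-tensor models, or the sum of fc_channels for per-channel models. A simplified sketch of that layout, assuming every encoder reports the same stride (the struct and function names here are hypothetical):

#include <cstddef>
#include <numeric>
#include <vector>

struct EncoderQuantInfo {
  std::vector<float> weight_max;  // this encoder's "Y0_max" values
  std::vector<int> fc_channels;   // per-channel only: channels of the 6 FCs
};

std::vector<float> FlattenWeightMax(const std::vector<EncoderQuantInfo>& encoders,
                                    bool per_channel) {
  size_t stride = 6;  // per-tensor: one max per FC, six FCs per encoder
  if (per_channel && !encoders.empty()) {
    stride = std::accumulate(encoders[0].fc_channels.begin(),
                             encoders[0].fc_channels.end(), size_t{0});
  }
  std::vector<float> flat(encoders.size() * stride, 0.0f);
  for (size_t i = 0; i < encoders.size(); ++i) {
    // encoder i owns the slice [i * stride, (i + 1) * stride)
    for (size_t j = 0; j < stride; ++j) flat[i * stride + j] = encoders[i].weight_max[j];
  }
  return flat;
}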
CHECK_LT(std::abs(fc_weight_max[i] - fc_weight_max[i + 2]), 1e-5) - << " quanted ernie's q/v weight scale should be euqal: " - << fc_weight_max[i] << ", " << fc_weight_max[i + 2]; + if (!per_channel) { + for (int i = 0; i < fc_weight_max.size(); i += 6) { + CHECK_LT(std::abs(fc_weight_max[i] - fc_weight_max[i + 1]), 1e-5) + << " quanted ernie's q/k weight scale should be euqal: " + << fc_weight_max[i] << ", " << fc_weight_max[i + 1]; + CHECK_LT(std::abs(fc_weight_max[i] - fc_weight_max[i + 2]), 1e-5) + << " quanted ernie's q/v weight scale should be euqal: " + << fc_weight_max[i] << ", " << fc_weight_max[i + 2]; + } } op_desc.SetAttr>("FCInputMax", fc_input_max); - // "FCWeightMax" is also stored as "Input" now - op_desc.SetAttr>("FCWeightMax", fc_weight_max); + VLOG(3) << "fc_input_max size: " << fc_input_max.size(); // only support adaptive_seqlen in int8 quant model CHECK_EQ(adaptive_seqlen_, true); } else { - fc_weight_max.resize(arg_map["FCWeight"].size()); + CHECK_EQ(per_channel, false) << "per_channel in non-quant model"; } - auto* first_encoder_op_info = multi_encoder_stmt->op_info(); - op_desc.SetAttr("hidden_dim", - first_encoder_op_info->GetAttr("hidden_dim")); + op_desc.SetAttr("hidden_dim", hidden_dim); op_desc.SetAttr("head_num", first_encoder_op_info->GetAttr("head_num")); op_desc.SetAttr( @@ -884,17 +951,20 @@ class XPUMultiEncoderFuser { "act_type", first_encoder_op_info->GetAttr("act_type")); op_desc.SetAttr("precision", fc_precision_); op_desc.SetAttr("adaptive_seqlen", adaptive_seqlen_); + op_desc.SetAttr("per_channel", per_channel); + if (per_channel) { + op_desc.SetAttr>("fc_channels", fc_channels); + } // q/k/v fusion bool enable_qkv_fusion = true; - if (norm_before_0) { + if (norm_before_0 && !adaptive_seqlen_) { enable_qkv_fusion = false; } op_desc.SetAttr("enable_qkv_fusion", enable_qkv_fusion); auto* scope = multi_encoder_stmt->op()->scope(); auto& fc_weight_names = arg_map["FCWeight"]; - CHECK_EQ(fc_weight_max.size(), fc_weight_names.size()); for (size_t i = 0; i < fc_weight_names.size(); ++i) { if (enable_qkv_fusion && (i % 6 == 0)) { auto weight_tensor_tmp = scope->FindMutableTensor(fc_weight_names[i]); @@ -968,7 +1038,6 @@ class XPUMultiEncoderFuser { qkv_len * sizeof(float)); } - // TODO(mayang02): we could use attr to store FCWeightMax std::string max_name = "encoder_max_" + fc_weight_names[0]; VLOG(3) << "multi-encoder max weight name: " << max_name; auto* max_filter_node = graph->RetrieveArgument(max_name); @@ -1082,9 +1151,7 @@ class XPUMultiEncoderFuser { int qkv_offset = 0; if (enable_int8) { CHECK_EQ(fc_precision_, "int8"); - CHECK(end <= fc_weight_max->size()); std::unique_ptr weight_qkv_trans(new int8_t[qkv_len]); - float max_f = (*fc_weight_max)[start]; for (int i = 0; i < (end - start); ++i) { // the quanted weight is alreay int8 in quanted model int8_t* weight_host_ptr = weight_tensor_vec[i]->mutable_data(); @@ -1098,17 +1165,9 @@ class XPUMultiEncoderFuser { weight_host_trans.get(), weight_len_vec[i] * sizeof(int8_t)); qkv_offset += weight_len_vec[i]; - if (i > 0) { - max_f = std::max(max_f, (*fc_weight_max)[start + i]); - VLOG(5) << "start+i:" << start + i - << ", weigh_max: " << (*fc_weight_max)[start + i] - << ", max_f:" << max_f; - } } CHECK_EQ(qkv_offset, qkv_len); weight_tensor_vec[0]->Resize({weight_dim1_acc, weight_dims_vec[0][0]}); - (*fc_weight_max)[start] = max_f; - VLOG(3) << "QKV fused FC-" << start << ", weight_max:" << max_f; memcpy(weight_tensor_vec[0]->mutable_data(), weight_qkv_trans.get(), qkv_len * sizeof(int8_t)); @@ -1174,9 
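The q/k/v fusion above packs three already-quantized int8 weight tensors back to back so that a single FC can produce q, k and v in one call; with per-channel scales available there is no longer a need to collapse the three weight maxima into a single max_f. A self-contained sketch of the packing step (function name is illustrative, not the pass's):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<int8_t> FuseQKVWeights(const std::vector<const int8_t*>& weights,
                                   const std::vector<size_t>& lengths) {
  size_t total = 0;
  for (size_t len : lengths) total += len;
  std::vector<int8_t> fused(total);
  size_t offset = 0;
  for (size_t i = 0; i < weights.size(); ++i) {
    // copy q, then k, then v, tracking the running offset
    std::memcpy(fused.data() + offset, weights[i], lengths[i] * sizeof(int8_t));
    offset += lengths[i];
  }
  return fused;  // offset == total by construction
}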
+1233,9 @@ class XPUMultiEncoderFusePass : public ProgramPass { std::vector input_poss{"X", "Y"}; std::vector qkv_ln_2_out_poss{"X", "Y"}; std::vector matmul_types{"matmul", "matmul_v2"}; - std::vector mul_types{"mul", "matmul"}; + std::vector mul_types{"mul", "matmul", "matmul_v2"}; std::vector with_q_scales{true, false}; - std::vector norm_befores{false}; + std::vector norm_befores{true, false}; std::string fc_precision; bool adaptive_seqlen = false; diff --git a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_slice_link_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_slice_link_fuse_pass.cc index 505e912ff5d..2d009df752e 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_slice_link_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_slice_link_fuse_pass.cc @@ -25,14 +25,17 @@ namespace fusion { class XPUMultiEncoderSliceLinkFuser : public FuseBase { public: + explicit XPUMultiEncoderSliceLinkFuser(bool pre_ln = false) + : pre_ln_(pre_ln) {} void BuildPattern() override { auto* xpu_encoder = OpNode("xpu_encoder", "__xpu__multi_encoder"); auto* encoder_out = VarNode("encoder_out") ->assert_is_op_output("__xpu__multi_encoder", "Output") - ->assert_is_op_input("slice", "Input") - ->assert_only_one_output() - ->AsIntermediate(); + ->assert_only_one_output(); + PMNode* layer_norm = nullptr; + PMNode* layer_norm_out = nullptr; + auto* slice = OpNode("slice", "slice") ->assert_op_attr_satisfied>( "axes", @@ -45,13 +48,28 @@ class XPUMultiEncoderSliceLinkFuser : public FuseBase { return attr.size() == 1 && attr[0] == 0; }) ->assert_op_attr_satisfied>( - "ends", - [](const std::vector& attr) { + "ends", [](const std::vector& attr) { return attr.size() == 1 && attr[0] == 1; - }) - ->AsIntermediate(); + }); + if (pre_ln_) { + xpu_encoder->assert_op_attr("norm_before", true); + encoder_out->assert_is_op_input("layer_norm", "X"); + layer_norm = OpNode("layer_norm", "layer_norm"); + layer_norm_out = VarNode("layer_norm_out") + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("slice", "Input"); + } else { + xpu_encoder->assert_op_attr("norm_before", false); + encoder_out->assert_is_op_input("slice", "Input")->AsIntermediate(); + slice->AsIntermediate(); + } auto* slice_out = VarNode("slice_out")->assert_is_op_output("slice", "Out"); - *xpu_encoder >> *encoder_out >> *slice >> *slice_out; + if (pre_ln_) { + *xpu_encoder >> *encoder_out >> *layer_norm >> *layer_norm_out >> + *slice >> *slice_out; + } else { + *xpu_encoder >> *encoder_out >> *slice >> *slice_out; + } } void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { @@ -62,7 +80,9 @@ class XPUMultiEncoderSliceLinkFuser : public FuseBase { auto slice_op_desc = *slice_instruct->op_info(); std::string slice_out_name = matched.at("slice_out")->arg()->name; - encoder_op_desc.SetOutput("Output", {slice_out_name}); + if (!pre_ln_) { + encoder_op_desc.SetOutput("Output", {slice_out_name}); + } auto slice_axes = slice_op_desc.GetAttr>("axes"); encoder_op_desc.SetAttr("slice_axes", slice_axes); if (slice_op_desc.HasAttr("starts")) { @@ -79,8 +99,13 @@ class XPUMultiEncoderSliceLinkFuser : public FuseBase { encoder_op_desc.SetAttr("slice_decrease_axis", slice_decrease_axis); } encoder_instruct->ResetOp(encoder_op_desc, encoder_op->valid_places()); - DirectedLink(matched.at("xpu_encoder"), matched.at("slice_out")); + if (!pre_ln_) { + DirectedLink(matched.at("xpu_encoder"), matched.at("slice_out")); + } } + + private: + bool pre_ln_; }; } // namespace fusion @@ -88,8 
+113,11 @@ class XPUMultiEncoderSliceLinkFuser : public FuseBase { class XPUMultiEncoderSliceLinkFusePass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override { - fusion::XPUMultiEncoderSliceLinkFuser fuser; - fuser(graph.get()); + std::vector pre_lns{true, false}; + for (auto pre_ln : pre_lns) { + fusion::XPUMultiEncoderSliceLinkFuser fuser(pre_ln); + fuser(graph.get()); + } } }; diff --git a/lite/core/optimizer/mir/static_kernel_pick_pass.cc b/lite/core/optimizer/mir/static_kernel_pick_pass.cc index 92695aa9ed7..236173558d0 100644 --- a/lite/core/optimizer/mir/static_kernel_pick_pass.cc +++ b/lite/core/optimizer/mir/static_kernel_pick_pass.cc @@ -193,4 +193,5 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(static_kernel_pick_pass, paddle::lite::mir::StaticKernelPickPass) - .BindTargets({TARGET(kAny)}); + .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}); diff --git a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc index d5032333a57..8bfc85493d1 100644 --- a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc +++ b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc @@ -140,27 +140,6 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice(SSAGraph* graph) { var_nodes.insert( var_nodes.end(), op_node->outlinks.begin(), op_node->outlinks.end()); TargetType target_type; - for (auto* var_node : var_nodes) { - CHECK(var_node->IsArg()); - auto& arg = var_node->AsArg(); - if (arg.is_weight || arg.is_persist) continue; - std::string var_name = arg.name; - VLOG(4) << "OP VAR NAME IS " << var_name; - if (var_name.find("_xpu_max") != std::string::npos) continue; - if (invalid_var_names.count(var_name)) continue; - target_type = arg.type->target(); - if (is_host(target_type)) target_type = TARGET(kHost); - - if (!lifecycles[TargetToStr(target_type)].count(var_name)) { - lifecycles[TargetToStr(target_type)].emplace( - var_name, std::make_pair(max_lifecycle_, max_lifecycle_)); - } else { - int cur_life = lifecycles[TargetToStr(target_type)][var_name].second; - lifecycles[TargetToStr(target_type)][var_name].second = - (std::max)(max_lifecycle_, cur_life); - } - } - ++max_lifecycle_; auto inplace_op_node = inplace_op_nodes.find(op_type); if (inplace_op_node != inplace_op_nodes.end()) { @@ -171,6 +150,8 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice(SSAGraph* graph) { if (inplace) { auto in_arg_name = op_info->Input("X")[0]; auto out_arg_name = op_info->Output("Out")[0]; + if (invalid_var_names.count(in_arg_name)) continue; + if (invalid_var_names.count(out_arg_name)) continue; bool reuse = false; int i = 0; for (const auto& reuse_var_names : inpalce_reuse_var_names) { @@ -190,6 +171,28 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice(SSAGraph* graph) { } } } + + for (auto* var_node : var_nodes) { + CHECK(var_node->IsArg()); + auto& arg = var_node->AsArg(); + if (arg.is_weight || arg.is_persist) continue; + std::string var_name = arg.name; + VLOG(4) << "OP VAR NAME IS " << var_name; + if (var_name.find("_xpu_max") != std::string::npos) continue; + if (invalid_var_names.count(var_name)) continue; + target_type = arg.type->target(); + if (is_host(target_type)) target_type = TARGET(kHost); + + if (!lifecycles[TargetToStr(target_type)].count(var_name)) { + lifecycles[TargetToStr(target_type)].emplace( + var_name, std::make_pair(max_lifecycle_, max_lifecycle_)); + } else { + int cur_life = lifecycles[TargetToStr(target_type)][var_name].second; + 
lifecycles[TargetToStr(target_type)][var_name].second = + (std::max)(max_lifecycle_, cur_life); + } + } + ++max_lifecycle_; } } diff --git a/lite/core/optimizer/optimizer.cc b/lite/core/optimizer/optimizer.cc index 17a6a62ba47..68ba7e91107 100644 --- a/lite/core/optimizer/optimizer.cc +++ b/lite/core/optimizer/optimizer.cc @@ -14,6 +14,9 @@ #include "lite/core/optimizer/optimizer.h" #include +#ifdef LITE_WITH_XPU +#include "lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h" +#endif #include "lite/core/optimizer/mir/static_kernel_pick_pass.h" #include "lite/core/optimizer/mir/type_target_cast_pass.h" #include "lite/model_parser/model_parser.h" @@ -49,7 +52,6 @@ std::unique_ptr Optimizer::Run(Program&& program) { graph->SetValidPlaces(valid_places_); graphs_.emplace_back(std::move(graph)); } - SpecifyKernelPickTactic(kernel_pick_factor_); InitTargetTypeTransformPass(); InitControlFlowOpUnusedInputsAndOutputsEliminatePass(); @@ -63,8 +65,12 @@ std::unique_ptr Optimizer::Run(Program&& program) { } void Optimizer::SpecifyKernelPickTactic(core::KernelPickFactor factor) { + std::string static_pick_name = "static_kernel_pick_pass"; +#ifdef LITE_WITH_XPU + static_pick_name = "__xpu__static_kernel_pick_pass"; +#endif auto* pass = mir::PassManager::Global().LookUp( - "static_kernel_pick_pass"); + static_pick_name); CHECK(pass); *pass->mutable_kernel_pick_factors() = factor; @@ -218,6 +224,9 @@ std::unique_ptr RunDefaultOptimizer( "fpga_concat_fuse_pass", "control_flow_op_unused_inputs_and_outputs_eliminate_pass", "static_kernel_pick_pass", // pick original kernel from graph +#ifdef LITE_WITH_XPU + "__xpu__static_kernel_pick_pass", // xpu pick original kernel from graph +#endif "remove_tf_redundant_ops_pass", "variable_place_inference_pass", // inference arg/var's diff --git a/lite/kernels/host/tile_compute.cc b/lite/kernels/host/tile_compute.cc index b1a61aebc41..11d4d013cc6 100644 --- a/lite/kernels/host/tile_compute.cc +++ b/lite/kernels/host/tile_compute.cc @@ -85,9 +85,10 @@ void TileCompute::Run() { int dst_stride = in_stride[i + 1] * right; for (int m = 0; m < num; m++) { for (int j = 0; j < bcast_dims[i]; j++) { - std::memcpy(tmp_dst + j * dst_stride / bcast_dims[i] + m * dst_stride, - tmp_src + m * dst_stride / bcast_dims[i], - dst_stride / bcast_dims[i] * sizeof(T)); + std::memcpy( + tmp_dst + j * (dst_stride / bcast_dims[i]) + m * dst_stride, + tmp_src + m * (dst_stride / bcast_dims[i]), + dst_stride / bcast_dims[i] * sizeof(T)); } } tmp_src_tensor.CopyDataFrom(tmp_dst_tensor); diff --git a/lite/kernels/x86/slice_compute.cc b/lite/kernels/x86/slice_compute.cc index 88194d5c8c0..3bd26fb4511 100644 --- a/lite/kernels/x86/slice_compute.cc +++ b/lite/kernels/x86/slice_compute.cc @@ -33,6 +33,25 @@ REGISTER_LITE_KERNEL(slice, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))}) .Finalize(); +REGISTER_LITE_KERNEL(slice, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SliceCompute, + array_def) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kX86), PRECISION(kFloat))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))}) + .Finalize(); + REGISTER_LITE_KERNEL(slice, kX86, 
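Regarding the reordered loop in xpu_memory_optimize_pass.cc above: the pass records, for every non-persistent variable, the first and last step at which any op touches it, and buffers whose [first, last] intervals do not overlap can later share memory. A minimal sketch of that bookkeeping with simplified types (the real pass additionally skips weights, *_xpu_max tensors and invalid names):

#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include <vector>

using Lifetime = std::pair<int, int>;  // (first step, last step)

std::map<std::string, Lifetime> CollectLifetimes(
    const std::vector<std::vector<std::string>>& vars_per_op) {
  std::map<std::string, Lifetime> lifetimes;
  for (int step = 0; step < static_cast<int>(vars_per_op.size()); ++step) {
    for (const auto& name : vars_per_op[step]) {
      auto it = lifetimes.find(name);
      if (it == lifetimes.end()) {
        lifetimes.emplace(name, Lifetime{step, step});           // first sighting
      } else {
        it->second.second = std::max(it->second.second, step);   // extend last use
      }
    }
  }
  return lifetimes;
}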
kFloat, @@ -52,6 +71,25 @@ REGISTER_LITE_KERNEL(slice, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) .Finalize(); +REGISTER_LITE_KERNEL(slice, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SliceCompute, + array_int32) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kX86), PRECISION(kInt32))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) + .Finalize(); + REGISTER_LITE_KERNEL(slice, kX86, kFloat, @@ -70,3 +108,22 @@ REGISTER_LITE_KERNEL(slice, {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) .Finalize(); + +REGISTER_LITE_KERNEL(slice, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SliceCompute, + array_int64) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kX86), PRECISION(kInt64))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/x86/slice_compute.h b/lite/kernels/x86/slice_compute.h index 1de0c368c68..4d448f47049 100644 --- a/lite/kernels/x86/slice_compute.h +++ b/lite/kernels/x86/slice_compute.h @@ -28,6 +28,47 @@ namespace lite { namespace kernels { namespace x86 { +void DealTensorArray(const std::vector* XTensorList, + std::vector* OutTensorList, + lite::Tensor* Out, + const std::vector& starts, + const std::vector& ends, + bool out_is_array) { + auto in_array = XTensorList; + // If the input is LoDTensorArray, the rank of input is 1. + int64_t in_size = in_array->size(); + int64_t start = starts[0] < 0 ? (starts[0] + in_size) : starts[0]; + int64_t end = ends[0] < 0 ? 
(ends[0] + in_size) : ends[0]; + + start = std::max(start, static_cast(0)); + end = std::max(end, static_cast(0)); + end = std::min(end, in_size); + + CHECK_GT(end, start) << "end should greater than start"; + int64_t out_size = end - start; + + if (out_is_array) { + auto out_array = OutTensorList; + out_array->resize(out_size); + for (int i = 0; i < out_size; ++i) { + auto* out_tensor = &out_array->at(i); + auto in_tensor = in_array->at(i + start); + out_tensor->set_lod(in_tensor.lod()); + if (in_tensor.memory_size() > 0) { + out_tensor->CopyDataFrom(in_tensor); + } else { + VLOG(4) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << i << "]."; + } + } + } else { + auto out_tensor = Out; + auto in_tensor = in_array->at(start); + out_tensor->CopyDataFrom(in_tensor); + } +} + inline std::vector GetIntDataFromTensorList( const std::vector& list_tensor) { std::vector vec_data; @@ -219,6 +260,8 @@ void slice_compute(const lite::Tensor* in, template void slice_compute_(const lite::Tensor* Input, lite::Tensor* Out, + const std::vector* XTensorList, + std::vector* OutTensorList, std::vector axes, std::vector starts, std::vector ends, @@ -228,6 +271,38 @@ void slice_compute_(const lite::Tensor* Input, std::vector StartsTensorList, std::vector EndsTensorList, std::vector infer_flags) { + if (Input == nullptr && XTensorList != nullptr) { + bool need_infer = false; + if (StartsTensor || EndsTensor) { + need_infer = true; + } + if (StartsTensorList.size() > 0 || EndsTensorList.size() > 0) { + need_infer = true; + } + if (need_infer) { + if (StartsTensor) { + starts = GetIntDataFromTensor(StartsTensor); + } else if (StartsTensorList.size() > 0) { + starts = GetIntDataFromTensorList(StartsTensorList); + } + CHECK_EQ(starts.size(), axes.size()) + << "The size of starts must be equal to the size of axes."; + if (EndsTensor) { + ends = GetIntDataFromTensor(EndsTensor); + } else if (EndsTensorList.size() > 0) { + ends = GetIntDataFromTensorList(EndsTensorList); + } + CHECK_EQ(ends.size(), axes.size()) + << "The size of starts must be equal to the size of axes."; + } + DealTensorArray(XTensorList, + OutTensorList, + Out, + starts, + ends, + (Out == nullptr && OutTensorList != nullptr)); + return; + } int rank = Input->dims().size(); switch (rank) { case 1: @@ -320,6 +395,8 @@ class SliceCompute : public KernelLite { auto& param = *param_.get_mutable(); slice_compute_(param.X, param.Out, + param.XTensorList, + param.OutTensorList, param.axes, param.starts, param.ends, diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt index 29266862dea..1efed16ac05 100644 --- a/lite/kernels/xpu/CMakeLists.txt +++ b/lite/kernels/xpu/CMakeLists.txt @@ -30,6 +30,7 @@ add_kernel(gru_compute_xpu XPU basic SRCS gru_compute.cc) add_kernel(gru_unit_compute_xpu XPU basic SRCS gru_unit_compute.cc) add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc) add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc) +add_kernel(tile_compute_xpu XPU basic SRCS tile_compute.cc) add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc) add_kernel(sequence_topk_avg_pooling_compute_xpu XPU basic SRCS sequence_topk_avg_pooling_compute.cc) add_kernel(concat_compute_xpu XPU basic SRCS concat_compute.cc) diff --git a/lite/kernels/xpu/__xpu__bigru_compute.cc b/lite/kernels/xpu/__xpu__bigru_compute.cc index 0fb9e3c3fe2..780f904e525 100644 --- a/lite/kernels/xpu/__xpu__bigru_compute.cc +++ b/lite/kernels/xpu/__xpu__bigru_compute.cc @@ -55,13 +55,14 @@ 
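The DealTensorArray helper above slices a LoDTensorArray along its single axis; the only subtle part is normalizing negative starts/ends and clamping them to the array size. A standalone sketch of that normalization (the function name is illustrative): with start = -2, end = 5 and an array of size 5 it yields the range [3, 5).

#include <algorithm>
#include <cstdint>

void NormalizeSliceRange(int64_t size, int64_t* start, int64_t* end) {
  if (*start < 0) *start += size;  // negative indices count from the back
  if (*end < 0) *end += size;
  *start = std::max<int64_t>(*start, 0);
  *end = std::min(std::max<int64_t>(*end, 0), size);  // clamp to [0, size]
}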
void XPUBiGRUCompute::PrepareBiasForRun(bool forward) { void XPUBiGRUCompute::PrepareMulWeightForRun(bool forward) { auto& mul_quant_weight_ = forward ? fw_mul_quant_weight_ : bw_mul_quant_weight_; + auto& ctx = this->ctx_->template As(); auto& param = this->template Param(); auto* weight = forward ? param.fw_mul_w : param.bw_mul_w; auto weight_ptr = weight->data(); auto weight_dims = weight->dims(); mul_quant_weight_ = TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - weight_ptr, weight_dims, true); + weight_ptr, weight_dims, true, ctx.GetRawContext()->max_ptr_size()); } void XPUBiGRUCompute::PrepareGRUWeightForRun(bool forward) { diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.cc b/lite/kernels/xpu/__xpu__conv2d_compute.cc index cad9a4fd691..8c267843f8b 100644 --- a/lite/kernels/xpu/__xpu__conv2d_compute.cc +++ b/lite/kernels/xpu/__xpu__conv2d_compute.cc @@ -22,36 +22,12 @@ namespace lite { namespace kernels { namespace xpu { -template -bool QuantFilter(const float* filter_on_host, - T* quant_res, - float max, - int64_t len) { - return false; -} - -template <> -bool QuantFilter(const float* filter_on_host, - int16_t* quant_res, - float max, - int64_t len) { - paddle::lite::xpu::math::ConvertFP32ToInt16( - filter_on_host, quant_res, max, len); - return true; -} - -template <> -bool QuantFilter(const float* filter_on_host, - int8_t* quant_res, - float max, - int64_t len) { - paddle::lite::xpu::math::ConvertFP32ToInt8( - filter_on_host, quant_res, max, len); - return true; -} - -template -void XPUConv2dCompute::PrepareForRun() { +template +void XPUConv2dCompute::PrepareForRun() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int max_ptr_size = ctx.GetRawContext()->max_ptr_size(); @@ -60,12 +36,16 @@ void XPUConv2dCompute::PrepareForRun() { auto filter_dims = param.filter->dims(); xpu_quant_filter_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - filter_ptr, filter_dims, false); + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + filter_ptr, filter_dims, false, max_ptr_size); } -template -void XPUConv2dCompute::Run() { +template +void XPUConv2dCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -86,8 +66,8 @@ void XPUConv2dCompute::Run() { param.output_max->template mutable_data(TARGET(kXPU)); const auto* bias = param.has_bias ? param.bias->template data() : nullptr; - const float* branch = - param.has_branch ? param.branch->template data() : nullptr; + const DY* branch = + param.has_branch ? param.branch->template data() : nullptr; const float* input_max = param.input_max ? 
param.input_max->template data() : nullptr; xdnn::Activation_t act((xdnn::Activation_t::act_enum)act_type); @@ -101,15 +81,15 @@ void XPUConv2dCompute::Run() { CHECK_EQ(act_type, 0); if (branch_broadcast_guard_.get() == nullptr) { branch_broadcast_guard_ = TargetWrapperXPU::MallocScratchPad( - param.output->numel() * sizeof(float)); + param.output->numel() * sizeof(DY)); } else { - branch_broadcast_guard_->Reserve(param.output->numel() * sizeof(float)); + branch_broadcast_guard_->Reserve(param.output->numel() * sizeof(DY)); } - int r = xdnn::conv2d_fusion( + int r = xdnn::conv2d_fusion( ctx.GetRawContext(), - param.input->template data(), - reinterpret_cast(xpu_quant_filter_.data_ptr_), - reinterpret_cast(branch_broadcast_guard_->addr_), + param.input->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), + reinterpret_cast(branch_broadcast_guard_->addr_), batch, img_c, img_h, @@ -139,21 +119,21 @@ void XPUConv2dCompute::Run() { if (branch_shape > conv_out_shape) { param.output->Resize(lite::DDim(branch_shape)); } - float* output = param.output->template mutable_data(TARGET(kXPU)); - r = xdnn::broadcast_add( + DY* output = param.output->template mutable_data(TARGET(kXPU)); + r = xdnn::broadcast_add( ctx.GetRawContext(), - reinterpret_cast(branch_broadcast_guard_->addr_), + reinterpret_cast(branch_broadcast_guard_->addr_), branch, output, xshape, yshape); CHECK_EQ(r, 0); } else { - float* output = param.output->template mutable_data(TARGET(kXPU)); - int r = xdnn::conv2d_fusion( + DY* output = param.output->template mutable_data(TARGET(kXPU)); + int r = xdnn::conv2d_fusion( ctx.GetRawContext(), - param.input->template data(), - reinterpret_cast(xpu_quant_filter_.data_ptr_), + param.input->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), output, batch, img_c, @@ -182,11 +162,27 @@ void XPUConv2dCompute::Run() { } // namespace paddle namespace xpu = paddle::lite::kernels::xpu; -using XPUConv2dFp32 = xpu::XPUConv2dCompute; -using XPUConv2dInt8 = xpu::XPUConv2dCompute; +using XPUConv2dFP32 = + xpu::XPUConv2dCompute; + +using XPUConv2d_FP16_FP32_FP32 = + xpu::XPUConv2dCompute; + +using XPUConv2dFp16 = + xpu::XPUConv2dCompute; + +using XPUConv2d_FP16_FP16_FP32 = + xpu::XPUConv2dCompute; -REGISTER_LITE_KERNEL(__xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2dFp32, def) +using XPUConv2d_FP16_FP32_FP16 = + xpu::XPUConv2dCompute; + +using XPUConv2dInt8_FP32_FP32 = + xpu::XPUConv2dCompute; + +REGISTER_LITE_KERNEL( + __xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2d_FP16_FP32_FP32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) @@ -196,7 +192,71 @@ REGISTER_LITE_KERNEL(__xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2dFp32, def) .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); -REGISTER_LITE_KERNEL(__xpu__conv2d, kXPU, kInt8, kNCHW, XPUConv2dInt8, def) +REGISTER_LITE_KERNEL( + __xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2dFP32, XPU_Real_kFloat) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Branch", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + 
__xpu__conv2d, kXPU, kFP16, kNCHW, XPUConv2dFp16, XPU_FP16_FP16__FP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Branch", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kFP16, + kNCHW, + XPUConv2d_FP16_FP16_FP32, + XPU_FP16_FP16__FP32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Branch", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kFP16, + kNCHW, + XPUConv2d_FP16_FP32_FP16, + XPU_FP16_FP32__FP16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Branch", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kInt8, + kNCHW, + XPUConv2dInt8_FP32_FP32, + XPU_Int8_FP32_FP32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.h b/lite/kernels/xpu/__xpu__conv2d_compute.h index 69a9aec69c8..c3c31d94743 100644 --- a/lite/kernels/xpu/__xpu__conv2d_compute.h +++ b/lite/kernels/xpu/__xpu__conv2d_compute.h @@ -21,8 +21,11 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -template +template class XPUConv2dCompute : public KernelLite { public: using param_t = operators::XPUBlockFuseParam; diff --git a/lite/kernels/xpu/__xpu__fc_compute.cc b/lite/kernels/xpu/__xpu__fc_compute.cc index 4cd429a836e..e3e465da5d2 100644 --- a/lite/kernels/xpu/__xpu__fc_compute.cc +++ b/lite/kernels/xpu/__xpu__fc_compute.cc @@ -24,75 +24,102 @@ namespace lite { namespace kernels { namespace xpu { -void XPUFcCompute::PrepareForRun() { +template +void XPUFcCompute::PrepareForRun() { auto& ctx = this->ctx_->template As(); auto& param = this->template Param(); - auto w_ptr = param.w->data(); + auto w_ptr = param.w->template data(); auto weight_dims = param.w->dims(); - bool quant_int8 = false; - if (param.quant_w_max > 0.f) { - quant_int8 = true; - } + bool w_trans = param.transpose_w; + enable_int8_ = param.enable_int8; + per_channel_ = param.per_channel; // max int max_ptr_size = ctx.GetRawContext()->max_ptr_size(); - input_max_guard_ = - TargetWrapperXPU::MallocScratchPad(max_ptr_size * sizeof(float)); - if 
(quant_int8) { // for paddle slim int8 quant + if (enable_int8_) { // for paddle slim int8 quant + input_max_guard_ = + TargetWrapperXPU::MallocScratchPad(max_ptr_size * sizeof(float)); xpu_quant_weight_ = TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - reinterpret_cast(w_ptr), weight_dims, true); - std::vector cpu_w_max(max_ptr_size, param.quant_w_max); + reinterpret_cast(w_ptr), + weight_dims, + w_trans, + per_channel_ ? param.weight_max.size() : max_ptr_size); CHECK(xpu_quant_weight_.max_ptr_ != nullptr) << "slim int8 quant xpu_quant_weight_max_ptr should't be null"; - lite::TargetWrapperXPU::MemcpySync(xpu_quant_weight_.max_ptr_, - cpu_w_max.data(), - sizeof(float) * max_ptr_size, - IoDirection::HtoD); std::vector cpu_input_max(max_ptr_size, param.quant_input_max); lite::TargetWrapperXPU::MemcpySync(input_max_guard_->addr_, cpu_input_max.data(), sizeof(float) * max_ptr_size, IoDirection::HtoD); + if (per_channel_) { + lite::TargetWrapperXPU::MemcpySync( + xpu_quant_weight_.max_ptr_, + param.weight_max.data(), + sizeof(float) * param.weight_max.size(), + IoDirection::HtoD); + } else { + VLOG(3) << "set weight max :" << max_ptr_size + << ", param.weight_max[0]:" << param.weight_max[0]; + std::vector cpu_w_max(max_ptr_size, param.weight_max[0]); + lite::TargetWrapperXPU::MemcpySync(xpu_quant_weight_.max_ptr_, + cpu_w_max.data(), + sizeof(float) * max_ptr_size, + IoDirection::HtoD); + } return; - } - - if (param.precision == "int31") { - xpu_quant_weight_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - w_ptr, weight_dims, true); - CHECK(xpu_quant_weight_.max_ptr_ == nullptr) - << "int31 weight max should be null"; - } else if (param.precision == "int16") { - xpu_quant_weight_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - w_ptr, weight_dims, true); - } else if (param.precision == "int8") { + } else { xpu_quant_weight_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - w_ptr, weight_dims, true); + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + w_ptr, weight_dims, w_trans, max_ptr_size); + if (std::is_same::value) { + VLOG(6) + << "If fc compute precision is int31,must check weight max should " + "be null "; + CHECK(xpu_quant_weight_.max_ptr_ == nullptr) + << "int31 weight max should be null"; + } } } - -void XPUFcCompute::Run() { +template +void XPUFcCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); auto input_dims = param.input->dims(); + if (param.in_num_col_dims == -1) { + param.in_num_col_dims += input_dims.size(); + } auto in_mat_dims = input_dims.Flatten2D(param.in_num_col_dims); int m = in_mat_dims[0]; int k = in_mat_dims[1]; int n = param.w->dims()[1]; - bool quant_int8 = param.quant_w_max > 0.f; int max_ptr_size = ctx.GetRawContext()->max_ptr_size(); param.output_max->Resize({max_ptr_size}); - float* output_max = quant_int8 - ? nullptr - : param.output_max->mutable_data(TARGET(kXPU)); - const auto* bias = param.has_bias ? param.bias->data() : nullptr; + bool x_trans = param.transpose_x; + bool w_trans = param.transpose_w; + int ldx = (x_trans ? m : k); + int ldw = (w_trans ? k : n); + int ldy = n; + + float* output_max = + enable_int8_ + ? nullptr + : param.output_max->template mutable_data(TARGET(kXPU)); + const auto* bias = + param.has_bias ? param.bias->template data() : nullptr; const float* input_max = - quant_int8 ? reinterpret_cast(input_max_guard_->addr_) - : (param.input_max ? param.input_max->data() : nullptr); + enable_int8_ ? 
reinterpret_cast(input_max_guard_->addr_) + : (param.input_max ? param.input_max->template data() + : nullptr); xdnn::Activation_t act((xdnn::Activation_t::act_enum)param.act_type); if (param.act_type == 5) { act.leaky_alpha = param.act_param; @@ -101,82 +128,53 @@ void XPUFcCompute::Run() { act.hard_sigmoid_slope = param.act_param; } // TODO(weihaoji): remove fc_int31 and fc_int16 after xpu fc wrapper refactor - if (param.precision == "int31") { - int r = xdnn::fc_fusion( - ctx.GetRawContext(), // ctx - param.input->data(), // x - reinterpret_cast(xpu_quant_weight_.data_ptr_), // w - param.output->mutable_data(TARGET(kXPU)), // y - m, // m - n, // n - k, // k - false, // x_trans - true, // w_trans - input_max, // x_maxptr - reinterpret_cast(xpu_quant_weight_.max_ptr_), // w_maxptr - output_max, // y_maxptr - k, // ldx - k, // ldw - n, // ldy - 1.0f, // alpha - 0.0f, // beta - bias, // bias + int r = 0; + if (per_channel_) { + r = xdnn::fc_fusion_pc( + ctx.GetRawContext(), // ctx + param.input->template data(), // x + reinterpret_cast(xpu_quant_weight_.data_ptr_), // w + param.output->template mutable_data(TARGET(kXPU)), // y + m, // m + n, // n + k, // k + x_trans, // x_trans + w_trans, // w_trans + input_max, // x_maxptr + nullptr, // w_maxptr + output_max, // y_maxptr + ldx, // ldx + ldw, // ldw + ldy, // ldy + 1.0f, // alpha + 0.0f, // beta + bias, // bias + reinterpret_cast( + xpu_quant_weight_.max_ptr_), // per channel weight_max act); - CHECK_EQ(r, 0); - } else if (param.precision == "int16") { - int r = 0; - r = xdnn::fc_fusion( - ctx.GetRawContext(), // ctx - param.input->data(), // x - reinterpret_cast(xpu_quant_weight_.data_ptr_), // w - param.output->mutable_data(TARGET(kXPU)), // y - m, // m - n, // n - k, // k - false, // x_trans - true, // w_trans + } else { + r = xdnn::fc_fusion( + ctx.GetRawContext(), // ctx + param.input->template data(), // x + reinterpret_cast(xpu_quant_weight_.data_ptr_), // w + param.output->template mutable_data(TARGET(kXPU)), // y + m, // m + n, // n + k, // k + x_trans, // x_trans + w_trans, // w_trans input_max, // x_maxptr reinterpret_cast(xpu_quant_weight_.max_ptr_), // w_maxptr output_max, // y_maxptr - k, // ldx - k, // ldw - n, // ldy + ldx, // ldx + ldw, // ldw + ldy, // ldy 1.0f, // alpha 0.0f, // beta bias, // bias - act); // act - - CHECK_EQ(r, 0); - } else if (param.precision == "int8") { - bool x_trans = false; - bool w_trans = true; - int ldx = (x_trans ? m : k); - int ldw = (w_trans ? 
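The per-channel branch above hands the FC kernel one maximum per output channel instead of a single scalar. For intuition, here is a minimal sketch of per-channel int8 weight quantization, assuming a row-major [n, k] weight with one row per output channel; this is not the XPU quantizer itself, only the underlying arithmetic:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

void QuantizeWeightPerChannel(const float* w, int n, int k,
                              std::vector<int8_t>* q,
                              std::vector<float>* channel_max) {
  q->resize(static_cast<size_t>(n) * k);
  channel_max->resize(n);
  for (int row = 0; row < n; ++row) {
    float m = 0.0f;  // max-abs of this output channel
    for (int col = 0; col < k; ++col) m = std::max(m, std::abs(w[row * k + col]));
    (*channel_max)[row] = m;
    const float scale = (m > 0.0f) ? 127.0f / m : 0.0f;
    for (int col = 0; col < k; ++col) {
      (*q)[static_cast<size_t>(row) * k + col] =
          static_cast<int8_t>(std::lround(w[row * k + col] * scale));
    }
  }
}

Per-channel scales recover more precision when weight magnitudes vary strongly across output channels, which is why the fuse pass keeps the full Y0_scale vector instead of folding it into one number.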
k : n); - int ldy = n; - int r = xdnn::fc_fusion( - ctx.GetRawContext(), /* context */ - param.input->data(), /* x */ - reinterpret_cast(xpu_quant_weight_.data_ptr_), - param.output->mutable_data(TARGET(kXPU)), /* y */ - m, /* m */ - n, /* n */ - k, /* k */ - x_trans, /* x_trans */ - w_trans, /* w_trans */ - input_max, /* x_max */ - reinterpret_cast(xpu_quant_weight_.max_ptr_), /* w_max */ - output_max, /* y_max */ - ldx, /* ldx */ - ldw, /* ldw */ - ldy, /* ldy */ - 1.0f, /* alpha */ - 0.0f, /* beta */ - bias, /* bias */ - act); /* act_type */ - CHECK_EQ(r, 0); - } else { - LOG(FATAL) << "Unsupport XPUFC Precision: " << param.precision; + act); } + CHECK_EQ(r, 0); } } // namespace xpu @@ -184,12 +182,28 @@ void XPUFcCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(__xpu__fc, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::XPUFcCompute, - def) +namespace xpu = paddle::lite::kernels::xpu; + +using XPUFC_FP32 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP32_FP32 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP16_FP16 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP32_FP16 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP16_FP32 = + xpu::XPUFcCompute; + +using XPUFC_Int8_FP32_FP32 = + xpu::XPUFcCompute; + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFloat, kNCHW, XPUFC_FP32, XPU_Real_kFloat) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) @@ -197,3 +211,58 @@ REGISTER_LITE_KERNEL(__xpu__fc, .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__fc, kXPU, kFloat, kNCHW, XPUFC_FP16_FP32_FP32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFP16, kNCHW, XPUFC_FP16_FP16_FP16, XPUFC_FP16_FP16_FP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFP16, kNCHW, XPUFC_FP16_FP32_FP16, XPUFC_FP16_FP32_FP16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFP16, kNCHW, XPUFC_FP16_FP16_FP32, XPUFC_FP16_FP16_FP32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", 
{LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFloat, kNCHW, XPUFC_Int8_FP32_FP32, XPU_Int8_FP32_FP32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__fc_compute.h b/lite/kernels/xpu/__xpu__fc_compute.h index 687f8d5e9c1..ffb17c8abe2 100644 --- a/lite/kernels/xpu/__xpu__fc_compute.h +++ b/lite/kernels/xpu/__xpu__fc_compute.h @@ -20,8 +20,12 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -class XPUFcCompute : public KernelLite { +template +class XPUFcCompute : public KernelLite { public: using param_t = operators::XPUFcParam; @@ -32,10 +36,10 @@ class XPUFcCompute : public KernelLite { virtual ~XPUFcCompute() = default; private: - // TODO(weihaoji): remove cpu w_max after xpu fc wrapper refactor - float w_max; XPUScratchPadGuard input_max_guard_; XPUQuantData xpu_quant_weight_; + bool per_channel_; + bool enable_int8_; }; } // namespace xpu diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.cc b/lite/kernels/xpu/__xpu__multi_encoder_compute.cc index 82444bbec2f..e96f7121a11 100644 --- a/lite/kernels/xpu/__xpu__multi_encoder_compute.cc +++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.cc @@ -91,7 +91,51 @@ void XPUMultiEncoderCompute::prepare_quant_max( } return; } - +void XPUMultiEncoderCompute::prepare_weight_max( + int n_layers, + bool per_channel, + const lite::Tensor* weight_max, + int max_ptr_len, + const std::vector& fc_channels, + std::vector& max_xpu_ptrs) { + // prepare weight_max + int max_ext_times = max_ptr_len; + int total_channels = 0; + if (per_channel) { + max_ext_times = 1; + CHECK_EQ(fc_channels.size(), n_layers * 6) << fc_channels.size(); + for (auto channel : fc_channels) { + total_channels += channel; + } + CHECK_EQ(weight_max->numel(), total_channels) + << "weight_max->numel: " << weight_max->numel() + << ", total_channels: " << total_channels; + } + int len = weight_max->numel() * max_ext_times * sizeof(float); + weight_max_guard_ = TargetWrapperXPU::MallocScratchPad(len); + float* weight_max_ptr = reinterpret_cast(weight_max_guard_->addr_); + if (per_channel) { + lite::TargetWrapperXPU::MemcpySync( + weight_max_ptr, weight_max->data(), len, IoDirection::HtoD); + float* cur_ptr = weight_max_ptr; + for (int i = 0; i < fc_channels.size(); ++i) { + max_xpu_ptrs.push_back(cur_ptr); + cur_ptr += fc_channels[i]; + } + CHECK_EQ(cur_ptr - weight_max_ptr, total_channels) + << weight_max_ptr << ", cur_ptr:" << cur_ptr; + } else { + for (int i = 0; i < weight_max->numel(); i++) { + float* cur_weight_max_ptr = weight_max_ptr + i * max_ptr_len; + std::vector cpu_max(max_ptr_len, weight_max->data()[i]); + lite::TargetWrapperXPU::MemcpySync(cur_weight_max_ptr, + cpu_max.data(), + sizeof(float) * max_ptr_len, + IoDirection::HtoD); + max_xpu_ptrs.push_back(cur_weight_max_ptr); + } + } +} void XPUMultiEncoderCompute::PrepareForRun() { auto& ctx = this->ctx_->template As(); 
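prepare_weight_max above stages the weight maxima on the host before copying them to the device: per-tensor maxima are each replicated max_ptr_len times (the XPU max-pointer granularity), while per-channel maxima are stored contiguously and each FC's pointer is derived from the running sum of fc_channels. A simplified host-side sketch that returns offsets instead of device pointers (all names here are hypothetical):

#include <cstddef>
#include <vector>

std::vector<size_t> LayoutWeightMax(const std::vector<float>& maxima,
                                    const std::vector<int>& fc_channels,
                                    bool per_channel, int max_ptr_len,
                                    std::vector<float>* host_buf) {
  std::vector<size_t> offsets;  // offset of each FC's max block in host_buf
  host_buf->clear();
  if (per_channel) {
    *host_buf = maxima;  // one value per channel, already back to back
    size_t off = 0;
    for (int c : fc_channels) {
      offsets.push_back(off);
      off += static_cast<size_t>(c);
    }
  } else {
    for (size_t i = 0; i < maxima.size(); ++i) {
      offsets.push_back(host_buf->size());
      host_buf->insert(host_buf->end(), static_cast<size_t>(max_ptr_len),
                       maxima[i]);  // replicate the scalar max
    }
  }
  return offsets;
}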
auto& param = this->template Param(); @@ -115,23 +159,15 @@ void XPUMultiEncoderCompute::PrepareForRun() { } else if (param.precision == "int31") { arg_fc_weight_fp32_ = prepare_weight(param.fc_weight); } + const int n_layers = param.fc_weight.size() / 6; const int XPU_QUANT_SCALE_NUM = ctx.GetRawContext()->max_ptr_size(); - // prepare weight_max - weight_max_guard_ = TargetWrapperXPU::MallocScratchPad( - param.fc_weight_max->numel() * XPU_QUANT_SCALE_NUM * sizeof(float)); - float* weight_max_ptr = reinterpret_cast(weight_max_guard_->addr_); - for (int i = 0; i < param.fc_weight_max->numel(); i++) { - float* cur_weight_max_ptr = weight_max_ptr + i * XPU_QUANT_SCALE_NUM; - std::vector cpu_max(XPU_QUANT_SCALE_NUM, - param.fc_weight_max->data()[i]); - lite::TargetWrapperXPU::MemcpySync(cur_weight_max_ptr, - cpu_max.data(), - sizeof(float) * XPU_QUANT_SCALE_NUM, - IoDirection::HtoD); - fc_weight_max_.push_back(cur_weight_max_ptr); - } + prepare_weight_max(n_layers, + param.per_channel, + param.weight_max, + XPU_QUANT_SCALE_NUM, + param.fc_channels, + fc_weight_max_); // prepare quant max, mul&matmul input/output max - const int n_layers = param.fc_weight.size() / 6; prepare_quant_max( param.input_max, n_layers, XPU_QUANT_SCALE_NUM, fc_input_max_); // prepare act_type @@ -170,7 +206,9 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) { slice_idx, true /* qkv fusion */, max_pad_seqlen, - param.hidden_dim); + param.hidden_dim, + param.norm_before, /*is_pre_norm*/ + param.per_channel); if (std::is_same::value) { CHECK_GT(fc_input_max_.size(), 0); } @@ -202,7 +240,8 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) { qkv_act, slice_idx, true, - param.hidden_dim); + param.hidden_dim, + param.norm_before); int r = xdnn::transformer_encoder( ctx.GetRawContext(), in, diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.h b/lite/kernels/xpu/__xpu__multi_encoder_compute.h index a32a413ff74..08ce7645eb8 100644 --- a/lite/kernels/xpu/__xpu__multi_encoder_compute.h +++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.h @@ -56,6 +56,12 @@ class XPUMultiEncoderCompute int n_layers, int max_ptr_len, std::vector &max_xpu_ptrs); + void prepare_weight_max(int n_layers, + bool per_channel, + const lite::Tensor *weight_max, + int max_ptr_len, + const std::vector &fc_channels, + std::vector &max_xpu_ptrs); template void run_encoder(const T *in, T *out); }; diff --git a/lite/kernels/xpu/__xpu__squeeze_excitation_compute.cc b/lite/kernels/xpu/__xpu__squeeze_excitation_compute.cc index 10a789b8428..3c51c269970 100644 --- a/lite/kernels/xpu/__xpu__squeeze_excitation_compute.cc +++ b/lite/kernels/xpu/__xpu__squeeze_excitation_compute.cc @@ -25,6 +25,7 @@ namespace xpu { void XPUSqueezeExcitationCompute::PrepareForRun() { auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); auto weight_ptr = param.filter->data(); auto weight_len = param.filter->numel(); auto weight1_len = weight_len / 2; @@ -33,12 +34,13 @@ void XPUSqueezeExcitationCompute::PrepareForRun() { auto weight2_dims = paddle::lite::DDimLite(); weight1_dims.ConstructFrom({weight1_len}); weight2_dims.ConstructFrom({weight2_len}); + auto max_ptr_len = ctx.GetRawContext()->max_ptr_size(); quant_weight1_ = TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - weight_ptr, weight1_dims, false); + weight_ptr, weight1_dims, false, max_ptr_len); quant_weight2_ = TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - weight_ptr + weight1_len, weight2_dims, false); + weight_ptr + weight1_len, weight2_dims, false, 
max_ptr_len); } void XPUSqueezeExcitationCompute::Run() { diff --git a/lite/kernels/xpu/activation_compute.cc b/lite/kernels/xpu/activation_compute.cc index 867acb68205..bb92854f0b8 100644 --- a/lite/kernels/xpu/activation_compute.cc +++ b/lite/kernels/xpu/activation_compute.cc @@ -21,13 +21,14 @@ namespace lite { namespace kernels { namespace xpu { -void ReluCompute::Run() { +template +void ReluCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int r = xdnn::relu(ctx.GetRawContext(), - param.X->data(), - param.Out->mutable_data(TARGET(kXPU)), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), param.X->numel()); CHECK_EQ(r, 0); } @@ -54,24 +55,26 @@ void GeluCompute::Run() { CHECK_EQ(r, 0); } -void TanhCompute::Run() { +template +void TanhCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int r = xdnn::tanh(ctx.GetRawContext(), - param.X->data(), - param.Out->mutable_data(TARGET(kXPU)), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), param.X->numel()); CHECK_EQ(r, 0); } -void SigmoidCompute::Run() { +template +void SigmoidCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int r = xdnn::sigmoid(ctx.GetRawContext(), - param.X->data(), - param.Out->mutable_data(TARGET(kXPU)), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), param.X->numel()); CHECK_EQ(r, 0); } @@ -205,13 +208,13 @@ void HardSigmoidCompute::Run() { CHECK_EQ(r, 0); } -void LeakyReluCompute::Run() { +template +void LeakyReluCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); - int r = xdnn::leaky_relu(ctx.GetRawContext(), - param.X->data(), - param.Out->mutable_data(TARGET(kXPU)), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), param.X->numel(), param.Leaky_relu_alpha); CHECK_EQ(r, 0); @@ -274,12 +277,20 @@ void PReluCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - relu, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ReluCompute, def) +using reluFP32 = + paddle::lite::kernels::xpu::ReluCompute; +using reluFP16 = + paddle::lite::kernels::xpu::ReluCompute; +REGISTER_LITE_KERNEL(relu, kXPU, kFloat, kNCHW, reluFP32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(relu, kXPU, kFP16, kNCHW, reluFP16, reluFP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL( relu6, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Relu6Compute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) @@ -292,21 +303,31 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); -REGISTER_LITE_KERNEL( - tanh, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::TanhCompute, def) +using tanhFP32 = + paddle::lite::kernels::xpu::TanhCompute; +using tanhFP16 = + paddle::lite::kernels::xpu::TanhCompute; +REGISTER_LITE_KERNEL(tanh, kXPU, kFloat, kNCHW, tanhFP32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(tanh, kXPU, kFP16, kNCHW, tanhFP16, tanhFP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", 
{LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); -REGISTER_LITE_KERNEL(sigmoid, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::SigmoidCompute, - def) +using sigmoidFP32 = + paddle::lite::kernels::xpu::SigmoidCompute; +using sigmoidFP16 = + paddle::lite::kernels::xpu::SigmoidCompute; +REGISTER_LITE_KERNEL(sigmoid, kXPU, kFloat, kNCHW, sigmoidFP32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(sigmoid, kXPU, kFP16, kNCHW, sigmoidFP16, sigmoidFP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); REGISTER_LITE_KERNEL( abs, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::AbsCompute, def) @@ -386,16 +407,21 @@ REGISTER_LITE_KERNEL(hard_swish, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); -REGISTER_LITE_KERNEL(leaky_relu, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::LeakyReluCompute, - def) +using leaky_reluFP32 = + paddle::lite::kernels::xpu::LeakyReluCompute; +using leaky_reluFP16 = + paddle::lite::kernels::xpu::LeakyReluCompute; +REGISTER_LITE_KERNEL(leaky_relu, kXPU, kFloat, kNCHW, leaky_reluFP32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL( + leaky_relu, kXPU, kFP16, kNCHW, leaky_reluFP16, leaky_reluFP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(softsign, kXPU, kFloat, diff --git a/lite/kernels/xpu/activation_compute.h b/lite/kernels/xpu/activation_compute.h index 057d527ef89..ab47e5ed580 100644 --- a/lite/kernels/xpu/activation_compute.h +++ b/lite/kernels/xpu/activation_compute.h @@ -20,7 +20,8 @@ namespace lite { namespace kernels { namespace xpu { -class ReluCompute : public KernelLite { +template +class ReluCompute : public KernelLite { public: using param_t = operators::ActivationParam; @@ -47,7 +48,8 @@ class GeluCompute : public KernelLite { virtual ~GeluCompute() = default; }; -class TanhCompute : public KernelLite { +template +class TanhCompute : public KernelLite { public: using param_t = operators::ActivationParam; @@ -56,7 +58,8 @@ class TanhCompute : public KernelLite { virtual ~TanhCompute() = default; }; -class SigmoidCompute : public KernelLite { +template +class SigmoidCompute : public KernelLite { public: using param_t = operators::ActivationParam; @@ -164,7 +167,8 @@ class HardSigmoidCompute : public KernelLite { virtual ~HardSigmoidCompute() = default; }; -class LeakyReluCompute : public KernelLite { +template +class LeakyReluCompute : public KernelLite { public: using param_t = operators::ActivationParam; diff --git a/lite/kernels/xpu/calib_compute.cc b/lite/kernels/xpu/calib_compute.cc index 34a6fb53d72..dc134fde02a 100644 --- a/lite/kernels/xpu/calib_compute.cc +++ b/lite/kernels/xpu/calib_compute.cc @@ -29,6 +29,9 @@ void CalibCompute::Run() { int numel = param.input->numel(); const auto* in_data = param.input->template data(); auto* out_data = param.output->template mutable_data(TARGET(kXPU)); + if (numel == 0) { + return; + } int r = xdnn::cast_v2( ctx.GetRawContext(), in_data, out_data, numel); CHECK_EQ(r, 0); @@ -43,31 +46,69 @@ using xpu_calib_int64_to_int32 = paddle::lite::kernels::xpu::CalibCompute; using 
xpu_calib_int32_to_int64 = paddle::lite::kernels::xpu::CalibCompute; +using xpu_calib_fp32_to_fp16 = + paddle::lite::kernels::xpu::CalibCompute; +using xpu_calib_fp16_to_fp32 = + paddle::lite::kernels::xpu::CalibCompute; REGISTER_LITE_KERNEL( - calib, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, int64_to_int32) + calib, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, calib_int64_to_int32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); REGISTER_LITE_KERNEL( - calib, kXPU, kFloat, kNCHW, xpu_calib_int32_to_int64, int32_to_int64) + calib, kXPU, kFloat, kNCHW, xpu_calib_int32_to_int64, calib_int32_to_int64) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); REGISTER_LITE_KERNEL( - calib_once, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, int64_to_int32) + calib, kXPU, kFloat, kNCHW, xpu_calib_fp32_to_fp16, calib_fp32_to_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib, kXPU, kFloat, kNCHW, xpu_calib_fp16_to_fp32, calib_fp16_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(calib_once, + kXPU, + kFloat, + kNCHW, + xpu_calib_int64_to_int32, + calib_int64_to_int32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); -REGISTER_LITE_KERNEL( - calib_once, kXPU, kFloat, kNCHW, xpu_calib_int32_to_int64, int32_to_int64) +REGISTER_LITE_KERNEL(calib_once, + kXPU, + kFloat, + kNCHW, + xpu_calib_int32_to_int64, + calib_int32_to_int64) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kXPU, kFloat, kNCHW, xpu_calib_fp32_to_fp16, calib_fp32_to_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kXPU, kFloat, kNCHW, xpu_calib_fp16_to_fp32, calib_fp16_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); diff --git a/lite/kernels/xpu/compare_compute.cc b/lite/kernels/xpu/compare_compute.cc index 51b92cc092a..d4cf45714fa 100644 --- a/lite/kernels/xpu/compare_compute.cc +++ b/lite/kernels/xpu/compare_compute.cc @@ -46,6 +46,18 @@ struct EqualFunctor { } }; +template +struct GreaterThanFunctor { + inline int operator()(xdnn::Context* ctx, + const T* x, + const T* y, + bool* z, + const std::vector& xshape, + const std::vector& yshape) const { + return xdnn::broadcast_greater_than(ctx, x, y, z, xshape, yshape); + } +}; + template void CompareCompute::Run() { auto& param = this->template Param(); @@ -224,3 +236,65 @@ REGISTER_LITE_KERNEL(equal, kXPU, kFloat, kAny, euqal_int64, int64) DATALAYOUT(kAny))}) .BindPaddleOpVersion("equal", 1) .Finalize(); + +using greater_than_float = 
paddle::lite::kernels::xpu::CompareCompute< + PRECISION(kFloat), + float, + paddle::lite::kernels::xpu::GreaterThanFunctor>; +REGISTER_LITE_KERNEL(greater_than, kXPU, kFloat, kAny, greater_than_float, def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kBool), + DATALAYOUT(kAny))}) + .BindPaddleOpVersion("greater_than", 1) + .Finalize(); + +using greater_than_int32 = paddle::lite::kernels::xpu::CompareCompute< + PRECISION(kFloat), + int, + paddle::lite::kernels::xpu::GreaterThanFunctor>; +REGISTER_LITE_KERNEL( + greater_than, kXPU, kFloat, kAny, greater_than_int32, int32) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kInt32), + DATALAYOUT(kAny))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kInt32), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kBool), + DATALAYOUT(kAny))}) + .BindPaddleOpVersion("greater_than", 1) + .Finalize(); + +using greater_than_int64 = paddle::lite::kernels::xpu::CompareCompute< + PRECISION(kFloat), + int64_t, + paddle::lite::kernels::xpu::GreaterThanFunctor>; +REGISTER_LITE_KERNEL( + greater_than, kXPU, kFloat, kAny, greater_than_int64, int64) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kInt64), + DATALAYOUT(kAny))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kInt64), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kBool), + DATALAYOUT(kAny))}) + .BindPaddleOpVersion("greater_than", 1) + .Finalize(); diff --git a/lite/kernels/xpu/concat_compute.cc b/lite/kernels/xpu/concat_compute.cc index e3fc5ef554d..9eceace16f5 100644 --- a/lite/kernels/xpu/concat_compute.cc +++ b/lite/kernels/xpu/concat_compute.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "lite/kernels/xpu/concat_compute.h" + #include #include + #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -23,8 +25,8 @@ namespace lite { namespace kernels { namespace xpu { -template -void ConcatCompute::Run() { +template +void ConcatCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -34,7 +36,7 @@ void ConcatCompute::Run() { ? 
param.axis + static_cast(ins[0]->dims().size()) : param.axis; - std::vector x_list; + std::vector x_list; std::vector> xdims_list; for (int i = 0; i < ins.size(); i++) { if (ins[i]->numel() > 0) { @@ -46,14 +48,14 @@ void ConcatCompute::Run() { xdims_list[i].back() = xdims_list[i].back() * 2; } x_list.push_back( - reinterpret_cast(ins[i]->template data())); + reinterpret_cast(ins[i]->template data())); } } if (x_list.size() > 1) { - int r = xdnn::concat( + int r = xdnn::concat( ctx.GetRawContext(), x_list, - reinterpret_cast( + reinterpret_cast( out->template mutable_data(TARGET(kXPU))), xdims_list, axis); @@ -75,37 +77,45 @@ void ConcatCompute::Run() { } // namespace kernels } // namespace lite } // namespace paddle - -REGISTER_LITE_KERNEL(concat, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::ConcatCompute, - def) +using concatfp32 = + paddle::lite::kernels::xpu::ConcatCompute; +using concatfp16 = + paddle::lite::kernels::xpu::ConcatCompute; +using concati16 = + paddle::lite::kernels::xpu::ConcatCompute; +using concati32 = + paddle::lite::kernels::xpu::ConcatCompute; +using concati64 = + paddle::lite::kernels::xpu::ConcatCompute; +REGISTER_LITE_KERNEL(concat, kXPU, kFloat, kNCHW, concatfp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .BindInput("AxisTensor", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .Finalize(); -REGISTER_LITE_KERNEL(concat, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::ConcatCompute, - concat_i32) +REGISTER_LITE_KERNEL(concat, kXPU, kFP16, kNCHW, concatfp16, concat_FP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL(concat, kXPU, kInt16, kNCHW, concati16, concat_INT16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt16))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt16))}) + .Finalize(); + +REGISTER_LITE_KERNEL(concat, kXPU, kInt32, kNCHW, concati32, concat_INT32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("AxisTensor", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); -REGISTER_LITE_KERNEL(concat, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::ConcatCompute, - concat_i64) +REGISTER_LITE_KERNEL(concat, kXPU, kInt64, kNCHW, concati64, concat_INT64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindInput("AxisTensor", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) diff --git a/lite/kernels/xpu/concat_compute.h b/lite/kernels/xpu/concat_compute.h index 218c4704557..964f94f8194 100644 --- a/lite/kernels/xpu/concat_compute.h +++ b/lite/kernels/xpu/concat_compute.h @@ -21,8 +21,8 @@ namespace lite { namespace kernels { namespace xpu { -template -class ConcatCompute : public KernelLite { +template +class ConcatCompute : public KernelLite { public: using param_t = operators::ConcatParam; diff --git a/lite/kernels/xpu/conv2d_transpose_compute.cc b/lite/kernels/xpu/conv2d_transpose_compute.cc index 7949b193c56..0ec8532b4bc 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.cc +++ 
b/lite/kernels/xpu/conv2d_transpose_compute.cc @@ -22,6 +22,23 @@ namespace lite { namespace kernels { namespace xpu { +template <> +void Conv2dTransposeCompute::PrepareForRun() { + int cur_dev_idx = 0; + + XPU_CALL(xpu_current_device(&cur_dev_idx)); + XPU_CALL(xpu_device_get_attr(&cur_dev_attr_, XPUATTR_MODEL, cur_dev_idx)); + if (cur_dev_attr_ <= 1) { + VLOG(4) << "Current XPU device : XPU1"; + } else if (cur_dev_attr_ >= 2 && cur_dev_attr_ <= 299) { + VLOG(4) << "Current XPU device : XPU2"; + } else if (cur_dev_attr_ >= 300 && cur_dev_attr_ <= 599) { + VLOG(4) << "Current XPU device : XPU3"; + } else { + VLOG(4) << "invalid XPU device"; + } +} + template <> void Conv2dTransposeCompute::Run() { auto& param = this->template Param(); @@ -37,27 +54,53 @@ void Conv2dTransposeCompute::Run() { auto dilations = *param.dilations; if (param.output_padding.empty()) { - int ret = xdnn::conv2d_transpose( - ctx.GetRawContext(), - param.x->data(), - param.filter->data(), - param.output->mutable_data(TARGET(kXPU)), - in_dims[0], - in_dims[1], - in_dims[2], - in_dims[3], - out_dims[1], - std::vector{static_cast(w_dims[2]), - static_cast(w_dims[3])}, - strides, - paddings, - dilations, - groups, - nullptr, - nullptr, - nullptr, - true); - CHECK_EQ(ret, 0); + if (cur_dev_attr_ <= 1) { + int ret = xdnn::conv2d_transpose( + ctx.GetRawContext(), + param.x->data(), + param.filter->data(), + param.output->mutable_data(TARGET(kXPU)), + in_dims[0], + in_dims[1], + in_dims[2], + in_dims[3], + out_dims[1], + std::vector{static_cast(w_dims[2]), + static_cast(w_dims[3])}, + strides, + paddings, + dilations, + groups, + nullptr, + nullptr, + nullptr, + true); + CHECK_EQ(ret, 0); + } else { + int ret = xdnn::conv2d_transpose_fusion( + ctx.GetRawContext(), + param.x->data(), + param.filter->data(), + param.output->mutable_data(TARGET(kXPU)), + in_dims[0], + in_dims[1], + in_dims[2], + in_dims[3], + out_dims[1], + std::vector{static_cast(w_dims[2]), + static_cast(w_dims[3])}, + strides, + paddings, + dilations, + groups, + nullptr, + nullptr, + nullptr, + nullptr, + xdnn::Activation_t::LINEAR, + true); + CHECK_EQ(ret, 0); + } } else { int n = in_dims[0]; int yc = in_dims[1]; diff --git a/lite/kernels/xpu/conv2d_transpose_compute.h b/lite/kernels/xpu/conv2d_transpose_compute.h index 5a3d8714fd4..6e779fc42ad 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.h +++ b/lite/kernels/xpu/conv2d_transpose_compute.h @@ -28,9 +28,11 @@ class Conv2dTransposeCompute : public KernelLite { public: using param_t = operators::ConvParam; + void PrepareForRun() override; void Run() override; virtual ~Conv2dTransposeCompute() = default; + uint64_t cur_dev_attr_ = 0; }; } // namespace xpu diff --git a/lite/kernels/xpu/conv3d_compute.cc b/lite/kernels/xpu/conv3d_compute.cc index cc3ad389679..cd5b79c21fc 100644 --- a/lite/kernels/xpu/conv3d_compute.cc +++ b/lite/kernels/xpu/conv3d_compute.cc @@ -22,8 +22,27 @@ namespace lite { namespace kernels { namespace xpu { -template <> -void Conv3DCompute::Run() { +template +void Conv3DCompute::PrepareForRun() { + auto& ctx = this->ctx_->template As(); + auto& param = this->template Param(); + auto filter_ptr = param.filter->template data(); + auto filter_dims = param.filter->dims(); + xpu_quant_filter_ = + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + filter_ptr, filter_dims, false, ctx.GetRawContext()->max_ptr_size()); +} + +template +void Conv3DCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -34,11 +53,11 @@ void Conv3DCompute::Run() { 
auto paddings = *param.paddings; auto dilations = *param.dilations; - int r = xdnn::conv3d( + int r = xdnn::conv3d( ctx.GetRawContext(), /* context */ - param.x->data(), - param.filter->data(), /* weight */ - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), /* weight */ + param.output->template mutable_data(TARGET(kXPU)), x_dims[0], /* input_n */ x_dims[1], /* input_c */ x_dims[2], /* input_d */ @@ -53,7 +72,7 @@ void Conv3DCompute::Run() { dilations, groups, nullptr, - nullptr, + reinterpret_cast(xpu_quant_filter_.max_ptr_), nullptr, true /*is_ncdhw*/); CHECK_EQ(r, 0); @@ -65,11 +84,61 @@ void Conv3DCompute::Run() { } // namespace paddle namespace xpu = paddle::lite::kernels::xpu; -using Conv3dFp32 = xpu::Conv3DCompute; -REGISTER_LITE_KERNEL(conv3d, kXPU, kFloat, kNCHW, Conv3dFp32, def) +using XPUConv3dFP32 = + xpu::Conv3DCompute; + +using XPUConv3d_FP16_FP32_FP32 = + xpu::Conv3DCompute; + +using XPUConv3dFp16 = + xpu::Conv3DCompute; + +using XPUConv3d_FP16_FP16_FP32 = + xpu::Conv3DCompute; + +using XPUConv3d_FP16_FP32_FP16 = + xpu::Conv3DCompute; + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFloat, kNCHW, XPUConv3dFP32, XPU_Real_kFloat) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(conv3d, kXPU, kFloat, kNCHW, XPUConv3d_FP16_FP32_FP32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFP16, kNCHW, XPUConv3dFp16, XPU_FP16_FP16_FP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFP16, kNCHW, XPUConv3d_FP16_FP16_FP32, XPU_FP16_FP16_FP32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFP16, kNCHW, XPUConv3d_FP16_FP32_FP16, XPU_FP16_FP32_FP16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/xpu/conv3d_compute.h b/lite/kernels/xpu/conv3d_compute.h index caadb82a1e8..4cd5fdaeca7 100644 --- a/lite/kernels/xpu/conv3d_compute.h +++ b/lite/kernels/xpu/conv3d_compute.h @@ -21,14 +21,22 @@ namespace lite { namespace kernels { namespace xpu { -template -class Conv3DCompute : public KernelLite { +template +class Conv3DCompute : public KernelLite { public: using param_t = operators::ConvParam; + void PrepareForRun() override; void Run() 
override; virtual ~Conv3DCompute() = default; + + private: + XPUQuantData xpu_quant_filter_; }; } // namespace xpu diff --git a/lite/kernels/xpu/elementwise_compute.cc b/lite/kernels/xpu/elementwise_compute.cc index aaf1c913209..4b8e0e158c5 100644 --- a/lite/kernels/xpu/elementwise_compute.cc +++ b/lite/kernels/xpu/elementwise_compute.cc @@ -132,10 +132,15 @@ void ElementwiseCompute::Run() { namespace xpu = paddle::lite::kernels::xpu; using AddFloat32 = xpu::ElementwiseCompute>; +using AddFloat16 = xpu::ElementwiseCompute>; using AddInt32 = xpu::ElementwiseCompute>; using AddInt64 = xpu::ElementwiseCompute>; + using SubFloat32 = xpu::ElementwiseCompute>; + using MulFloat32 = xpu::ElementwiseCompute>; +using MulFloat16 = xpu::ElementwiseCompute>; + using MulInt64 = xpu::ElementwiseCompute>; using DivFloat32 = xpu::ElementwiseCompute>; using MaxFloat32 = xpu::ElementwiseCompute>; @@ -147,6 +152,13 @@ REGISTER_LITE_KERNEL(elementwise_add, kXPU, kFloat, kNCHW, AddFloat32, def) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL( + elementwise_add, kXPU, kFloat, kNCHW, AddFloat16, DISABLE_XPU1_AddFloat16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(elementwise_add, kXPU, kFloat, kNCHW, AddInt32, int32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) @@ -171,6 +183,13 @@ REGISTER_LITE_KERNEL(elementwise_mul, kXPU, kFloat, kNCHW, MulFloat32, def) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL( + elementwise_mul, kXPU, kFloat, kNCHW, MulFloat16, DISABLE_XPU1_MulFloat16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(elementwise_mul, kXPU, kFloat, kNCHW, MulInt64, int64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) diff --git a/lite/kernels/xpu/gather_compute.cc b/lite/kernels/xpu/gather_compute.cc index f3eafc878fb..697204689d9 100644 --- a/lite/kernels/xpu/gather_compute.cc +++ b/lite/kernels/xpu/gather_compute.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "lite/kernels/xpu/gather_compute.h" + #include + #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -22,8 +24,8 @@ namespace lite { namespace kernels { namespace xpu { -template -void GatherCompute::Run() { +template +void GatherCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -46,88 +48,16 @@ void GatherCompute::Run() { axis += x_dims.size(); } - if (param.X->precision() == PrecisionType::kInt64 && - param.Index->precision() == PrecisionType::kInt64) { - auto* index_int64 = param.Index->template data(); - int size = param.Index->dims().production(); - XPUScratchPadGuard index_xpu_guard_ = - TargetWrapperXPU::MallocScratchPad(size * sizeof(int)); - int* index_int32_device = reinterpret_cast(index_xpu_guard_->addr_); - - int r0 = xdnn::cast_v2( - ctx.GetRawContext(), index_int64, index_int32_device, index->numel()); - CHECK_EQ(r0, 0); + int r = xdnn::gather( + ctx.GetRawContext(), + x->template data(), + index->template data(), + out->template mutable_data(TARGET(kXPU)), + x_dims, + index->numel(), + axis); - int r1 = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index_int32_device, - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r1, 0); - } else if (param.X->precision() == PrecisionType::kInt64 && - param.Index->precision() == PrecisionType::kInt32) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kInt32 && - param.Index->precision() == PrecisionType::kInt32) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kInt32 && - param.Index->precision() == PrecisionType::kInt64) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kFloat && - param.Index->precision() == PrecisionType::kInt32) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kFloat && - param.Index->precision() == PrecisionType::kInt64) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else { - LOG(FATAL) << "Unsupported gather op with x dtype: " - << lite_api::PrecisionToStr(param.X->precision()) - << " and index dtype: " - << lite_api::PrecisionToStr(param.Index->precision()); - } + CHECK_EQ(r, 0); } } // namespace xpu @@ -141,10 +71,21 @@ REGISTER_LITE_KERNEL(gather, kXPU, kFloat, kNCHW, GatherXPUFloatInt32, def) {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Axis", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .Finalize(); + REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, 
GatherXPUFloatInt64, gather_float_i64) + gather, kXPU, kFP16, kNCHW, GatherXPUkFP16Int32, gather_FP16_Int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindInput("Axis", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + gather, kXPU, kFloat, kNCHW, GatherXPUFloatInt64, gather_FP32_INT64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) @@ -153,7 +94,7 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt32Int32, gather_i32_i32) + gather, kXPU, kInt32, kNCHW, GatherXPUInt32Int32, gather_INT32_INT32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) @@ -162,7 +103,7 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt32Int64, gather_i32_i64) + gather, kXPU, kInt32, kNCHW, GatherXPUInt32Int64, gather_INT32_INT64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) @@ -171,7 +112,7 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt64Int32, gather_i64_i32) + gather, kXPU, kInt64, kNCHW, GatherXPUInt64Int32, gather_INT64_INT32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) @@ -179,12 +120,3 @@ REGISTER_LITE_KERNEL( {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); -REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt64Int64, gather_i64_i64) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) - .BindInput("Index", - {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) - .BindInput("Axis", - {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) - .Finalize(); diff --git a/lite/kernels/xpu/gather_compute.h b/lite/kernels/xpu/gather_compute.h index a78be677d09..2363e8651ca 100644 --- a/lite/kernels/xpu/gather_compute.h +++ b/lite/kernels/xpu/gather_compute.h @@ -21,8 +21,8 @@ namespace lite { namespace kernels { namespace xpu { -template -class GatherCompute : public KernelLite { +template +class GatherCompute : public KernelLite { public: using param_t = operators::GatherParam; @@ -36,15 +36,27 @@ class GatherCompute : public KernelLite { } // namespace lite } // namespace paddle -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt32Int32; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt32Int64; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUFloatInt32; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef 
paddle::lite::kernels::xpu::GatherCompute + GatherXPUkFP16Int32; +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUFloatInt64; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt64Int32; -typedef paddle::lite::kernels::xpu::GatherCompute - GatherXPUInt64Int64; diff --git a/lite/kernels/xpu/pool_compute.cc b/lite/kernels/xpu/pool_compute.cc index 9df03bc3c48..8211de7e438 100644 --- a/lite/kernels/xpu/pool_compute.cc +++ b/lite/kernels/xpu/pool_compute.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "lite/kernels/xpu/pool_compute.h" + #include #include + #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -22,8 +24,8 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -void Pool2DCompute::Run() { +template +void Pool2DCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -55,8 +57,8 @@ void Pool2DCompute::Run() { if (param.pooling_type == "avg") { int r = xdnn::adaptive_avg_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), x_dims[0], x_dims[1], x_dims[2], @@ -68,8 +70,8 @@ void Pool2DCompute::Run() { } else { int r = xdnn::adaptive_max_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), nullptr, x_dims[0], x_dims[1], @@ -82,10 +84,10 @@ void Pool2DCompute::Run() { } } else { if (param.pooling_type == "avg") { - int r = xdnn::avg_pool2d( + int r = xdnn::avg_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), x_dims[0], x_dims[1], x_dims[2], @@ -98,10 +100,10 @@ void Pool2DCompute::Run() { CHECK_EQ(r, 0); } else { if (param.pad_zero == true) { - int r = xdnn::max_pool2d( + int r = xdnn::max_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), nullptr, x_dims[0], x_dims[1], @@ -113,7 +115,7 @@ void Pool2DCompute::Run() { true); CHECK_EQ(r, 0); } else { - const float* xpu_x_padded = nullptr; + const InType* xpu_x_padded = nullptr; std::vector xpu_x_padded_dims{static_cast(x_dims[0]), static_cast(x_dims[1]), static_cast(x_dims[2]), @@ -121,7 +123,7 @@ void Pool2DCompute::Run() { XPUScratchPadGuard xpu_x_padded_guard_; if (paddings[0] == 0 && paddings[1] == 0 && paddings[2] == 0 && paddings[3] == 0) { - xpu_x_padded = param.x->data(); + xpu_x_padded = param.x->template data(); } else { std::vector pad_left{0, 0, paddings[0], paddings[2]}; std::vector pad_right{0, 0, paddings[1], paddings[3]}; @@ -130,25 +132,25 @@ void Pool2DCompute::Run() { xpu_x_padded_dims[3] = xpu_x_padded_dims[3] + paddings[2] + paddings[3]; xpu_x_padded_guard_ = TargetWrapperXPU::MallocScratchPad( - sizeof(float) * xpu_x_padded_dims[0] * xpu_x_padded_dims[1] * + sizeof(InType) * xpu_x_padded_dims[0] * xpu_x_padded_dims[1] * xpu_x_padded_dims[2] * xpu_x_padded_dims[3]); - xpu_x_padded = reinterpret_cast(xpu_x_padded_guard_->addr_); - int r = xdnn::pad(ctx.GetRawContext(), - param.x->data(), - const_cast(xpu_x_padded), - {static_cast(x_dims[0]), - static_cast(x_dims[1]), - static_cast(x_dims[2]), - static_cast(x_dims[3])}, - pad_left, - pad_right, - 
-9999999.0f); + xpu_x_padded = reinterpret_cast(xpu_x_padded_guard_->addr_); + int r = xdnn::pad(ctx.GetRawContext(), + param.x->template data(), + const_cast(xpu_x_padded), + {static_cast(x_dims[0]), + static_cast(x_dims[1]), + static_cast(x_dims[2]), + static_cast(x_dims[3])}, + pad_left, + pad_right, + -9999999.0f); CHECK_EQ(r, 0); } - int r = xdnn::max_pool2d( + int r = xdnn::max_pool2d( ctx.GetRawContext(), xpu_x_padded, - param.output->mutable_data(TARGET(kXPU)), + param.output->template mutable_data(TARGET(kXPU)), nullptr, xpu_x_padded_dims[0], xpu_x_padded_dims[1], @@ -168,19 +170,29 @@ void Pool2DCompute::Run() { } // namespace kernels } // namespace lite } // namespace paddle +// (TODO:quwei) refactor pool2d + +using pool2d_fp32 = + paddle::lite::kernels::xpu::Pool2DCompute; +using pool2d_fp16 = + paddle::lite::kernels::xpu::Pool2DCompute; + +using max_pool2d_with_index_fp32 = + paddle::lite::kernels::xpu::Pool2DCompute; + +REGISTER_LITE_KERNEL(pool2d, kXPU, kFloat, kNCHW, pool2d_fp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); REGISTER_LITE_KERNEL( - pool2d, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Pool2DCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + pool2d, kXPU, kFP16, kNCHW, pool2d_fp16, DISABLE_XPU1_pool2d_FP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) .Finalize(); -REGISTER_LITE_KERNEL(max_pool2d_with_index, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::Pool2DCompute, - def) +REGISTER_LITE_KERNEL( + max_pool2d_with_index, kXPU, kFloat, kNCHW, max_pool2d_with_index_fp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))}) diff --git a/lite/kernels/xpu/pool_compute.h b/lite/kernels/xpu/pool_compute.h index 39e14f04a8c..c107b2877b1 100644 --- a/lite/kernels/xpu/pool_compute.h +++ b/lite/kernels/xpu/pool_compute.h @@ -20,8 +20,8 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -class Pool2DCompute : public KernelLite { +template +class Pool2DCompute : public KernelLite { public: using param_t = operators::PoolParam; diff --git a/lite/kernels/xpu/reduce_compute.cc b/lite/kernels/xpu/reduce_compute.cc index da2477d48ba..8563ec4d601 100644 --- a/lite/kernels/xpu/reduce_compute.cc +++ b/lite/kernels/xpu/reduce_compute.cc @@ -154,6 +154,8 @@ using ReduceAll = xpu::ReduceCompute>; using ReduceAny = xpu::ReduceCompute>; using ReduceMeanFloat32 = xpu::ReduceCompute>; +using ReduceMeanFloat16 = + xpu::ReduceCompute>; using ReduceSumFloat32 = xpu::ReduceCompute>; using ReduceProdFloat32 = @@ -178,6 +180,16 @@ REGISTER_LITE_KERNEL(reduce_mean, kXPU, kFloat, kNCHW, ReduceMeanFloat32, def) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(reduce_mean, + kXPU, + kFloat, + kNCHW, + ReduceMeanFloat16, + DISABLE_XPU1_ReduceMeanFloat16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(reduce_sum, kXPU, kFloat, kNCHW, ReduceSumFloat32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", 
{LiteType::GetTensorTy(TARGET(kXPU))}) diff --git a/lite/kernels/xpu/reshape_compute.cc b/lite/kernels/xpu/reshape_compute.cc index 78359443991..c82e367e9eb 100644 --- a/lite/kernels/xpu/reshape_compute.cc +++ b/lite/kernels/xpu/reshape_compute.cc @@ -69,6 +69,21 @@ REGISTER_LITE_KERNEL(reshape2, .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); +REGISTER_LITE_KERNEL(reshape2, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ReshapeCompute, + float16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); + REGISTER_LITE_KERNEL(reshape2, kXPU, kFloat, @@ -113,6 +128,20 @@ REGISTER_LITE_KERNEL(reshape, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(reshape, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ReshapeCompute, + float16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(flatten, kXPU, kFloat, @@ -125,6 +154,18 @@ REGISTER_LITE_KERNEL(flatten, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(flatten, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ReshapeCompute, + float16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(flatten2, kXPU, kFloat, @@ -137,3 +178,16 @@ REGISTER_LITE_KERNEL(flatten2, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); + +REGISTER_LITE_KERNEL(flatten2, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ReshapeCompute, + float16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); diff --git a/lite/kernels/xpu/slice_compute.cc b/lite/kernels/xpu/slice_compute.cc index 6353d6114e6..52a2669d083 100644 --- a/lite/kernels/xpu/slice_compute.cc +++ b/lite/kernels/xpu/slice_compute.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "lite/kernels/xpu/slice_compute.h" +#include +#include #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -21,6 +23,62 @@ namespace lite { namespace kernels { namespace xpu { +template +void DealTensorArray(XPUContext ctx, + const operators::SliceParam& param, + const std::vector& starts, + const std::vector& ends, + bool out_is_array) { + auto in_array = param.XTensorList; + // If the input is LoDTensorArray, the rank of input is 1. + int64_t in_size = in_array->size(); + int64_t start = starts[0] < 0 ? 
(starts[0] + in_size) : starts[0]; + int64_t end = ends[0] < 0 ? (ends[0] + in_size) : ends[0]; + + start = std::max(start, static_cast(0)); + end = std::max(end, static_cast(0)); + end = std::min(end, in_size); + + CHECK_GT(end, start) << "end should greater than start"; + int64_t out_size = end - start; + + if (out_is_array) { + auto out_array = param.OutTensorList; + out_array->resize(out_size); + for (int i = 0; i < out_size; ++i) { + auto* out_tensor = &out_array->at(i); + auto in_tensor = in_array->at(i + start); + out_tensor->Resize(in_tensor.dims()); + out_tensor->set_lod(in_tensor.lod()); + out_tensor->set_precision(in_tensor.precision()); + if (in_tensor.memory_size() > 0) { + out_tensor->mutable_data(TARGET(kXPU), in_tensor.memory_size()); + int r = xdnn::copy(ctx.GetRawContext(), + in_tensor.template data(), + static_cast(out_tensor->raw_data()), + in_tensor.numel()); + CHECK_EQ(r, 0) << " write to array failed"; + } else { + VLOG(4) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << i << "]."; + } + } + } else { + auto out_tensor = param.Out; + auto in_tensor = in_array->at(start); + out_tensor->Resize(in_tensor.dims()); + out_tensor->set_lod(in_tensor.lod()); + out_tensor->set_precision(in_tensor.precision()); + out_tensor->mutable_data(TARGET(kXPU), in_tensor.memory_size()); + int r = xdnn::copy(ctx.GetRawContext(), + in_tensor.data(), + static_cast(out_tensor->raw_data()), + in_tensor.numel()); + CHECK_EQ(r, 0) << " write to array failed"; + } +} + inline std::vector GetIntDataFromTensorList( const std::vector& list_tensor) { std::vector vec_data; @@ -77,8 +135,6 @@ void SliceCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); - auto out = param.Out; - auto in = param.X; auto axes = param.axes; auto StartsTensor = param.StartsTensor; auto EndsTensor = param.EndsTensor; @@ -89,9 +145,6 @@ void SliceCompute::Run() { auto infer_flags = param.infer_flags; auto decrease_axis = param.decrease_axis; - auto out_dims = out->dims(); - auto in_dims = in->dims(); - bool need_infer = false; if (StartsTensor || EndsTensor) { need_infer = true; @@ -114,52 +167,69 @@ void SliceCompute::Run() { } CHECK_EQ(ends.size(), axes.size()) << "The size of ends must be equal to the size of axes."; - out_dims = in_dims; - int dim_value, start, end; - for (size_t i = 0; i < axes.size(); ++i) { - dim_value = out_dims[axes[i]]; - if (dim_value > 0) { - // when end = start + 1 and start == -1 - if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { - auto ret = - std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); - if (ret != decrease_axis.end()) { - ends[i] = 10000000; - } - } + } + // if slice input is tensor_array + if (param.X == nullptr && param.XTensorList != nullptr) { + DealTensorArray( + ctx, + param, + starts, + ends, + (param.Out == nullptr && param.OutTensorList != nullptr)); + return; + } - start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; - end = ends[i] < 0 ? 
(ends[i] + dim_value) : ends[i]; - start = (std::max)(start, 0); - end = (std::max)(end, 0); - end = (std::min)(end, dim_value); - CHECK_GT(end, start) << "end should greater than start"; - out_dims[axes[i]] = end - start; + auto out = param.Out; + auto in = param.X; + auto out_dims = out->dims(); + auto in_dims = in->dims(); + out_dims = in_dims; + int dim_value, start, end; + for (size_t i = 0; i < axes.size(); ++i) { + dim_value = out_dims[axes[i]]; + if (dim_value > 0) { + // when end = start + 1 and start == -1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = + std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = 10000000; + } } + + start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; + end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; + start = (std::max)(start, 0); + end = (std::max)(end, 0); + end = (std::min)(end, dim_value); + CHECK_GT(end, start) << "end should greater than start"; + out_dims[axes[i]] = end - start; } - out->Resize(out_dims); - // generate new shape - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - CHECK_EQ(out_dims[decrease_axis[i]], 1) << "decrease dim should be 1"; - out_dims[decrease_axis[i]] = 0; - } + } - for (size_t i = 0; i < out_dims.size(); ++i) { - if (out_dims[i] != 0) { - new_out_shape.push_back(out_dims[i]); - } - } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } + out->Resize(out_dims); + // generate new shape + if (decrease_axis.size() > 0) { + std::vector new_out_shape; + for (size_t i = 0; i < decrease_axis.size(); ++i) { + CHECK_EQ(out_dims[decrease_axis[i]], 1) << "decrease dim should be 1"; + out_dims[decrease_axis[i]] = 0; + } - DDim new_dims; - new_dims.ConstructFrom(new_out_shape); - out_dims = new_dims; + for (size_t i = 0; i < out_dims.size(); ++i) { + if (out_dims[i] != 0) { + new_out_shape.push_back(out_dims[i]); + } + } + if (new_out_shape.size() == 0) { + new_out_shape.push_back(1); } + + DDim new_dims; + new_dims.ConstructFrom(new_out_shape); + out_dims = new_dims; } + auto x_shape = in_dims.Vectorize(); std::vector x_shape_(x_shape.begin(), x_shape.end()); std::vector x_dim_begin_(in_dims.size(), 0); @@ -205,6 +275,21 @@ REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceFloat32, def) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .Finalize(); +using SliceFloat32 = paddle::lite::kernels::xpu::SliceCompute; +REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceFloat32, array_def) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); + using SliceInt32 = paddle::lite::kernels::xpu::SliceCompute; REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceInt32, int32) .BindInput("Input", @@ -220,6 +305,21 @@ REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceInt32, int32) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); +using SliceInt32 = paddle::lite::kernels::xpu::SliceCompute; 
+REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceInt32, array_int32) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .Finalize(); + using SliceInt64 = paddle::lite::kernels::xpu::SliceCompute; REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceInt64, int64) .BindInput("Input", @@ -234,3 +334,18 @@ REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceInt64, int64) {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); + +using SliceInt64 = paddle::lite::kernels::xpu::SliceCompute; +REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceInt64, array_int64) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/xpu/stack_compute.cc b/lite/kernels/xpu/stack_compute.cc index 0960c05a63c..373f682b958 100644 --- a/lite/kernels/xpu/stack_compute.cc +++ b/lite/kernels/xpu/stack_compute.cc @@ -21,7 +21,8 @@ namespace lite { namespace kernels { namespace xpu { -void StackCompute::Run() { +template +void StackCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -39,15 +40,15 @@ void StackCompute::Run() { x_shape[axis] = 1; std::vector> xdims_list(n, x_shape); - std::vector x_list(n, nullptr); + std::vector x_list(n, nullptr); for (int i = 0; i < n; ++i) { - x_list[i] = param.X[i]->data(); + x_list[i] = param.X[i]->template data(); } - int r = xdnn::concat(ctx.GetRawContext(), - x_list, - param.Out->mutable_data(TARGET(kXPU)), - xdims_list, - axis); + int r = xdnn::concat(ctx.GetRawContext(), + x_list, + param.Out->template mutable_data(TARGET(kXPU)), + xdims_list, + axis); CHECK_EQ(r, 0); } @@ -56,8 +57,16 @@ void StackCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - stack, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::StackCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) +using stack_float = + paddle::lite::kernels::xpu::StackCompute; +REGISTER_LITE_KERNEL(stack, kXPU, kFloat, kNCHW, stack_float, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); + +using stack_int64 = + paddle::lite::kernels::xpu::StackCompute; +REGISTER_LITE_KERNEL(stack, kXPU, kFloat, kNCHW, stack_int64, int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); diff --git 
a/lite/kernels/xpu/stack_compute.h b/lite/kernels/xpu/stack_compute.h index 00f01b9466a..3e6dd033de6 100644 --- a/lite/kernels/xpu/stack_compute.h +++ b/lite/kernels/xpu/stack_compute.h @@ -23,7 +23,8 @@ namespace lite { namespace kernels { namespace xpu { -class StackCompute : public KernelLite { +template +class StackCompute : public KernelLite { public: using param_t = operators::StackParam; diff --git a/lite/kernels/xpu/tile_compute.cc b/lite/kernels/xpu/tile_compute.cc new file mode 100644 index 00000000000..79007b85dcb --- /dev/null +++ b/lite/kernels/xpu/tile_compute.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/tile_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +void TileCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto repeat_times = param.repeat_times; + if (param.RepeatTimes) { + auto repeat_times_size = param.RepeatTimes->data_size(); + for (int64_t i = 0; i < repeat_times_size; i++) { + repeat_times.push_back(param.RepeatTimes->template data()[i]); + } + } else if (param.repeat_times_tensor.size() != 0) { + for (int i = 0; i < param.repeat_times_tensor.size(); i++) { + auto temp = param.repeat_times_tensor[i]; + repeat_times.push_back(*(temp->template data())); + } + } + auto in_dims = param.X->dims(); + auto vec_in_dims = in_dims.Vectorize(); + // broadcast for vec_in_dims.size() equal to repeat_times.size() + if (repeat_times.size() < vec_in_dims.size()) { + int diff = vec_in_dims.size() - repeat_times.size(); + repeat_times.insert(repeat_times.begin(), diff, 1); + } else { + int diff = repeat_times.size() - vec_in_dims.size(); + vec_in_dims.insert(vec_in_dims.begin(), diff, 1); + } + + std::vector new_in_dims(vec_in_dims.begin(), vec_in_dims.end()); + std::vector out_dims(param.Out->dims().data().begin(), + param.Out->dims().data().end()); + int r = xdnn::broadcast(ctx.GetRawContext(), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), + new_in_dims, + out_dims); + + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +using tile_float = + paddle::lite::kernels::xpu::TileCompute; +REGISTER_LITE_KERNEL(tile, kXPU, kFloat, kNCHW, tile_float, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("RepeatTimes", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindInput("repeat_times_tensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/tile_compute.h b/lite/kernels/xpu/tile_compute.h new file mode 100644 index 00000000000..9b6329fa17c --- /dev/null +++ b/lite/kernels/xpu/tile_compute.h @@ -0,0 +1,36 @@ +// 
Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +class TileCompute : public KernelLite { + public: + using param_t = operators::TileParam; + + virtual void Run(); + + virtual ~TileCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/transpose_compute.cc b/lite/kernels/xpu/transpose_compute.cc index d1c9553ba71..19441de2849 100644 --- a/lite/kernels/xpu/transpose_compute.cc +++ b/lite/kernels/xpu/transpose_compute.cc @@ -75,6 +75,18 @@ REGISTER_LITE_KERNEL(transpose2, .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); +REGISTER_LITE_KERNEL(transpose2, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::TransposeCompute, + def_int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("XShape", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .Finalize(); + REGISTER_LITE_KERNEL(transpose2, kXPU, kFloat, diff --git a/lite/operators/__xpu__embedding_with_eltwise_add_op.cc b/lite/operators/__xpu__embedding_with_eltwise_add_op.cc index 13819d61046..d3c1b7e1f30 100644 --- a/lite/operators/__xpu__embedding_with_eltwise_add_op.cc +++ b/lite/operators/__xpu__embedding_with_eltwise_add_op.cc @@ -33,7 +33,12 @@ bool XPUEmbeddingWithEltwiseAddOp::CheckShape() const { } } if (param_.Mask != nullptr) { - CHECK_EQ(id_rank, param_.Mask->dims().size()); + if (id_rank != param_.Mask->dims().size()) { + CHECK(id_rank == 2 && param_.Mask->dims().size() == 3 && + param_.Mask->dims()[2] == 1) + << "unsupported id_rank: " << id_rank + << "mask_dims_size: " << param_.Mask->dims().size(); + } for (size_t j = 0; j < id_rank; j++) { CHECK_EQ(ids_dim[j], param_.Mask->dims()[j]); } diff --git a/lite/operators/__xpu__fc_op.cc b/lite/operators/__xpu__fc_op.cc index 21f6faebcb5..71357bf6338 100644 --- a/lite/operators/__xpu__fc_op.cc +++ b/lite/operators/__xpu__fc_op.cc @@ -107,16 +107,13 @@ bool XPUFcOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { param_.input_max = scope->FindVar(op_desc.Input("InputMax").front())->GetMutable(); } - if (op_desc.HasAttr("precision")) { - param_.precision = op_desc.GetAttr("precision"); - } + if (op_desc.HasAttr("enable_int8") && op_desc.GetAttr("enable_int8")) { - CHECK(param_.precision == "int8") << "enable_int8 precison:" - << param_.precision; + param_.enable_int8 = op_desc.GetAttr("enable_int8"); param_.quant_input_max = 127 * op_desc.GetAttr>("X0_scale")[0]; - param_.quant_w_max = - 127 * op_desc.GetAttr>("Y0_scale")[0]; + param_.weight_max = op_desc.GetAttr>("Y0_max"); + param_.per_channel = op_desc.GetAttr("per_channel"); } return true; } diff --git a/lite/operators/__xpu__multi_encoder_op.cc 
b/lite/operators/__xpu__multi_encoder_op.cc index e97b3a26f6d..397a1b154d1 100644 --- a/lite/operators/__xpu__multi_encoder_op.cc +++ b/lite/operators/__xpu__multi_encoder_op.cc @@ -58,7 +58,11 @@ bool XPUMultiEncoderOp::InferShapeImpl() const { new_dims.ConstructFrom(new_out_shape); out_dims = new_dims; } - param_.output->Resize(out_dims); + if (param_.norm_before) { + param_.output->Resize({batch_size, 1, head_num}); + } else { + param_.output->Resize(out_dims); + } } else { param_.output->Resize({batch_size, seq_len, head_num}); } @@ -69,7 +73,7 @@ bool XPUMultiEncoderOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { param_.input = const_cast( &scope->FindVar(op_desc.Input("Input").front())->Get()); - param_.fc_weight_max = const_cast( + param_.weight_max = const_cast( &scope->FindVar(op_desc.Input("FCWeightMax").front()) ->Get()); param_.output = scope->FindVar(op_desc.Output("Output").front()) @@ -141,9 +145,12 @@ bool XPUMultiEncoderOp::AttachImpl(const cpp::OpDesc& op_desc, param_.enable_qkv_fusion = op_desc.GetAttr("enable_qkv_fusion"); param_.norm_before = op_desc.GetAttr("norm_before"); param_.adaptive_seqlen = op_desc.GetAttr("adaptive_seqlen"); + param_.per_channel = op_desc.GetAttr("per_channel"); + if (param_.per_channel) { + param_.fc_channels = op_desc.GetAttr>("fc_channels"); + } if (op_desc.HasAttr("enable_int8") && op_desc.GetAttr("enable_int8")) { param_.input_max = op_desc.GetAttr>("FCInputMax"); - param_.weight_max = op_desc.GetAttr>("FCWeightMax"); } if (op_desc.HasAttr("slice_axes")) { diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index bcbf480b564..f7028f9caba 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -1711,6 +1711,7 @@ struct XPUBlockFuseParam : ParamBase { struct XPUMultiEncoderParam : ParamBase { lite::Tensor* input{}; + const lite::Tensor* weight_max{nullptr}; std::vector fc_weight; std::vector fc_bias; std::vector ln_scale; @@ -1726,7 +1727,6 @@ struct XPUMultiEncoderParam : ParamBase { std::vector slice_ends{}; std::vector slice_decrease_axis{}; std::vector input_max{}; - std::vector weight_max{}; int n_layers{}; int head_num{}; int size_per_head{}; @@ -1736,6 +1736,8 @@ struct XPUMultiEncoderParam : ParamBase { bool enable_qkv_fusion{false}; bool norm_before{false}; bool adaptive_seqlen{false}; + bool per_channel{false}; + std::vector fc_channels{}; }; struct XPUEmbeddingWithEltwiseAddParam : ParamBase { @@ -1760,10 +1762,14 @@ struct XPUFcParam : ParamBase { int act_type; float act_param; float quant_input_max{0.f}; - float quant_w_max{0.f}; + std::vector weight_max{}; std::string precision{}; bool has_bias{false}; int in_num_col_dims{1}; + bool transpose_x{false}; + bool transpose_w{true}; + bool enable_int8{false}; + bool per_channel{false}; }; struct XPUResNetCbamParam : ParamBase { diff --git a/lite/operators/tile_op.cc b/lite/operators/tile_op.cc index 042afa692df..45d3c74e5fe 100644 --- a/lite/operators/tile_op.cc +++ b/lite/operators/tile_op.cc @@ -118,6 +118,7 @@ bool TileOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { } else if (opdesc.HasInput("repeat_times_tensor") && (opdesc.Input("repeat_times_tensor").size() != 0)) { auto temp = opdesc.Input("repeat_times_tensor"); + param_.repeat_times_tensor.clear(); for (auto var : temp) { param_.repeat_times_tensor.push_back( scope->FindVar(var)->GetMutable()); diff --git a/lite/tests/api/test_resnet50_fp32_baidu_xpu.cc b/lite/tests/api/test_resnet50_fp32_baidu_xpu.cc index 93b4308f102..d54c7088452 100644 --- 
a/lite/tests/api/test_resnet50_fp32_baidu_xpu.cc +++ b/lite/tests/api/test_resnet50_fp32_baidu_xpu.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include "lite/api/paddle_api.h" #include "lite/api/paddle_use_kernels.h" @@ -35,13 +36,13 @@ namespace paddle { namespace lite { TEST(resnet50, test_resnet50_fp32_baidu_xpu) { + setenv("XPU_CONV_AUTOTUNE", "5", 1); lite_api::CxxConfig config; config.set_model_dir(FLAGS_model_dir); config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); config.set_xpu_l3_cache_method(16773120, false); - config.set_xpu_conv_autotune(true); auto predictor = lite_api::CreatePaddlePredictor(config); std::string raw_data_dir = FLAGS_data_dir + std::string("/raw_data"); diff --git a/lite/tests/kernels/tile_compute_test.cc b/lite/tests/kernels/tile_compute_test.cc index 5bf48aa880c..07e11039a12 100644 --- a/lite/tests/kernels/tile_compute_test.cc +++ b/lite/tests/kernels/tile_compute_test.cc @@ -199,6 +199,9 @@ TEST(tile, precision) { #else return; #endif +#elif defined(LITE_WITH_XPU) + place = TARGET(kXPU); + alias = "def"; #elif defined(LITE_WITH_ARM) || defined(LITE_WITH_X86) place = TARGET(kHost); #else
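With set_xpu_conv_autotune() deprecated, the updated resnet50 test above drives conv autotune through the XPU_CONV_AUTOTUNE environment variable and otherwise keeps the usual CxxConfig flow. The sketch below is illustrative only and not part of this patch: it combines the calls shown in the test with the kXPU/kFP16 kernel registrations added above; the helper name RunXpuResnet50Fp16 and the assumption that listing the kFP16 place first makes the kernel picker prefer the FP16 kernels are mine, not an API documented here.

#include <cstdlib>  // setenv
#include <string>
#include "lite/api/paddle_api.h"

// Illustrative sketch (assumption): run a model on XPU with env-var driven
// conv autotune and the FP16 kernels registered in this patch.
void RunXpuResnet50Fp16(const std::string& model_dir) {
  // Replaces the deprecated CxxConfig::set_xpu_conv_autotune() call.
  setenv("XPU_CONV_AUTOTUNE", "5", 1);

  paddle::lite_api::CxxConfig config;
  config.set_model_dir(model_dir);
  // kFP16 listed first (assumed to be preferred where FP16 kernels exist),
  // with kFloat places kept as the fallback.
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFP16)},
       paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
       paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
       paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
  config.set_xpu_l3_cache_method(16773120, false);

  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  // Fill inputs and call predictor->Run() as in the test above.
}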