[XPU][cherry-pick] Support K200 per-channel quant & R200 mul/matmul quant
* [XPU] Fixed the tile op bug with large inputs and added an XPU implementation. (#9102)

* [XPU] Fixed a bug where reshape2's output was reused in xpu_memory_optimize_pass (#9178)

* [x86][XPU] Added TensorArray support for slice on x86 and XPU (#9134)

* [XPU] Fixed the stack op binding error for float. (#9204)

* [XPU] Stopped supporting XPU conv autotune config through the Paddle Lite C++ API. (#9316)

* [XPU] Support pre-LN encoder (#9159)

* [XPU] Support fp16 data precision (#9080)

* [XPU] Deleted the kernel.precision() == float check (#9189)

* [XPU] Support fp16 data precision (#9228)

* [XPU] Support fc per-channel quant (#9323)

Co-authored-by: wbn <66299196+wbn520@users.noreply.github.com>
Co-authored-by: Jinchen Han <zealoct@hotmail.com>
Co-authored-by: TingShenXD <99321958+TingShenXD@users.noreply.github.com>
Co-authored-by: quwei03 <32065370+xiuxin121@users.noreply.github.com>
5 people committed Aug 29, 2022
1 parent 8f09eb2 commit 7ca3e00
Showing 59 changed files with 2,888 additions and 724 deletions.
2 changes: 1 addition & 1 deletion cmake/backends/xpu.cmake
@@ -23,7 +23,7 @@ set (XPU_DOWNLOAD_DIR ${XPU_SOURCE_DIR}/download)
set (XPU_INSTALL_DIR ${THIRD_PARTY_PATH}/install)

if (NOT XPU_SDK_URL)
set (XPU_SDK_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20220519")
set (XPU_SDK_URL "https://baidu-kunlun-product.su.bcebos.com/klx-sdk/search/20220825")
endif ()

if (NOT XPU_SDK_ENV)
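Because the default URL is guarded by if (NOT XPU_SDK_URL), a build can still pin a different SDK snapshot by defining the cache variable at configure time. A minimal sketch, assuming a hypothetical mirror URL:

    cmake .. -DXPU_SDK_URL="https://your-mirror.example.com/klx-sdk/search/20220825"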
8 changes: 6 additions & 2 deletions lite/api/paddle_api.cc
@@ -626,8 +626,12 @@ void CxxConfig::set_xpu_multi_encoder_method(const std::string &precision,
void CxxConfig::set_xpu_conv_autotune(bool autotune,
const std::string &autotune_file) {
#ifdef LITE_WITH_XPU
-lite::TargetWrapperXPU::conv_autotune = autotune;
-lite::TargetWrapperXPU::conv_autotune_file = autotune_file;
+LOG(WARNING)
+<< "This function "
+"'set_xpu_conv_autotune' is deprecated, "
+"if you want to use autotune, please refer to "
+"http://agroup.baidu.com/share/md/f9233d84df11452488a1fdd4f859647f";
+
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_conv_autotune' is ignored, please "
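For context, a minimal caller-side sketch of the now-deprecated path; the config flow follows the public Paddle Lite C++ API shown above, and the file name is illustrative:

    #include "paddle_api.h"

    void ConfigureXpu() {
      paddle::lite_api::CxxConfig config;
      // After this commit the flags are no longer forwarded to TargetWrapperXPU;
      // the call only emits the deprecation warning shown above.
      config.set_xpu_conv_autotune(true, "conv_autotune.conf");
    }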
2 changes: 2 additions & 0 deletions lite/api/paddle_api.h
@@ -472,6 +472,8 @@ class LITE_API CxxConfig : public ConfigBase {

void set_xpu_gm_workspace_method(size_t gm_size);

+// **DEPRECATED**, use environment variable to enable autotune
+// check http://agroup.baidu.com/share/md/f9233d84df11452488a1fdd4f859647f
void set_xpu_conv_autotune(bool autotune = true,
const std::string& autotune_file = "");

1 change: 1 addition & 0 deletions lite/api/paddle_use_passes.h
@@ -107,6 +107,7 @@ USE_MIR_PASS(__xpu__bigru_fuse_pass);
USE_MIR_PASS(__xpu__dynamic_lstm_fuse_pass);
USE_MIR_PASS(__xpu__multi_softmax_fuse_pass);
USE_MIR_PASS(__xpu__max_pooling_pad_zero_detect_fuse_pass);
+USE_MIR_PASS(__xpu__static_kernel_pick_pass);
USE_MIR_PASS(x86_int8_attribute_pass);
USE_MIR_PASS(fill_range_fuse_pass);
USE_MIR_PASS(range_calc_offline_pass);
19 changes: 10 additions & 9 deletions lite/backends/xpu/target_wrapper.cc
@@ -43,9 +43,13 @@ void TargetWrapperXPU::MemcpySync(void* dst,

template <typename Tcpu, typename Txpu>
XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight(
-const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose) {
+const Tcpu* cpu_data,
+const DDimLite& dims,
+bool data_transpose,
+size_t max_ptr_len) {
CHECK(quantizer_.get());
-return quantizer_->quant<Tcpu, Txpu>(cpu_data, dims, data_transpose);
+return quantizer_->quant<Tcpu, Txpu>(
+cpu_data, dims, data_transpose, max_ptr_len);
}

void TargetWrapperXPU::ScatterL3Cache(
@@ -145,16 +149,16 @@ void TargetWrapperXPU::FreeL3Cache() {

template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, float>(
-const float*, const DDimLite&, bool);
+const float*, const DDimLite&, bool, size_t);
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, int16_t>(
-const float*, const DDimLite&, bool);
+const float*, const DDimLite&, bool, size_t);
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, int8_t>(
-const float*, const DDimLite&, bool);
+const float*, const DDimLite&, bool, size_t);
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<int8_t, int8_t>(
-const int8_t*, const DDimLite&, bool);
+const int8_t*, const DDimLite&, bool, size_t);

// xpu context
LITE_THREAD_LOCAL std::shared_ptr<xdnn::Context> TargetWrapperXPU::tls_raw_ctx_{
@@ -165,9 +169,6 @@ LITE_THREAD_LOCAL std::shared_ptr<void> TargetWrapperXPU::xpu_stream_{nullptr};
LITE_THREAD_LOCAL std::string
TargetWrapperXPU::multi_encoder_precision; // NOLINT
LITE_THREAD_LOCAL bool TargetWrapperXPU::multi_encoder_adaptive_seqlen{false};
-// conv autotune config
-LITE_THREAD_LOCAL bool TargetWrapperXPU::conv_autotune{false};
-LITE_THREAD_LOCAL std::string TargetWrapperXPU::conv_autotune_file;
// l3 cache config
LITE_THREAD_LOCAL bool TargetWrapperXPU::need_l3_mutex{false};
LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_l3_size{
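Every instantiation now threads the max-pointer length through explicitly instead of letting the quantizer query it internally. A hedged sketch of an updated call site, with illustrative shapes and namespaces assumed from the surrounding code:

    // Illustrative only: quantize a 256x128 FP32 weight to int16 for XPU.
    std::vector<float> cpu_weight(256 * 128, 0.5f);
    paddle::lite::DDimLite dims(std::vector<int64_t>({256, 128}));
    size_t max_ptr_len = paddle::lite::XPUMemory::get_max_ptr_size();
    paddle::lite::XPUQuantData qd =
        paddle::lite::TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, int16_t>(
            cpu_weight.data(), dims, /*data_transpose=*/false, max_ptr_len);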
14 changes: 2 additions & 12 deletions lite/backends/xpu/target_wrapper.h
@@ -69,7 +69,8 @@ class TargetWrapper<TARGET(kXPU)> {
template <typename Tcpu, typename Txpu>
static XPUQuantData ConvertCPUWeightToXPUQuantWeight(const Tcpu* cpu_data,
const DDimLite& dims,
-bool data_transpose);
+bool data_transpose,
+size_t max_ptr_len);

static xdnn::Context* GetRawContext() {
if (tls_raw_ctx_.get() == nullptr) {
@@ -111,14 +112,6 @@ class TargetWrapper<TARGET(kXPU)> {
quantizer_.reset(new XPUQuantizer());
}
CHECK(quantizer_.get());
-if (conv_autotune) {
-tls_raw_ctx_->_xpu1_conv_selector.set_autotune_loop(true);
-tls_raw_ctx_->_xpu1_conv_selector.set_inference_mode(true);
-}
-if (!conv_autotune_file.empty()) {
-tls_raw_ctx_->_xpu1_conv_selector.set_autotune_file(
-conv_autotune_file.c_str());
-}
int devid = -1;
uint64_t max_l3_size = 0;
XPU_CALL(xpu_current_device(&devid));
@@ -173,9 +166,6 @@ class TargetWrapper<TARGET(kXPU)> {
// multi encoder config
static LITE_THREAD_LOCAL std::string multi_encoder_precision; // NOLINT
static LITE_THREAD_LOCAL bool multi_encoder_adaptive_seqlen;
-// conv autotune config
-static LITE_THREAD_LOCAL bool conv_autotune;
-static LITE_THREAD_LOCAL std::string conv_autotune_file; // NOLINT
// l3 cache config
static LITE_THREAD_LOCAL bool need_l3_mutex; // model level l3 size
static LITE_THREAD_LOCAL size_t local_l3_size; // model level l3 size
34 changes: 22 additions & 12 deletions lite/backends/xpu/xpu_quantizer.cc
@@ -112,7 +112,8 @@ template <
void XPUQuantizer::ConvertWithQuant(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key) {
+size_t hashed_key,
+size_t max_ptr_len) {
LOG(FATAL) << "Not support for Tcpu is " << CppTypeToString<Tcpu>();
}

@@ -123,7 +124,8 @@ template <
void XPUQuantizer::ConvertWithQuant(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key) {
+size_t hashed_key,
+size_t max_ptr_len) {
// transpose
const Tcpu* cpu_ptr = nullptr;
int numel = dims.production();
@@ -140,7 +142,7 @@ void XPUQuantizer::ConvertWithQuant(const Tcpu* cpu_data,
XPUScratchPadGuard weight_max_guard;
XPUScratchPadGuard quant_weight_guard;
float max_val = paddle::lite::xpu::math::FindMaxAbs(cpu_ptr, numel);
-int max_ptr_size = XPUMemory::get_max_ptr_size();
+size_t max_ptr_size = max_ptr_len;
std::vector<float> max_vec(max_ptr_size, max_val);
weight_max_guard =
std::move(XPUMemory::MallocScratchPad(max_ptr_size * sizeof(float)));
@@ -162,11 +164,12 @@ template <typename T>
void XPUQuantizer::ConvertWithoutQuant(const T* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key) {
+size_t hashed_key,
+size_t max_ptr_len) {
// transpose
const T* cpu_ptr = nullptr;
int numel = dims.production();
-int max_ptr_size = XPUMemory::get_max_ptr_size();
+size_t max_ptr_size = max_ptr_len;
std::vector<T> transpose_data(numel, 0);
if (data_transpose) {
CHECK(dims.size() == 2) << "Not support: dims.size = " << dims.size();
@@ -178,8 +181,9 @@ void XPUQuantizer::ConvertWithoutQuant(const T* cpu_data,
}
// copy to XPU
XPUScratchPadGuard weight_max_guard(new XPUScratchPad(nullptr, 0));
-if (std::is_same<T, int8_t>::value) {
+if (std::is_same<T, int8_t>::value || std::is_same<T, int16_t>::value) {
// prepare max_w space for slim int8 quant
+// just allocate buffer, set max value in kernel
weight_max_guard =
std::move(XPUMemory::MallocScratchPad(max_ptr_size * sizeof(float)));
}
@@ -196,7 +200,8 @@ void XPUQuantizer::ConvertWithoutQuant(const T* cpu_data,
template <typename Tcpu, typename Txpu>
XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data,
const DDimLite& dims,
-bool data_transpose) {
+bool data_transpose,
+size_t max_ptr_len) {
int numel = dims.production();
const std::string cpu_dtype = CppTypeToString<Tcpu>();
const std::string xpu_dtype = CppTypeToString<Txpu>();
@@ -206,7 +211,8 @@ XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data,
<< ", precision=" << precision << ", transpose=" << data_transpose
<< ", hashed_key=" << hashed_key;
if (weight_cache_.find(hashed_key) == weight_cache_.end()) {
-ConvertWrapper<Tcpu, Txpu>(cpu_data, dims, data_transpose, hashed_key);
+ConvertWrapper<Tcpu, Txpu>(
+cpu_data, dims, data_transpose, hashed_key, max_ptr_len);
}

float* max_ptr =
@@ -218,15 +224,19 @@ XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data,

template XPUQuantData XPUQuantizer::quant<float, float>(const float*,
const DDimLite&,
-bool);
+bool,
+size_t);
template XPUQuantData XPUQuantizer::quant<float, int16_t>(const float*,
const DDimLite&,
-bool);
+bool,
+size_t);
template XPUQuantData XPUQuantizer::quant<float, int8_t>(const float*,
const DDimLite&,
-bool);
+bool,
+size_t);
template XPUQuantData XPUQuantizer::quant<int8_t, int8_t>(const int8_t*,
const DDimLite&,
-bool);
+bool,
+size_t);
} // namespace lite
} // namespace paddle
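For intuition, ConvertWithQuant computes one abs-max over the whole tensor and replicates it max_ptr_len times so each max-pointer slot holds its own copy. A self-contained sketch of that layout, with FindMaxAbs paraphrased rather than the real helper:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Builds the replicated max buffer uploaded alongside the quantized
    // weights: every slot holds the same tensor-wide absolute maximum.
    std::vector<float> BuildMaxVec(const float* data, int numel, size_t max_ptr_len) {
      float max_val = 0.0f;
      for (int i = 0; i < numel; ++i) {
        max_val = std::max(max_val, std::fabs(data[i]));
      }
      return std::vector<float>(max_ptr_len, max_val);
    }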
24 changes: 16 additions & 8 deletions lite/backends/xpu/xpu_quantizer.h
@@ -36,7 +36,8 @@ class XPUQuantizer {
template <typename Tcpu, typename Txpu>
XPUQuantData quant(const Tcpu* cpu_data,
const DDimLite& dims,
-bool data_transpose);
+bool data_transpose,
+size_t max_ptr_len);

private:
template <typename T>
@@ -49,7 +50,8 @@ class XPUQuantizer {
void ConvertWithoutQuant(const T* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key);
+size_t hashed_key,
+size_t max_ptr_len);

template <typename Tcpu,
typename Txpu,
@@ -58,7 +60,8 @@ class XPUQuantizer {
void ConvertWithQuant(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key);
+size_t hashed_key,
+size_t max_ptr_len);

template <typename Tcpu,
typename Txpu,
@@ -67,7 +70,8 @@ class XPUQuantizer {
void ConvertWithQuant(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key);
+size_t hashed_key,
+size_t max_ptr_len);

template <typename Tcpu,
typename Txpu,
@@ -76,8 +80,10 @@ void ConvertWrapper(const Tcpu* cpu_data,
void ConvertWrapper(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key) {
-ConvertWithQuant<Tcpu, Txpu>(cpu_data, dims, data_transpose, hashed_key);
+size_t hashed_key,
+size_t max_ptr_len) {
+ConvertWithQuant<Tcpu, Txpu>(
+cpu_data, dims, data_transpose, hashed_key, max_ptr_len);
}

template <typename Tcpu,
@@ -87,8 +93,10 @@ void ConvertWrapper(const Tcpu* cpu_data,
void ConvertWrapper(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key) {
-ConvertWithoutQuant<Tcpu>(cpu_data, dims, data_transpose, hashed_key);
+size_t hashed_key,
+size_t max_ptr_len) {
+ConvertWithoutQuant<Tcpu>(
+cpu_data, dims, data_transpose, hashed_key, max_ptr_len);
}

// cpu data to xpu quant data
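The two ConvertWrapper overloads are selected at compile time by enable_if predicates carried in the elided template headers above. A generic, hedged sketch of the dispatch pattern; the real predicates are not visible here, so the same-type condition is an assumption for illustration:

    #include <type_traits>

    // Mismatched types (e.g. float -> int16_t): quantization is required,
    // so this overload would forward to the ConvertWithQuant path.
    template <typename Tcpu, typename Txpu,
              typename std::enable_if<!std::is_same<Tcpu, Txpu>::value, int>::type = 0>
    void ConvertWrapperSketch() {}

    // Matching types (e.g. int8_t -> int8_t): data is copied as-is,
    // so this overload would forward to the ConvertWithoutQuant path.
    template <typename Tcpu, typename Txpu,
              typename std::enable_if<std::is_same<Tcpu, Txpu>::value, int>::type = 0>
    void ConvertWrapperSketch() {}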
(Diff of the remaining 51 changed files not shown.)