diff --git a/cmake/backends/xpu.cmake b/cmake/backends/xpu.cmake index 3dd9acb7f3d..8ef1de035e0 100644 --- a/cmake/backends/xpu.cmake +++ b/cmake/backends/xpu.cmake @@ -23,7 +23,7 @@ set (XPU_DOWNLOAD_DIR ${XPU_SOURCE_DIR}/download) set (XPU_INSTALL_DIR ${THIRD_PARTY_PATH}/install) if (NOT XPU_SDK_URL) - set (XPU_SDK_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20220519") + set (XPU_SDK_URL "https://baidu-kunlun-product.su.bcebos.com/klx-sdk/search/20220825") endif () if (NOT XPU_SDK_ENV) diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index e102917fb03..bc3a8bb185c 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -626,8 +626,12 @@ void CxxConfig::set_xpu_multi_encoder_method(const std::string &precision, void CxxConfig::set_xpu_conv_autotune(bool autotune, const std::string &autotune_file) { #ifdef LITE_WITH_XPU - lite::TargetWrapperXPU::conv_autotune = autotune; - lite::TargetWrapperXPU::conv_autotune_file = autotune_file; + LOG(WARNING) + << "This function " + "'set_xpu_conv_autotune' is deprecated, " + "if you want to use autotune, please refer to " + "http://agroup.baidu.com/share/md/f9233d84df11452488a1fdd4f859647f"; + #else LOG(WARNING) << "The invoking of the function " "'set_xpu_conv_autotune' is ignored, please " diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 077a812579d..1af29e74d21 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -472,6 +472,8 @@ class LITE_API CxxConfig : public ConfigBase { void set_xpu_gm_workspace_method(size_t gm_size); + // **DEPRECATED**, use environ variable to enable autotune + // check http://agroup.baidu.com/share/md/f9233d84df11452488a1fdd4f859647f void set_xpu_conv_autotune(bool autotune = true, const std::string& autotune_file = ""); diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index fef39c7c570..bc8687a767d 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -107,6 +107,7 @@ USE_MIR_PASS(__xpu__bigru_fuse_pass); USE_MIR_PASS(__xpu__dynamic_lstm_fuse_pass); USE_MIR_PASS(__xpu__multi_softmax_fuse_pass); USE_MIR_PASS(__xpu__max_pooling_pad_zero_detect_fuse_pass); +USE_MIR_PASS(__xpu__static_kernel_pick_pass); USE_MIR_PASS(x86_int8_attribute_pass); USE_MIR_PASS(fill_range_fuse_pass); USE_MIR_PASS(range_calc_offline_pass); diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc index 3cf33ea02f6..29e26e392a7 100644 --- a/lite/backends/xpu/target_wrapper.cc +++ b/lite/backends/xpu/target_wrapper.cc @@ -43,9 +43,13 @@ void TargetWrapperXPU::MemcpySync(void* dst, template XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose) { + const Tcpu* cpu_data, + const DDimLite& dims, + bool data_transpose, + size_t max_ptr_len) { CHECK(quantizer_.get()); - return quantizer_->quant(cpu_data, dims, data_transpose); + return quantizer_->quant( + cpu_data, dims, data_transpose, max_ptr_len); } void TargetWrapperXPU::ScatterL3Cache( @@ -145,16 +149,16 @@ void TargetWrapperXPU::FreeL3Cache() { template XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - const float*, const DDimLite&, bool); + const float*, const DDimLite&, bool, size_t); template XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - const float*, const DDimLite&, bool); + const float*, const DDimLite&, bool, size_t); template XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - const float*, const DDimLite&, 
bool); + const float*, const DDimLite&, bool, size_t); template XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - const int8_t*, const DDimLite&, bool); + const int8_t*, const DDimLite&, bool, size_t); // xpu context LITE_THREAD_LOCAL std::shared_ptr TargetWrapperXPU::tls_raw_ctx_{ @@ -165,9 +169,6 @@ LITE_THREAD_LOCAL std::shared_ptr TargetWrapperXPU::xpu_stream_{nullptr}; LITE_THREAD_LOCAL std::string TargetWrapperXPU::multi_encoder_precision; // NOLINT LITE_THREAD_LOCAL bool TargetWrapperXPU::multi_encoder_adaptive_seqlen{false}; -// conv autotune config -LITE_THREAD_LOCAL bool TargetWrapperXPU::conv_autotune{false}; -LITE_THREAD_LOCAL std::string TargetWrapperXPU::conv_autotune_file; // l3 cache config LITE_THREAD_LOCAL bool TargetWrapperXPU::need_l3_mutex{false}; LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_l3_size{ diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h index ffc45305682..f02a55e0e2b 100644 --- a/lite/backends/xpu/target_wrapper.h +++ b/lite/backends/xpu/target_wrapper.h @@ -69,7 +69,8 @@ class TargetWrapper { template static XPUQuantData ConvertCPUWeightToXPUQuantWeight(const Tcpu* cpu_data, const DDimLite& dims, - bool data_transpose); + bool data_transpose, + size_t max_ptr_len); static xdnn::Context* GetRawContext() { if (tls_raw_ctx_.get() == nullptr) { @@ -111,14 +112,6 @@ class TargetWrapper { quantizer_.reset(new XPUQuantizer()); } CHECK(quantizer_.get()); - if (conv_autotune) { - tls_raw_ctx_->_xpu1_conv_selector.set_autotune_loop(true); - tls_raw_ctx_->_xpu1_conv_selector.set_inference_mode(true); - } - if (!conv_autotune_file.empty()) { - tls_raw_ctx_->_xpu1_conv_selector.set_autotune_file( - conv_autotune_file.c_str()); - } int devid = -1; uint64_t max_l3_size = 0; XPU_CALL(xpu_current_device(&devid)); @@ -173,9 +166,6 @@ class TargetWrapper { // multi encoder config static LITE_THREAD_LOCAL std::string multi_encoder_precision; // NOLINT static LITE_THREAD_LOCAL bool multi_encoder_adaptive_seqlen; - // conv autotune config - static LITE_THREAD_LOCAL bool conv_autotune; - static LITE_THREAD_LOCAL std::string conv_autotune_file; // NOLINT // l3 cache config static LITE_THREAD_LOCAL bool need_l3_mutex; // model level l3 size static LITE_THREAD_LOCAL size_t local_l3_size; // model level l3 size diff --git a/lite/backends/xpu/xpu_quantizer.cc b/lite/backends/xpu/xpu_quantizer.cc index dd1c24a3869..5e921cf0458 100644 --- a/lite/backends/xpu/xpu_quantizer.cc +++ b/lite/backends/xpu/xpu_quantizer.cc @@ -112,7 +112,8 @@ template < void XPUQuantizer::ConvertWithQuant(const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose, - size_t hashed_key) { + size_t hashed_key, + size_t max_ptr_len) { LOG(FATAL) << "Not support for Tcpu is " << CppTypeToString(); } @@ -123,7 +124,8 @@ template < void XPUQuantizer::ConvertWithQuant(const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose, - size_t hashed_key) { + size_t hashed_key, + size_t max_ptr_len) { // transpose const Tcpu* cpu_ptr = nullptr; int numel = dims.production(); @@ -140,7 +142,7 @@ void XPUQuantizer::ConvertWithQuant(const Tcpu* cpu_data, XPUScratchPadGuard weight_max_guard; XPUScratchPadGuard quant_weight_guard; float max_val = paddle::lite::xpu::math::FindMaxAbs(cpu_ptr, numel); - int max_ptr_size = XPUMemory::get_max_ptr_size(); + size_t max_ptr_size = max_ptr_len; std::vector max_vec(max_ptr_size, max_val); weight_max_guard = std::move(XPUMemory::MallocScratchPad(max_ptr_size * sizeof(float))); @@ -162,11 +164,12 @@ template void 
XPUQuantizer::ConvertWithoutQuant(const T* cpu_data, const DDimLite& dims, bool data_transpose, - size_t hashed_key) { + size_t hashed_key, + size_t max_ptr_len) { // transpose const T* cpu_ptr = nullptr; int numel = dims.production(); - int max_ptr_size = XPUMemory::get_max_ptr_size(); + size_t max_ptr_size = max_ptr_len; std::vector transpose_data(numel, 0); if (data_transpose) { CHECK(dims.size() == 2) << "Not support: dims.size = " << dims.size(); @@ -178,8 +181,9 @@ void XPUQuantizer::ConvertWithoutQuant(const T* cpu_data, } // copy to XPU XPUScratchPadGuard weight_max_guard(new XPUScratchPad(nullptr, 0)); - if (std::is_same::value) { + if (std::is_same::value || std::is_same::value) { // prepare max_w space for slim int8 quant + // just allocate buffer, set max value in kernel weight_max_guard = std::move(XPUMemory::MallocScratchPad(max_ptr_size * sizeof(float))); } @@ -196,7 +200,8 @@ void XPUQuantizer::ConvertWithoutQuant(const T* cpu_data, template XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data, const DDimLite& dims, - bool data_transpose) { + bool data_transpose, + size_t max_ptr_len) { int numel = dims.production(); const std::string cpu_dtype = CppTypeToString(); const std::string xpu_dtype = CppTypeToString(); @@ -206,7 +211,8 @@ XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data, << ", precision=" << precision << ", transpose=" << data_transpose << ", hashed_key=" << hashed_key; if (weight_cache_.find(hashed_key) == weight_cache_.end()) { - ConvertWrapper(cpu_data, dims, data_transpose, hashed_key); + ConvertWrapper( + cpu_data, dims, data_transpose, hashed_key, max_ptr_len); } float* max_ptr = @@ -218,15 +224,19 @@ XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data, template XPUQuantData XPUQuantizer::quant(const float*, const DDimLite&, - bool); + bool, + size_t); template XPUQuantData XPUQuantizer::quant(const float*, const DDimLite&, - bool); + bool, + size_t); template XPUQuantData XPUQuantizer::quant(const float*, const DDimLite&, - bool); + bool, + size_t); template XPUQuantData XPUQuantizer::quant(const int8_t*, const DDimLite&, - bool); + bool, + size_t); } // namespace lite } // namespace paddle diff --git a/lite/backends/xpu/xpu_quantizer.h b/lite/backends/xpu/xpu_quantizer.h index e34a2dbec1d..1f8e21ca390 100644 --- a/lite/backends/xpu/xpu_quantizer.h +++ b/lite/backends/xpu/xpu_quantizer.h @@ -36,7 +36,8 @@ class XPUQuantizer { template XPUQuantData quant(const Tcpu* cpu_data, const DDimLite& dims, - bool data_transpose); + bool data_transpose, + size_t max_ptr_len); private: template @@ -49,7 +50,8 @@ class XPUQuantizer { void ConvertWithoutQuant(const T* cpu_data, const DDimLite& dims, bool data_transpose, - size_t hashed_key); + size_t hashed_key, + size_t max_ptr_len); template (cpu_data, dims, data_transpose, hashed_key); + size_t hashed_key, + size_t max_ptr_len) { + ConvertWithQuant( + cpu_data, dims, data_transpose, hashed_key, max_ptr_len); } template (cpu_data, dims, data_transpose, hashed_key); + size_t hashed_key, + size_t max_ptr_len) { + ConvertWithoutQuant( + cpu_data, dims, data_transpose, hashed_key, max_ptr_len); } // cpu data to xpu quant data diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc new file mode 100644 index 00000000000..d55b9aad45c --- /dev/null +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc @@ -0,0 +1,742 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h" +#include +#include +#include +#include +#include +#include +#include +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif +#include "lite/core/optimizer/mir/graph_visualize_pass.h" +#include "lite/core/optimizer/mir/pass_registry.h" +namespace paddle { +namespace lite { +namespace mir { + +bool XPUKernelScoreCmp(const std::pair>& a, + const std::pair>& b) { + return a.first > b.first; +} + +void XPUStaticKernelPickPass::Apply(const std::unique_ptr& graph) { + kernel_pick_factors_.ConsiderTarget(); + kernel_pick_factors_.ConsiderPrecision(); + kernel_pick_factors_.ConsiderDataLayout(); + CHECK(kernel_pick_factors_.any_factor_considered()) + << "kernel_pick_factors should be specified first"; + CHECK(graph) << "graph not valid"; + +// Collect input data precision for each node in the graph +#ifdef LITE_WITH_XPU + DicideUseFP16Optimizer(graph); + GetXPUDeviceType(); + if (xpu_use_fp16_optimizer_) { + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + if (xpu_special_op_.count(node->AsStmt().op_type())) { + SpecialNodeInputPrecision(node); + continue; + } + + if (xpu_inplace_op_.count(node->AsStmt().op_type())) { + continue; + } + + NodeInputPrecision(node, graph); + } + + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + if (xpu_inplace_op_.count(node->AsStmt().op_type()) == 0) { + continue; + } + + InplaceNodeInputPrecision(node); + } + } +#endif + + // sort kernels by the factors. 
+ VLOG(2) << "graph block_idx: " << graph->blockIdx(); + VLOG(2) << "graph->mutable_nodes().size(): " << graph->mutable_nodes().size(); + size_t idx = 0; + for (auto& node : graph->StmtTopologicalOrder()) { + if (!node->IsStmt()) continue; + auto& instruct = node->AsStmt(); + VLOG(2) << "pick kernel for op : " << instruct.op_type() << ", in block " + << graph->blockIdx() << ", idx : " << idx++; + + std::map in_types; + std::map out_types; + // threse precision info store in __model__ file, if selected fp16 kernel, + // the output precision should be changed + for (std::list::iterator i = node->inlinks.begin(); + i != node->inlinks.end(); + ++i) { + if ((*i)->arg()->type) + in_types[(*i)->arg()->name] = (*i)->arg()->type->precision(); + } + for (std::list::iterator i = node->outlinks.begin(); + i != node->outlinks.end(); + ++i) { + if ((*i)->arg()->type) + out_types[(*i)->arg()->name] = (*i)->arg()->type->precision(); + } + // Get candidate kernels + std::vector>> scored; + CHECK(!instruct.kernels().empty()) << "No kernels found for " + << instruct.op_type(); + + VLOG(2) << "candidate kernels size:" << instruct.kernels().size(); + + for (auto&& kernel : instruct.kernels()) { + VLOG(2) << "current candidate kernel is: " << kernel->summary(); + VLOG(2) << "valid_places size is: " << graph->valid_places().size(); + + float score = KernelGrade(node, + *kernel, + graph->valid_places(), + in_types, + out_types, + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); + + scored.emplace_back(score, std::move(kernel)); + } + std::stable_sort(scored.begin(), scored.end(), XPUKernelScoreCmp); + instruct.kernels().clear(); + + if (!instruct.op_info()->HasAttr("enable_int8")) { +#ifdef LITE_WITH_XPU + if (xpu_use_fp16_optimizer_) { + if (xpu_special_op_.count(node->AsStmt().op_type())) { + SpecialNodeOutputPrecision(graph, node, scored.front().second); + } else if (xpu_inplace_op_.count(node->AsStmt().op_type())) { + InplaceNodeOutputPrecision(node->AsStmt(), + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); + } else { + NodeOutputPrecision(graph, node); + } + } +#endif + + instruct.kernels().emplace_back(std::move(scored.front().second)); + VLOG(2) << "the final pick kernel is " + << instruct.kernels().front()->summary() << "\n\n"; + } else { + // TODO(quwei): consider XPU int8 data precision + bool out_type_int8 = true; + // Quantized lstm has fp32 output + if (instruct.op_type() == "lstm" || instruct.op_type() == "gru" || + instruct.op_type() == "__xpu__multi_encoder" || + instruct.op_type() == "__xpu__fc") { + out_type_int8 = false; + } + // Only if all ops linked to this op output has enable_int8 attr, + // then the op output type is int8, or fp32. + // Note, the quantized op linked to lstm and gru should output fp32 + // tensor. + for (auto* out_n : node->outlinks) { + CHECK(out_n->IsArg()); + for (auto* tmp_op : out_n->outlinks) { + CHECK(tmp_op->IsStmt()); + auto* tmp_op_info = tmp_op->AsStmt().op_info(); + if (!tmp_op_info->HasAttr("enable_int8") || + tmp_op_info->Type() == "lstm" || tmp_op_info->Type() == "gru" || + instruct.op_type() == "__xpu__multi_encoder" || + instruct.op_type() == "__xpu__fc") { + out_type_int8 = false; + break; + } + } + if (!out_type_int8) break; + } + // If the out_type_int8 is true, it turns out that the output type of + // this + // op can be int8. + // So we need to specify output scale for this op. 
+ if (out_type_int8) { + auto out_node = node->outlinks.front(); + CHECK(out_node->IsArg()); + auto out_node_name = out_node->arg()->name; + auto one_adj_op_node = out_node->outlinks.front(); + CHECK(one_adj_op_node->IsStmt()); + auto& one_adj_instruct = one_adj_op_node->AsStmt(); + CHECK(one_adj_instruct.op_info()->HasAttr("enable_int8")); + CHECK(one_adj_instruct.op_info()->HasInputScale(out_node_name)); + + instruct.mutable_op_info()->SetOutputScale( + out_node_name, + one_adj_instruct.op_info()->GetInputScale(out_node_name)); + + auto update_desc = *instruct.mutable_op_info(); + instruct.ResetOp(update_desc, graph->valid_places()); + scored.clear(); + for (auto&& kernel : instruct.kernels()) { + float score = KernelGrade(node, + *kernel, + graph->valid_places(), + in_types, + out_types, + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); + scored.emplace_back(score, std::move(kernel)); + } + std::stable_sort(scored.begin(), scored.end(), XPUKernelScoreCmp); + instruct.kernels().clear(); + } + // If the out_type_int8 is true, we should pick the kernel with the + // int8 input and int8 output. + // If the out_type_int8 is false, we should pick the kernel with the + // int8 input and fp32 output. + auto output_arguments = instruct.op_info()->OutputArgumentNames(); + for (auto& candidate : scored) { + bool all_output_type_match = true; + auto expect_output_type = + out_type_int8 ? PRECISION(kInt8) : PRECISION(kFloat); + + for (auto& arg_name : output_arguments) { + const Type* out_arg_ty = + candidate.second->GetOutputDeclType(arg_name); + if (out_arg_ty->precision() != expect_output_type) { + all_output_type_match = false; + } + } + + if (all_output_type_match) { + instruct.kernels().emplace_back(std::move(candidate.second)); + VLOG(2) << "instruct.kernels.emplace_back " + << instruct.kernels().front()->name(); + break; + } + } + CHECK(!instruct.kernels().empty()) << "No kernels found for " + << instruct.op_type(); + } + } +} + +#ifdef LITE_WITH_XPU +void XPUStaticKernelPickPass::DicideUseFP16Optimizer( + const std::unique_ptr& graph) { + if (GetStringFromEnv("XPUForceUseFP16", "false") == "true") { + xpu_use_fp16_optimizer_ = false; + VLOG(2) << "XPU force use data precision: FP16 "; + return; + } + + if (graph->valid_places()[0].precision == PrecisionType::kFP16) { + xpu_use_fp16_optimizer_ = true; + VLOG(2) << "XPU auto use data precision: FP16/FP32/INT16 "; + } +} + +void XPUStaticKernelPickPass::ForceUseFP32Kernel( + size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct) { + if (kernel.place().target != TARGET(kXPU)) { + return; + } + + // only use in FC,it will not use in future. 
+ if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || + lite::TargetWrapperXPU::multi_encoder_precision == "int31") { + if (kernel.alias() == "XPU_Real_kFloat" && + instruct.op_type() == "__xpu__fc") { + *score *= 2; + VLOG(6) << "__xpu__fc: force use PRECISON INT31: *2"; + } + return; + } + + if (GetStringFromEnv("XPU_COMPUTE_PRECISION", "int16") == "int31") { + if (kernel.alias() == "XPU_Real_kFloat" && + PRECISION_INT31_OP_.count(instruct.op_type())) { + *score *= 2; + VLOG(6) << instruct.op_type() << ": force use PRECISON INT31: *2"; + } + return; + } + + if (kernel.alias() == "XPU_Real_kFloat") { + *score = 0; + VLOG(6) << "By default,XPU not use PRECISION INT31, so not pick " + "current kernel: " + << kernel.summary(); + } +} + +void XPUStaticKernelPickPass::ForceUseInt8Kernel( + size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct) { + if (kernel.place().target != TARGET(kXPU)) { + return; + } + + // only use in FC,it will not use in future. + if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int8" || + lite::TargetWrapperXPU::multi_encoder_precision == "int8") { + if (kernel.alias() == "XPU_Int8_FP32_FP32" && + instruct.op_type() == "__xpu__fc") { + *score *= 2; + VLOG(6) << "__xpu__fc: force use PRECISON INT8: *2"; + } + return; + } + + if (GetStringFromEnv("XPU_COMPUTE_PRECISION", "int16") == "int8") { + if (kernel.alias() == "XPU_Int8_FP32_FP32" && + PRECISION_INT8_OP_.count(instruct.op_type())) { + *score *= 2; + VLOG(6) << instruct.op_type() << ": force use PRECISON INT8: *2"; + } + return; + } + + if (kernel.alias() == "XPU_Int8_FP32_FP32") { + *score = 0; + VLOG(6) << "By default,XPU not use PRECISION INT8, so not pick " + "current kernel: " + << kernel.summary(); + } +} + +void XPUStaticKernelPickPass::GetScore(PrecisionType precision, + size_t* score_tmp) { + if (precision == PrecisionType::kInt16) { + *score_tmp = *score_tmp > 9 ? *score_tmp : 9; + } else if (precision == PrecisionType::kFP16) { + *score_tmp = *score_tmp > 7 ? *score_tmp : 7; + } else if (precision == PrecisionType::kAny) { + *score_tmp = *score_tmp > 1 ? *score_tmp : 1; + } else { + *score_tmp = *score_tmp > 5 ? 
*score_tmp : 5; + } +} + +void XPUStaticKernelPickPass::NodeOutputPrecision( + const std::unique_ptr& graph, lite::mir::Node* node) { + auto& inst = node->AsStmt(); + if (inst.op_type() == "fetch") { + return; + } + + const auto* op_info = inst.op_info(); + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument,current var name : " << var_name; + VLOG(6) << " output arg name:" << arg_name << " var name:" << var_name; + Scope* scope = node->AsStmt().op()->scope(); + auto* var_ptr = scope->FindVar(var_name); + if (var_ptr == nullptr) { + VLOG(6) << "Can't find ouput var_name: " << var_name + << "in current scope."; + continue; + } + + PrecisionType precison = var_ptr->GetMutable()->precision(); + xpu_output_type_.emplace(var_name, precison); + } +} + +void XPUStaticKernelPickPass::SpecialNodeOutputPrecision( + const std::unique_ptr& graph, + lite::mir::Node* node, + const std::unique_ptr& kernel) { + auto& inst = node->AsStmt(); + + std::vector out_var_names; + const auto* op_info = inst.op_info(); + for (auto* out_node : node->outlinks) { + auto& var = out_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + + CHECK(op_info->GetOutputArgname(var_name, &arg_name)) + << "Can not find the output argument, current var name : " << var_name; + VLOG(6) << " output arg name:" << arg_name << " var name:" << var_name; + if (output_parameter_name_.count(arg_name) == 0) { + continue; + } + + const auto* decl_type = kernel->GetOutputDeclType(arg_name); + CHECK(decl_type); + PrecisionType precison = decl_type->precision(); + xpu_output_type_.emplace(var_name, precison); + } +} + +void XPUStaticKernelPickPass::InplaceNodeOutputPrecision( + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names) { + PrecisionType pre_op_output_precision = PrecisionType::kUnk; + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + VLOG(6) << "current kernel input data variable name:" << in_names[i] + << "Parameter name:" << tmp; + if (input_parameter_name_.count(tmp) && + xpu_output_type_.count(in_names[i])) { + pre_op_output_precision = xpu_output_type_[in_names[i]]; + } + } + + // collect inplace op output data precision + if (pre_op_output_precision != PrecisionType::kUnk) { + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + if (output_parameter_name_.count(tmp)) { + xpu_output_type_.emplace(out_names[i], pre_op_output_precision); + } + } + } +} + +// Special nodes like conv2d, matmul ; collect input data precision for eatch +// registry kernel as a candidate set. 
+void XPUStaticKernelPickPass::SpecialNodeInputPrecision(lite::mir::Node* node) { + auto& inst = node->AsStmt(); + const auto* op_info = inst.op_info(); + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument,current var name : " << var_name; + VLOG(6) << " input arg name:" << arg_name << " var name:" << var_name; + if (input_parameter_name_.count(arg_name) == 0) { + continue; + } + + std::vector> kernel_input_type{}; + for (auto&& kernel : inst.kernels()) { + if (kernel->summary().find(xpu_disable_flag_) != std::string::npos) { + VLOG(6) << " ignore collect current kernel:" << kernel->summary(); + continue; + } + + std::map tmp_map; + PrecisionType precison; + + const auto* decl_type = kernel->GetInputDeclType(arg_name); + CHECK(decl_type); + precison = decl_type->precision(); + tmp_map.emplace(kernel->summary(), precison); + kernel_input_type.emplace_back(std::move(tmp_map)); + } + + xpu_input_type_.emplace(var_name, kernel_input_type); + } +} + +void XPUStaticKernelPickPass::NodeInputPrecision( + lite::mir::Node* node, const std::unique_ptr& graph) { + auto& inst = node->AsStmt(); + if (inst.op_type() == "feed") { + return; + } + + const auto* op_info = inst.op_info(); + for (auto* in_node : node->inlinks) { + auto& var = in_node->AsArg(); + const auto& var_name = var.name; + std::string arg_name; + CHECK(op_info->GetInputArgname(var_name, &arg_name)) + << "Can not find the input argument,current var name : " << var_name; + VLOG(6) << " input arg name:" << arg_name << " var name:" << var_name; + + std::vector> kernel_input_type{}; + std::map tmp_map; + PrecisionType precison; + Scope* scope = node->AsStmt().op()->scope(); + + auto* var_ptr = scope->FindVar(var_name); + if (var_ptr == nullptr) { + VLOG(6) << "Can't find input var_name: " << var_name + << "in current scope."; + continue; + } + + precison = var_ptr->GetMutable()->precision(); + tmp_map.emplace(inst.op_type(), precison); + kernel_input_type.emplace_back(std::move(tmp_map)); + xpu_input_type_.emplace(var_name, kernel_input_type); + } +} + +// Special for inplace op. +void XPUStaticKernelPickPass::InplaceNodeInputPrecision(lite::mir::Node* node) { + auto& inst = node->AsStmt(); + const auto* op_info = inst.op_info(); + // inplace op only has one inpute variable. 
+  std::string inplace_op_input_name{"none"};
+  for (auto* in_node : node->inlinks) {
+    auto& var = in_node->AsArg();
+    const auto& var_name = var.name;
+    std::string arg_name;
+    CHECK(op_info->GetInputArgname(var_name, &arg_name))
+        << "Can not find the input argument,current var name : " << var_name;
+    VLOG(6) << " input arg name:" << arg_name << " var name:" << var_name;
+    if (input_parameter_name_.count(arg_name)) {
+      inplace_op_input_name = var_name;
+    }
+  }
+
+  for (auto* out_node : node->outlinks) {
+    auto& var = out_node->AsArg();
+    const auto& var_name = var.name;
+    std::string arg_name;
+    int num = 0;
+
+    CHECK(op_info->GetOutputArgname(var_name, &arg_name))
+        << "Can not find the output argument,current var name : " << var_name;
+    VLOG(6) << " output arg name:" << arg_name << " var name:" << var_name;
+    // An inplace op has only one output variable, but it can connect to the
+    // input variables of multiple ops.
+    int output_match_num = xpu_input_type_.count(var_name);
+    if (output_parameter_name_.count(arg_name) == 0 || output_match_num == 0) {
+      continue;
+    }
+
+    for (auto iter = xpu_input_type_.begin(); iter != xpu_input_type_.end();
+         ++iter) {
+      if (num >= output_match_num) {
+        break;
+      }
+
+      if (iter->first != var_name) {
+        continue;
+      }
+
+      ++num;
+      xpu_input_type_.emplace(inplace_op_input_name, iter->second);
+    }
+    VLOG(6) << "inplace op :" << inst.op_type() << " input precision"
+            << " replaced by the next op's input precision";
+    VLOG(6) << "inplace op :" << inst.op_type()
+            << ", input name:" << inplace_op_input_name
+            << ", the next op's input name: " << var_name;
+  }
+}
+
+void XPUStaticKernelPickPass::InplaceOpScore(
+    const lite::KernelBase& kernel,
+    const paddle::lite::mir::Node::Stmt& instruct,
+    const std::vector<std::string>& in_names,
+    const std::vector<std::string>& out_names,
+    bool* type_match,
+    size_t* score) {
+  PrecisionType pre_op_output_precision = PrecisionType::kUnk;
+  for (size_t i = 0; i < in_names.size(); ++i) {
+    std::string tmp;
+    CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp));
+    VLOG(6) << "current kernel input data variable name:" << in_names[i]
+            << "Parameter name:" << tmp;
+    if (input_parameter_name_.count(tmp) &&
+        xpu_output_type_.count(in_names[i])) {
+      size_t score_tmp = 0;
+      pre_op_output_precision = xpu_output_type_[in_names[i]];
+      if (kernel.GetInputDeclType(tmp)->precision() == PrecisionType::kAny) {
+        GetScore(PrecisionType::kAny, &score_tmp);
+        VLOG(6) << "current inplace kernel input data precision:kAny";
+      }
+
+      if (pre_op_output_precision ==
+              kernel.GetInputDeclType(tmp)->precision() ||
+          pre_op_output_precision == PrecisionType::kAny) {
+        GetScore(pre_op_output_precision, &score_tmp);
+        *type_match = true;
+        VLOG(6) << "inplace op match input data precision";
+      }
+
+      *score += score_tmp;
+    }
+  }
+
+  // collect inplace op output data precision
+  if (pre_op_output_precision != PrecisionType::kUnk) {
+    for (size_t i = 0; i < out_names.size(); ++i) {
+      std::string tmp;
+      CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp));
+      if (output_parameter_name_.count(tmp)) {
+        xpu_output_type_.emplace(out_names[i], pre_op_output_precision);
+      }
+    }
+  }
+}
+
+void XPUStaticKernelPickPass::SpecialOpScore(
+    const lite::KernelBase& kernel,
+    const paddle::lite::mir::Node::Stmt& instruct,
+    const std::vector<std::string>& in_names,
+    const std::vector<std::string>& out_names,
+    bool* type_match,
+    size_t* score) {
+  size_t score_tmp_all = 0;
+  bool input_match = true;
+  bool output_match = true;
+  bool consider_cpu = false;
+  // delete??
+  if (consider_cpu_op_.count(instruct.op_type())) {
+    consider_cpu = true;
+  }
+
+  if (!(kernel.place().target == TARGET(kXPU) || consider_cpu)) {
+    return;
+  }
+
+  // input data precision score
+  for (size_t i = 0; i < in_names.size(); ++i) {
+    std::string tmp;
+    CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp));
+    if (input_parameter_name_.count(tmp) == 0) {
+      continue;
+    }
+
+    if (xpu_output_type_.count(in_names[i]) == 0) {
+      continue;
+    }
+
+    VLOG(6) << "current kernel input data variable name:" << in_names[i]
+            << ", Parameter name:" << tmp;
+
+    size_t score_tmp = 0;
+    if (kernel.GetInputDeclType(tmp)->precision() == PrecisionType::kAny) {
+      GetScore(PrecisionType::kAny, &score_tmp);
+      VLOG(6) << "match input data precision:kAny";
+    }
+
+    if (xpu_output_type_[in_names[i]] ==
+            kernel.GetInputDeclType(tmp)->precision() ||
+        xpu_output_type_[in_names[i]] == PrecisionType::kAny) {
+      GetScore(xpu_output_type_[in_names[i]], &score_tmp);
+      VLOG(6) << "match input data precision";
+    }
+
+    if (score_tmp == 0) {
+      output_match = false;
+    }
+
+    score_tmp_all += score_tmp;
+  }
+
+  // output data precision score
+  for (size_t i = 0; i < out_names.size(); ++i) {
+    std::string tmp;
+    CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp));
+    int output_match_num = xpu_input_type_.count(out_names[i]);
+    if (output_parameter_name_.count(tmp) == 0) {
+      continue;
+    }
+
+    if (output_match_num == 0) {
+      continue;
+    }
+
+    VLOG(6) << "current kernel output data variable name:" << out_names[i]
+            << ", Parameter name:" << tmp;
+    int num = 0;
+    size_t score_tmp = 0;
+    for (auto iter = xpu_input_type_.begin(); iter != xpu_input_type_.end();
+         ++iter) {
+      if (num >= output_match_num) {
+        break;
+      }
+
+      if (iter->first != out_names[i]) {
+        continue;
+      }
+
+      ++num;
+      for (auto& map_kernel : iter->second) {
+        // Special op fetch
+        if (map_kernel.begin()->first.substr(0, 5) == "fetch") {
+          if (map_kernel.begin()->second ==
+              kernel.GetOutputDeclType(tmp)->precision()) {
+            score_tmp = 500;
+          }
+          continue;
+        }
+
+        if (kernel.GetOutputDeclType(tmp)->precision() ==
+            PrecisionType::kAny) {
+          VLOG(6) << "match precision:kAny,the next kernel's name:"
+                  << map_kernel.begin()->first;
+          GetScore(PrecisionType::kAny, &score_tmp);
+        }
+
+        if (map_kernel.begin()->second ==
+                kernel.GetOutputDeclType(tmp)->precision() ||
+            map_kernel.begin()->second == PrecisionType::kAny) {
+          VLOG(6) << "match next kernel's input data precision,the "
+                     "next kernel name:"
+                  << map_kernel.begin()->first;
+          GetScore(map_kernel.begin()->second, &score_tmp);
+        }
+      }
+    }
+
+    if (score_tmp == 0) {
+      output_match = false;
+    }
+    score_tmp_all += score_tmp;
+  }
+
+  if (score_tmp_all > 0) {
+    *type_match = input_match & output_match;
+  }
+
+  *score += score_tmp_all;
+}
+
+void XPUStaticKernelPickPass::GetXPUDeviceType() {
+  int cur_dev_idx = 0;
+  uint64_t cur_dev_attr = 0;
+
+  XPU_CALL(xpu_current_device(&cur_dev_idx));
+  XPU_CALL(xpu_device_get_attr(&cur_dev_attr, XPUATTR_MODEL, cur_dev_idx));
+  if (cur_dev_attr <= 1) {
+    VLOG(4) << "Current XPU device: XPU1";
+    xpu_disable_flag_ = "DISABLE_XPU1";
+  } else if (cur_dev_attr >= 2 && cur_dev_attr <= 299) {
+    VLOG(4) << "Current XPU device: XPU2";
+    xpu_disable_flag_ = "DISABLE_XPU2";
+  } else if (cur_dev_attr >= 300 && cur_dev_attr <= 599) {
+    VLOG(4) << "Current XPU device: XPU3";
+    xpu_disable_flag_ = "DISABLE_XPU3";
+  } else {
+    VLOG(4) << "invalid XPU device";
+    xpu_disable_flag_ = "NONE";
+  }
+}
+
+#endif
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__static_kernel_pick_pass, + paddle::lite::mir::XPUStaticKernelPickPass) + .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h new file mode 100644 index 00000000000..38f786b5216 --- /dev/null +++ b/lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h @@ -0,0 +1,344 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include +#include +#include +#include +#include +#include "lite/core/optimizer/mir/pass.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace mir { + +/* + * XPUStaticKernelPickPass is a simple strategy for picking the kernel for each + * Operator using operator developer defined rule, there are many other tactics + * such as considering IO or kernel execution latency and we will implement them + * latter. + * + * There are two argument for this pass: + * - place, the target place. + * - kernel_pick_factors, the factors to consider in picking kernels. + * Set them first before execute the pass. + */ +class XPUStaticKernelPickPass : public mir::StmtPass { + public: + void Apply(const std::unique_ptr& graph) override; + + const core::KernelPickFactor& kernel_pick_factors() const { + return kernel_pick_factors_; + } + core::KernelPickFactor* mutable_kernel_pick_factors() { + return &kernel_pick_factors_; + } + + private: + // Score the kernel. + size_t KernelGrade(lite::mir::Node* node, + const lite::KernelBase& kernel, + const std::vector& places, + const std::map& in_types, + const std::map& out_types, + const std::vector& in_names, + const std::vector& out_names) { + const auto& instruct = node->AsStmt(); + CHECK_GT(places.size(), static_cast(0)) << "valid_places is empty."; + float final_score{-1.}; + Place winner_place{places[0]}; + const int kMax = + (std::numeric_limits::max)(); + size_t place_size = places.size(); + + // NOTE: We compare kernel's place with place in valid_places to select the + // best match place + // The place's order in valid_places array decide the user's + // preference + // final_score = weight * socre + // weight: The weight is compute with (valid_places.size() - i) / + // valid_places.size() as default. + // where i is the place's index in valid_places array. 
+ // score: score is the weighted sum of target、percision and layout + for (size_t i = 0; i < place_size; ++i) { + const auto& place = places[i]; + float weight = static_cast(place_size - i) / place_size; + VLOG(4) << "current place is " << place.DebugString() << ", idx : " << i + << ", weight : " << weight; + size_t score{}; + + // The more important factor comes first + if (kernel_pick_factors_.IsTargetConsidered() && + (place.target == kernel.target() || kernel.target() == TARGET(kAny) || + place.target == TARGET(kAny))) { + size_t target_score = + kMax / + static_cast(core::KernelPickFactor::Factor::TargetFirst); + score += target_score; + VLOG(4) << "[TargetConsidered score]:" << target_score; + } + VLOG(4) << "[score s1]:" << score; + + if (kernel_pick_factors_.IsPrecisionConsidered() && + (place.precision == kernel.precision() || + kernel.precision() == PRECISION(kAny) || + place.precision == PRECISION(kAny) || + // fp16 may also pick FP32 kernel preciison + (xpu_use_fp16_optimizer_ && + kernel.precision() == PRECISION(kFloat) && + place.precision == PRECISION(kFP16)))) { + // score skipped, if kernel is int8, but op is not int8 + if (!(kernel.precision() == PRECISION(kInt8) && + !instruct.op_info()->HasAttr("enable_int8"))) { + size_t precision_score = + kMax / + static_cast(core::KernelPickFactor::Factor::PrecisionFirst); + score += precision_score; + VLOG(4) << "[PrecisionConsidered score]:" << precision_score; + } + } + VLOG(4) << "[score s2]:" << score; + + if (kernel_pick_factors_.IsDataLayoutConsidered() && + (place.layout == kernel.layout() || + kernel.layout() == DATALAYOUT(kAny) || + place.layout == DATALAYOUT(kAny))) { + size_t datalayout_score = + kMax / + static_cast(core::KernelPickFactor::Factor::DataLayoutFirst); + score += datalayout_score; + VLOG(4) << "[DataLayoutConsidered score]:" << datalayout_score; + } + VLOG(4) << "[score s3]:" << score; + + // add new rules for precision: When the input types are consistent with + // kernel's input types, select the kernel of the precision. However, if + // the op is feed, we should compare the output precision type. + // Note that this strategy is not compatible with quantization, so skip + // quantization op. 
+ if (!instruct.op_info()->HasAttr("enable_int8")) { + bool type_match = true; + if (instruct.op_type() == "feed") { + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + if (out_types.count(out_names[i]) && + out_types.at(out_names[i]) != + kernel.GetOutputDeclType(tmp)->precision()) { + type_match = false; + } + } + } else { + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + if (in_types.count(in_names[i]) && + !PrecTypeCompatible( + in_types.at(in_names[i]), + kernel.GetInputDeclType(tmp)->precision())) { + type_match = false; + } + } + } +#ifdef LITE_WITH_XPU + if (xpu_use_fp16_optimizer_ && + (xpu_special_op_.count(instruct.op_type()) || + xpu_inplace_op_.count(instruct.op_type()))) { + type_match = false; + if (kernel.summary().find(xpu_disable_flag_) != std::string::npos) { + score = 0; + VLOG(6) << " ignore pick current kernel:" << kernel.summary(); + } else if (xpu_inplace_op_.count(instruct.op_type())) { + InplaceOpScore( + kernel, instruct, in_names, out_names, &type_match, &score); + } else { + SpecialOpScore( + kernel, instruct, in_names, out_names, &type_match, &score); + } + } +#endif + + if (type_match) { + score *= 2; + VLOG(4) << "[Input/Output precision compatible]: *2"; + } + VLOG(4) << "[score s4]:" << score; + } +#ifdef LITE_WITH_XPU + ForceUseFP32Kernel(&score, kernel, instruct); + ForceUseInt8Kernel(&score, kernel, instruct); +#endif + + // add new rules for datatype: When the input types are consistent with + // kernel's input types, select the kernel of the datatype. + if (instruct.op_info()->Type() != "conditional_block" && + instruct.op_info()->Type() != "while" && + instruct.op_info()->Type() != "subgraph") { + bool datatype_match = true; + for (auto* in : node->inlinks) { + if (!in->IsArg()) continue; + if (in->AsArg().name == "feed" || in->AsArg().is_persist) continue; + std::string argname; + instruct.op_info()->GetInputArgname(in->AsArg().name, &argname); + VLOG(5) << "intput var name : " << in->AsArg().name; + // only when datatype is LOD_TENSOR, LOD_TENSOR_ARRAY, STEP_SCOPES, + // the type pointer is not null; + if (in->AsArg().type) { + VLOG(5) << "input datatype : " + << static_cast(in->AsArg().type->id()); + VLOG(5) << "kernel bind datatype : " + << static_cast(kernel.GetInputDeclType(argname)->id()); + if (static_cast(in->AsArg().type->id()) != + static_cast(kernel.GetInputDeclType(argname)->id())) + datatype_match = false; + } else { + datatype_match = false; + } + } + if (datatype_match) { + score *= 2; + VLOG(4) << "[Input datatype compatible]: *2"; + } + VLOG(4) << "[score s5]:" << score; + } + + if (weight * score > final_score) { + final_score = weight * score; + winner_place = place; + } + } + + VLOG(2) << "-------- score summary for candidate kernel : " + << kernel.summary() << " --------"; + VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + << " " << DataLayoutToStr(winner_place.layout) << " " + << TargetToStr(winner_place.target); + VLOG(2) << " ===> kernel.place():" + << PrecisionToStr(kernel.place().precision) << " " + << DataLayoutToStr(kernel.place().layout) << " " + << TargetToStr(kernel.place().target); + VLOG(4) << "kernel.op_type():" << kernel.op_type(); + VLOG(4) << "kernel picker factors:" << kernel_pick_factors_; + VLOG(4) << "winner_picker place:" << winner_place.DebugString(); + VLOG(4) << "[score(final)]:" << final_score; + VLOG(4) << 
"------------------------------"; + + // The data layout is not considered, for the input and output arguments + // might have different data layout. + // TODO(Superjomn) reconsider the idea of taking the data layout as a kernel + // specification. + return final_score; + } + + // Compatible for PrecisionType. + // For cuda, in the process of choosing kernel, fp16 and fp32 are compatiable. + // If kernel's declared type is kAny, it is matched. + bool PrecTypeCompatible(const PrecisionType& p1, const PrecisionType& p2) { + if (p1 == p2 || p2 == PRECISION(kAny)) { + return true; + } else if ((p1 == PRECISION(kFP16) || p1 == PRECISION(kFloat)) && + (p2 == PRECISION(kFP16) || p2 == PRECISION(kFloat))) { + return true; + } else { + return false; + } + } +#ifdef LITE_WITH_XPU + void DicideUseFP16Optimizer(const std::unique_ptr& graph); + void ForceUseFP32Kernel(size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct); + void ForceUseInt8Kernel(size_t* score, + const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct); + void GetScore(PrecisionType precision, size_t* score_tmp); + + void NodeInputPrecision(lite::mir::Node* node, + const std::unique_ptr& graph); + void InplaceNodeInputPrecision(lite::mir::Node* node); + void SpecialNodeInputPrecision(lite::mir::Node* node); + + void NodeOutputPrecision(const std::unique_ptr& graph, + lite::mir::Node* node); + void InplaceNodeOutputPrecision(const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names); + void SpecialNodeOutputPrecision( + const std::unique_ptr& graph, + lite::mir::Node* node, + const std::unique_ptr& kernel); + + void SpecialOpScore(const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names, + bool* type_match, + size_t* score); + void GetXPUDeviceType(); + void InplaceOpScore(const lite::KernelBase& kernel, + const paddle::lite::mir::Node::Stmt& instruct, + const std::vector& in_names, + const std::vector& out_names, + bool* type_match, + size_t* score); +#endif + + private: + core::KernelPickFactor kernel_pick_factors_; + + bool xpu_use_fp16_optimizer_{false}; +#ifdef LITE_WITH_XPU + // TODO(quwei:) addn more op + const std::set PRECISION_INT31_OP_{"__xpu__fc"}; + const std::set PRECISION_INT8_OP_{"__xpu__fc"}; + const std::set input_parameter_name_{ + "Input", "X", "Y", "Branch", "BBoxes", "Scores", "repeat_times_tensor"}; + const std::set output_parameter_name_{ + "Output", "Out", "Boxes", "Scores", "Y"}; + std::multimap>> + xpu_input_type_{}; + std::map xpu_output_type_{}; + std::string xpu_disable_flag_{}; + const std::set consider_cpu_op_{"cast"}; + const std::set xpu_special_op_{"__xpu__fc", + "conv3d", + "__xpu__conv2d", + "gather", + "pool2d", + "concat", + "calib", + "relu", + "tanh", + "sigmoid", + "leaky_relu", + "conv2d_transpose", + "elementwise_mul", + "elementwise_add", + "reduce_mean"}; + const std::set xpu_inplace_op_{"reshape", + "reshape2", + "flatten", + "flatten2", + "squeeze", + "squeeze2", + "unsqueeze", + "unsqueeze2"}; +#endif +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc index 0e3f3b0335d..74b1e72a974 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc @@ -82,27 +82,6 @@ class XPUFcFuser : 
public FuseBase { op_desc.SetInput("Input", {matched.at("x")->arg()->name}); op_desc.SetInput("Filter", {matched.at("W")->arg()->name}); - std::string precision = "int16"; -#ifdef LITE_WITH_XPU - if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || - lite::TargetWrapperXPU::multi_encoder_precision == "int31") { - precision = "int31"; - VLOG(3) << "Use int31 in XPUFcOp"; - } else if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int8" || - lite::TargetWrapperXPU::multi_encoder_precision == "int8") { - precision = "int8"; - if (op_desc.HasAttr("enable_int8") && - op_desc.GetAttr("enable_int8")) { - CHECK(op_desc.HasAttr("X0_scale")) << " quant model fc no X0_scale"; - CHECK(op_desc.HasAttr("Y0_scale")) << " quant model fc no Y0_scale"; - VLOG(3) << "Use int8 quant model in XPUFcOp, InputMax:" - << 127 * op_desc.GetAttr>("X0_scale")[0] - << ", WeightMax: " - << 127 * op_desc.GetAttr>("Y0_scale")[0]; - } - VLOG(3) << "Use int8 in XPUFcOp"; - } -#endif if (with_bias_) { op_desc.SetInput("Bias", {matched.at("bias")->arg()->name}); } @@ -118,8 +97,48 @@ class XPUFcFuser : public FuseBase { output_name = matched.at("mul_out")->arg()->name; output_node_name = "mul_out"; } + bool per_channel = false; + int weight_scale_size = 1; + auto* op_info = matched.at("mul")->stmt()->op_info(); + auto mul_input_y_name = op_info->Input("Y").front(); + auto mul_y_shape = scope->FindMutableTensor(mul_input_y_name)->dims(); + CHECK_EQ(mul_y_shape.size(), 2) << "mul_y_shape.size: " + << mul_y_shape.size(); + const bool quant = op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8"); + op_desc.SetAttr("enable_int8", quant); + // X0_scale is already in op_desc when copy from mul + if (quant) { + CHECK(op_info->HasAttr("Y0_scale")) << "quant model no Y0_scale"; + weight_scale_size = + op_info->GetAttr>("Y0_scale").size(); + CHECK_EQ(weight_scale_size, mul_y_shape[1]) + << "weight_scale_size: " << weight_scale_size + << ", mul_y_shape:" << mul_y_shape; + CHECK_GE(weight_scale_size, 1) << weight_scale_size; + std::vector weight_max; + if (is_per_tensor(op_info->GetAttr>("Y0_scale"))) { + per_channel = false; + VLOG(3) << "xpu fc per tensor"; + weight_max.push_back( + op_info->GetAttr>("Y0_scale")[0] * 127); + } else { + per_channel = true; + VLOG(3) << "xpu fc per channel, first channel max:" + << op_info->GetAttr>("Y0_scale")[0] * 127 + << ", last channel max: " + << op_info->GetAttr>( + "Y0_scale")[weight_scale_size - 1] * + 127; + for (auto wm : op_info->GetAttr>("Y0_scale")) { + weight_max.push_back(wm * 127); + } + } + VLOG(3) << "weight_max size:" << weight_max.size(); + op_desc.SetAttr>("Y0_max", weight_max); + op_desc.SetAttr("per_channel", per_channel); + } op_desc.SetOutput("Output", {output_name}); - op_desc.SetAttr("precision", precision); std::map act_map{{"linear", 0}, {"relu", 1}, {"sigmoid", 2}, @@ -169,6 +188,19 @@ class XPUFcFuser : public FuseBase { private: bool with_bias_; std::string act_type_; + std::string mul_type_; + bool is_per_tensor(const std::vector& weight_max) { + bool per_tensor = true; + CHECK_GT(weight_max.size(), 0) << "fc channel size: " << weight_max.size(); + auto first = weight_max[0]; + for (int i = 1; i < weight_max.size(); ++i) { + if (std::abs(first - weight_max[i]) > 1e-6) { + per_tensor = false; + break; + } + } + return per_tensor; + } }; } // namespace fusion diff --git a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc 
index f15667422bb..95bc14151e5 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc @@ -61,14 +61,14 @@ namespace fusion { class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase { public: explicit XPUMultiEncoderAdaptiveSeqlenFuser( - const std::string& matmul_type = "matmul") - : matmul_type_(matmul_type) {} + const std::string& matmul_type = "matmul", bool pre_ln = false) + : matmul_type_(matmul_type), pre_ln_(pre_ln) {} void BuildPattern() override { auto* mask = VarNode("mask") ->assert_is_op_input(matmul_type_, "X") ->assert_is_op_input(matmul_type_, "Y"); - auto* matmul = OpNode("matmul", matmul_type_)->AsIntermediate(); + auto* matmul = OpNode(matmul_type_, matmul_type_)->AsIntermediate(); auto* matmul_out = VarNode("matmul_out") ->assert_is_op_input("scale", "X") ->assert_is_op_output(matmul_type_, "Out") @@ -85,20 +85,37 @@ class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase { ->AsIntermediate(); auto* xpu_embedding = OpNode("xpu_embedding", "__xpu__embedding_with_eltwise_add"); - auto* embedding_out = - VarNode("embedding_out") - ->assert_is_op_output("__xpu__embedding_with_eltwise_add", "Output") - ->assert_is_op_input("layer_norm", "X"); - auto* layer_norm = OpNode("layer_norm", "layer_norm"); - auto* layer_norm_out = - VarNode("layer_norm_out") - ->assert_is_op_output("layer_norm", "Y") - ->assert_is_op_input("__xpu__multi_encoder", "Input"); + + PMNode* embedding_out = nullptr; + PMNode* layer_norm = nullptr; + PMNode* layer_norm_out = nullptr; + + if (pre_ln_) { + embedding_out = VarNode("embedding_out") + ->assert_is_op_output( + "__xpu__embedding_with_eltwise_add", "Output") + ->assert_is_op_input("__xpu__multi_encoder", "Input"); + } else { + embedding_out = VarNode("embedding_out") + ->assert_is_op_output( + "__xpu__embedding_with_eltwise_add", "Output") + ->assert_is_op_input("layer_norm", "X"); + layer_norm = OpNode("layer_norm", "layer_norm"); + layer_norm_out = + VarNode("layer_norm_out") + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("__xpu__multi_encoder", "Input"); + } auto* xpu_encoder = OpNode("xpu_encoder", "__xpu__multi_encoder") ->assert_op_attr("adaptive_seqlen", true); + if (pre_ln_) { + xpu_encoder->assert_op_attr("norm_before", true); + *xpu_embedding >> *embedding_out >> *xpu_encoder; + } else { + *xpu_embedding >> *embedding_out >> *layer_norm >> *layer_norm_out >> + *xpu_encoder; + } - *xpu_embedding >> *embedding_out >> *layer_norm >> *layer_norm_out >> - *xpu_encoder; *mask >> *matmul >> *matmul_out >> *scale >> *scale_out >> *stack >> *stack_out >> *xpu_encoder; } @@ -147,6 +164,7 @@ class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase { private: std::string matmul_type_; + bool pre_ln_; }; } // namespace fusion @@ -155,9 +173,12 @@ class XPUMultiEncoderAdaptiveSeqlenFusePass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override { std::vector matmul_types{"matmul", "matmul_v2"}; + std::vector pre_lns{true, false}; for (auto& matmul_type : matmul_types) { - fusion::XPUMultiEncoderAdaptiveSeqlenFuser fuser(matmul_type); - fuser(graph.get()); + for (auto pre_ln : pre_lns) { + fusion::XPUMultiEncoderAdaptiveSeqlenFuser fuser(matmul_type, pre_ln); + fuser(graph.get()); + } } } }; diff --git a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc index e47e12270ba..01c091ffe71 100644 --- 
a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -101,7 +101,7 @@ class XPUSingleEncoderFuser : public FuseBase { auto* q_reshape2_xshape = VarNode("q_reshape2_xshape") ->assert_is_op_output("reshape2", "XShape") ->AsIntermediate(); - std::string target_op_type = "matmul"; + std::string target_op_type = matmul_type_; if (with_q_scale_) { target_op_type = "scale"; } @@ -121,7 +121,7 @@ class XPUSingleEncoderFuser : public FuseBase { q_scale = OpNode("q_scale", "scale")->AsIntermediate(); q_scale_out = VarNode("q_scale_out") ->assert_is_op_output("scale", "Out") - ->assert_is_op_input("matmul", "X") + ->assert_is_op_input(matmul_type_, "X") ->AsIntermediate(); } @@ -151,16 +151,16 @@ class XPUSingleEncoderFuser : public FuseBase { auto* k_transpose2 = OpNode("k_transpose2", "transpose2")->AsIntermediate(); auto* k_transpose2_out = VarNode("k_transpose2_out") ->assert_is_op_output("transpose2", "Out") - ->assert_is_op_input("matmul", "Y") + ->assert_is_op_input(matmul_type_, "Y") ->AsIntermediate(); auto* k_transpose2_xshape = VarNode("k_transpose2_xshape") ->assert_is_op_output("transpose2", "XShape") ->AsIntermediate(); - auto* qk_matmul = OpNode("qk_matmul", "matmul")->AsIntermediate(); + auto* qk_matmul = OpNode("qk_matmul", matmul_type_)->AsIntermediate(); auto* qk_matmul_out = VarNode("qk_matmul_out") - ->assert_is_op_output("matmul", "Out") + ->assert_is_op_output(matmul_type_, "Out") ->assert_is_op_input("elementwise_add", "X") ->AsIntermediate(); auto* qk_mask = VarNode("qk_mask") @@ -508,67 +508,23 @@ class XPUSingleEncoderFuser : public FuseBase { CHECK_EQ(q_mul_y_shape[0], qkv_mul_y_shape[1]); CHECK_EQ(q_mul_y_shape[1], qkv_mul_y_shape[0]); CHECK_GT(hidden_dim, 0) << "invalid hidden_dim: " << hidden_dim; - // mul input_max, output_max * 6 + matmul x_max,y_max,output_max * 2 - std::vector fc_input_max; - set_quant_info(matched, &fc_input_max); - // mul & matmul input/output max - op_desc.SetAttr>("fc_input_max", fc_input_max); if (q_mul_op_info->HasAttr("enable_int8") && q_mul_op_info->GetAttr("enable_int8")) { op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr>( - "Y0_max", - { - 127 * - matched.at("q_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0], - 127 * - matched.at("k_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0], - 127 * - matched.at("v_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0], - 127 * - matched.at("qkv_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0], - 127 * - matched.at("qkv_mul_3") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0], - 127 * - matched.at("qkv_mul_4") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0], - }); - VLOG(3) << "q/k/v weight_max: " - << 127 * - matched.at("q_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0] - << ", " - << 127 * - matched.at("k_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0] - << ", " - << 127 * - matched.at("v_mul") - ->stmt() - ->op_info() - ->GetAttr>("Y0_scale")[0]; + // mul input_max, output_max * 6 + matmul x_max,y_max,output_max * 2 + std::vector fc_input_max; + std::vector fc_weight_max; + std::vector fc_channels; + set_quant_info(matched, &fc_input_max); // set input/output scale + bool per_channel = false; + set_weight_info( + scope, matched, &fc_weight_max, &per_channel, &fc_channels); + op_desc.SetAttr("per_channel", per_channel); + op_desc.SetAttr>("fc_channels", fc_channels); + // mul & matmul input/output max + op_desc.SetAttr>("fc_input_max", 
fc_input_max); + op_desc.SetAttr>("Y0_max", fc_weight_max); } // extra traits to distill auto* reshape_op_info = matched.at("q_reshape2")->stmt()->op_info(); @@ -675,6 +631,8 @@ class XPUSingleEncoderFuser : public FuseBase { "X0_scale")[0]); // ew_add out_threshold for output quant auto& quant_ew = mul_add_ops[i]; + CHECK(matched.at(quant_ew)->stmt()->op_info()->HasAttr("out_threshold")) + << "act after quant mul has no out_threshold"; quant_info->push_back( matched.at(quant_ew)->stmt()->op_info()->GetAttr( "out_threshold")); @@ -697,7 +655,7 @@ class XPUSingleEncoderFuser : public FuseBase { if (matmul_quant) { auto* qkv_matmul_op_info = matched.at("qkv_matmul")->stmt()->op_info(); - CHECK(qkv_matmul_op_info->HasAttr("X0_scale") == true); + CHECK(qkv_matmul_op_info->HasAttr("X0_scale")); float softmax_out_threshold = matched.at("qk_softmax") ->stmt() ->op_info() @@ -709,7 +667,7 @@ class XPUSingleEncoderFuser : public FuseBase { "X0_scale")[0] * 127), 1e-5); - CHECK(qk_matmul_op_info->HasAttr("X0_scale") == true); + CHECK(qk_matmul_op_info->HasAttr("X0_scale")); quant_info->push_back(max_qkv_output); quant_info->push_back(max_qkv_output); quant_info->push_back(softmax_out_threshold); @@ -720,6 +678,68 @@ class XPUSingleEncoderFuser : public FuseBase { CHECK_EQ(quant_info->size(), 18); } } + bool is_per_tensor(const std::vector& weight_max) { + bool per_tensor = true; + CHECK_GT(weight_max.size(), 0) << "fc channel size: " << weight_max.size(); + auto first = weight_max[0]; + for (int i = 1; i < weight_max.size(); ++i) { + if (std::abs(first - weight_max[i]) > 1e-6) { + per_tensor = false; + break; + } + } + return per_tensor; + } + void set_weight_info(Scope* scope, + const key2nodes_t& matched, + std::vector* weight_info, + bool* per_channel, + std::vector* fc_channels) { + const std::vector quant_mul_ops = { + "q_mul", "k_mul", "v_mul", "qkv_mul", "qkv_mul_3", "qkv_mul_4"}; + bool tmp_pc = false; + for (int i = 0; i < quant_mul_ops.size(); ++i) { + auto& mul_op = quant_mul_ops[i]; + auto op_info = matched.at(mul_op)->stmt()->op_info(); + auto weight_name = op_info->Input("Y").front(); + auto weight_shape = scope->FindMutableTensor(weight_name)->dims(); + CHECK_EQ(weight_shape.size(), 2) << "weight_shape: " << weight_shape; + CHECK(op_info->HasAttr("Y0_scale")) << " quant op has no Y0_scale"; + int weight_scale_size = + op_info->GetAttr>("Y0_scale").size(); + CHECK_EQ(weight_scale_size, weight_shape[1]) + << "weight_scale_size: " << weight_scale_size + << ", weight_shape: " << weight_shape; + CHECK_GT(weight_scale_size, 3) + << mul_op << ", weight_scale_size: " << weight_scale_size; + fc_channels->push_back(weight_scale_size); + if (i == 0) { + if (is_per_tensor(op_info->GetAttr>("Y0_scale"))) { + tmp_pc = false; + VLOG(3) << "mul quant using weight_max per tensor"; + } else { + tmp_pc = true; + VLOG(3) << "mul quant using weight_max per channel"; + } + } + for (int j = 0; j < weight_scale_size; ++j) { + weight_info->push_back( + 127 * op_info->GetAttr>("Y0_scale")[j]); + if (!tmp_pc) break; + } + if (i < 3) { + if (tmp_pc) { + VLOG(3) + << mul_op << " weight max first channel: " + << (*weight_info)[i * weight_scale_size] << ", last channel:" + << (*weight_info)[i * weight_scale_size + weight_scale_size - 1]; + } else { + VLOG(3) << mul_op << " weight max per tensor: " << (*weight_info)[i]; + } + } + } + *per_channel = tmp_pc; + } }; class XPUMultiEncoderFuser { @@ -783,25 +803,60 @@ class XPUMultiEncoderFuser { std::set to_remove; Node* first_encoder = all_encoders[0]; + auto* 
multi_encoder_stmt = first_encoder->stmt(); + auto* first_encoder_op_info = multi_encoder_stmt->op_info(); + bool per_channel = false; + if (first_encoder_op_info->HasAttr("per_channel")) { + per_channel = first_encoder_op_info->GetAttr("per_channel"); + } + const int hidden_dim = first_encoder_op_info->GetAttr("hidden_dim"); std::string in_name, out_name; std::vector arg_names{ "FCWeight", "FCBias", "LNScale", "LNBias"}; std::map> arg_map; std::vector fc_weight_max; std::vector fc_input_max; + + std::vector fc_channels; + int single_encoder_weight_scale_size = 0; + if (per_channel) { + for (auto channel : + first_encoder_op_info->GetAttr>("fc_channels")) { + single_encoder_weight_scale_size += channel; + } + } else { + // non-quant or per tensor quant + single_encoder_weight_scale_size = 6; + } + fc_weight_max.resize(all_encoders.size() * + single_encoder_weight_scale_size); + for (size_t i = 0; i < all_encoders.size(); ++i) { Node* cur_encoder = all_encoders[i]; auto* op_info = cur_encoder->stmt()->op_info(); if (enable_int8) { CHECK(op_info->HasAttr("enable_int8")) << "no enable_int8 attr"; CHECK(op_info->HasAttr("Y0_max")) << "no Y0_max attr"; + CHECK(op_info->HasAttr("per_channel")) << "no per_channel attr"; CHECK(op_info->HasAttr("fc_input_max")) << "no fc_input_max attr"; - for (auto y0 : op_info->GetAttr>("Y0_max")) { - fc_weight_max.push_back(y0); + CHECK_EQ(op_info->GetAttr>("Y0_max").size(), + single_encoder_weight_scale_size) + << "invalid weight scale size: " + << op_info->GetAttr>("Y0_max").size() << ", " + << single_encoder_weight_scale_size; + for (int j = 0; j < single_encoder_weight_scale_size; ++j) { + fc_weight_max[i * single_encoder_weight_scale_size + j] = + op_info->GetAttr>("Y0_max")[j]; } for (auto x0 : op_info->GetAttr>("fc_input_max")) { fc_input_max.push_back(x0); } + if (per_channel) { + for (auto channel : + op_info->GetAttr>("fc_channels")) { + fc_channels.push_back(channel); + } + } } for (auto arg_name : arg_names) { auto real_names = op_info->Input(arg_name); @@ -836,7 +891,6 @@ class XPUMultiEncoderFuser { } GraphSafeRemoveNodes(graph, to_remove); - auto* multi_encoder_stmt = first_encoder->stmt(); cpp::OpDesc op_desc; op_desc.SetType("__xpu__multi_encoder"); op_desc.SetInput("Input", {in_name}); @@ -850,30 +904,43 @@ class XPUMultiEncoderFuser { op_desc.SetAttr("enable_int8", enable_int8); if (enable_int8) { CHECK_EQ(fc_precision_, "int8"); - CHECK_EQ(fc_weight_max.size(), all_encoders.size() * 6); + if (per_channel) { + CHECK_EQ(fc_weight_max.size(), + all_encoders.size() * single_encoder_weight_scale_size) + << " fc_weight_max.size:" << fc_weight_max.size() + << ", all_encoders.size():" << all_encoders.size() + << ", single_encoder_weight_scale_size: " + << single_encoder_weight_scale_size; + CHECK_EQ(fc_channels.size(), all_encoders.size() * 6) + << "fc_channels.size:" << fc_channels.size(); + } else { + CHECK_EQ(fc_weight_max.size(), all_encoders.size() * 6) + << " fc_weight_max.size:" << fc_weight_max.size() + << ", all_encoders.size():" << all_encoders.size(); + CHECK_EQ(fc_channels.size(), 0) << fc_channels.size(); + } CHECK((fc_input_max.size() == all_encoders.size() * 12) || (fc_input_max.size() == all_encoders.size() * 18)) << fc_input_max.size() << ", all_encoders.size:" << all_encoders.size(); - for (int i = 0; i < fc_weight_max.size(); i += 6) { - CHECK_LT(std::abs(fc_weight_max[i] - fc_weight_max[i + 1]), 1e-5) - << " quanted ernie's q/k weight scale should be euqal: " - << fc_weight_max[i] << ", " << fc_weight_max[i + 1]; - 
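For reference, the loop above packs each single encoder's Y0_max values into one flat fc_weight_max vector with a fixed per-encoder stride: 6 entries for per-tensor models, or the sum of fc_channels for per-channel models. A simplified sketch of that layout, assuming every encoder reports the same stride (the struct and function names here are hypothetical):

#include <cstddef>
#include <numeric>
#include <vector>

struct EncoderQuantInfo {
  std::vector<float> weight_max;  // this encoder's "Y0_max" values
  std::vector<int> fc_channels;   // per-channel only: channels of the 6 FCs
};

std::vector<float> FlattenWeightMax(const std::vector<EncoderQuantInfo>& encoders,
                                    bool per_channel) {
  size_t stride = 6;  // per-tensor: one max per FC, six FCs per encoder
  if (per_channel && !encoders.empty()) {
    stride = std::accumulate(encoders[0].fc_channels.begin(),
                             encoders[0].fc_channels.end(), size_t{0});
  }
  std::vector<float> flat(encoders.size() * stride, 0.0f);
  for (size_t i = 0; i < encoders.size(); ++i) {
    // encoder i owns the slice [i * stride, (i + 1) * stride)
    for (size_t j = 0; j < stride; ++j) flat[i * stride + j] = encoders[i].weight_max[j];
  }
  return flat;
}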
CHECK_LT(std::abs(fc_weight_max[i] - fc_weight_max[i + 2]), 1e-5) - << " quanted ernie's q/v weight scale should be euqal: " - << fc_weight_max[i] << ", " << fc_weight_max[i + 2]; + if (!per_channel) { + for (int i = 0; i < fc_weight_max.size(); i += 6) { + CHECK_LT(std::abs(fc_weight_max[i] - fc_weight_max[i + 1]), 1e-5) + << " quanted ernie's q/k weight scale should be euqal: " + << fc_weight_max[i] << ", " << fc_weight_max[i + 1]; + CHECK_LT(std::abs(fc_weight_max[i] - fc_weight_max[i + 2]), 1e-5) + << " quanted ernie's q/v weight scale should be euqal: " + << fc_weight_max[i] << ", " << fc_weight_max[i + 2]; + } } op_desc.SetAttr>("FCInputMax", fc_input_max); - // "FCWeightMax" is also stored as "Input" now - op_desc.SetAttr>("FCWeightMax", fc_weight_max); + VLOG(3) << "fc_input_max size: " << fc_input_max.size(); // only support adaptive_seqlen in int8 quant model CHECK_EQ(adaptive_seqlen_, true); } else { - fc_weight_max.resize(arg_map["FCWeight"].size()); + CHECK_EQ(per_channel, false) << "per_channel in non-quant model"; } - auto* first_encoder_op_info = multi_encoder_stmt->op_info(); - op_desc.SetAttr("hidden_dim", - first_encoder_op_info->GetAttr("hidden_dim")); + op_desc.SetAttr("hidden_dim", hidden_dim); op_desc.SetAttr("head_num", first_encoder_op_info->GetAttr("head_num")); op_desc.SetAttr( @@ -884,17 +951,20 @@ class XPUMultiEncoderFuser { "act_type", first_encoder_op_info->GetAttr("act_type")); op_desc.SetAttr("precision", fc_precision_); op_desc.SetAttr("adaptive_seqlen", adaptive_seqlen_); + op_desc.SetAttr("per_channel", per_channel); + if (per_channel) { + op_desc.SetAttr>("fc_channels", fc_channels); + } // q/k/v fusion bool enable_qkv_fusion = true; - if (norm_before_0) { + if (norm_before_0 && !adaptive_seqlen_) { enable_qkv_fusion = false; } op_desc.SetAttr("enable_qkv_fusion", enable_qkv_fusion); auto* scope = multi_encoder_stmt->op()->scope(); auto& fc_weight_names = arg_map["FCWeight"]; - CHECK_EQ(fc_weight_max.size(), fc_weight_names.size()); for (size_t i = 0; i < fc_weight_names.size(); ++i) { if (enable_qkv_fusion && (i % 6 == 0)) { auto weight_tensor_tmp = scope->FindMutableTensor(fc_weight_names[i]); @@ -968,7 +1038,6 @@ class XPUMultiEncoderFuser { qkv_len * sizeof(float)); } - // TODO(mayang02): we could use attr to store FCWeightMax std::string max_name = "encoder_max_" + fc_weight_names[0]; VLOG(3) << "multi-encoder max weight name: " << max_name; auto* max_filter_node = graph->RetrieveArgument(max_name); @@ -1082,9 +1151,7 @@ class XPUMultiEncoderFuser { int qkv_offset = 0; if (enable_int8) { CHECK_EQ(fc_precision_, "int8"); - CHECK(end <= fc_weight_max->size()); std::unique_ptr weight_qkv_trans(new int8_t[qkv_len]); - float max_f = (*fc_weight_max)[start]; for (int i = 0; i < (end - start); ++i) { // the quanted weight is alreay int8 in quanted model int8_t* weight_host_ptr = weight_tensor_vec[i]->mutable_data(); @@ -1098,17 +1165,9 @@ class XPUMultiEncoderFuser { weight_host_trans.get(), weight_len_vec[i] * sizeof(int8_t)); qkv_offset += weight_len_vec[i]; - if (i > 0) { - max_f = std::max(max_f, (*fc_weight_max)[start + i]); - VLOG(5) << "start+i:" << start + i - << ", weigh_max: " << (*fc_weight_max)[start + i] - << ", max_f:" << max_f; - } } CHECK_EQ(qkv_offset, qkv_len); weight_tensor_vec[0]->Resize({weight_dim1_acc, weight_dims_vec[0][0]}); - (*fc_weight_max)[start] = max_f; - VLOG(3) << "QKV fused FC-" << start << ", weight_max:" << max_f; memcpy(weight_tensor_vec[0]->mutable_data(), weight_qkv_trans.get(), qkv_len * sizeof(int8_t)); @@ -1174,9 
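The q/k/v fusion above packs three already-quantized int8 weight tensors back to back so that a single FC can produce q, k and v in one call; with per-channel scales available there is no longer a need to collapse the three weight maxima into a single max_f. A self-contained sketch of the packing step (function name is illustrative, not the pass's):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<int8_t> FuseQKVWeights(const std::vector<const int8_t*>& weights,
                                   const std::vector<size_t>& lengths) {
  size_t total = 0;
  for (size_t len : lengths) total += len;
  std::vector<int8_t> fused(total);
  size_t offset = 0;
  for (size_t i = 0; i < weights.size(); ++i) {
    // copy q, then k, then v, tracking the running offset
    std::memcpy(fused.data() + offset, weights[i], lengths[i] * sizeof(int8_t));
    offset += lengths[i];
  }
  return fused;  // offset == total by construction
}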
+1233,9 @@ class XPUMultiEncoderFusePass : public ProgramPass { std::vector input_poss{"X", "Y"}; std::vector qkv_ln_2_out_poss{"X", "Y"}; std::vector matmul_types{"matmul", "matmul_v2"}; - std::vector mul_types{"mul", "matmul"}; + std::vector mul_types{"mul", "matmul", "matmul_v2"}; std::vector with_q_scales{true, false}; - std::vector norm_befores{false}; + std::vector norm_befores{true, false}; std::string fc_precision; bool adaptive_seqlen = false; diff --git a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_slice_link_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_slice_link_fuse_pass.cc index 505e912ff5d..2d009df752e 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_slice_link_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_slice_link_fuse_pass.cc @@ -25,14 +25,17 @@ namespace fusion { class XPUMultiEncoderSliceLinkFuser : public FuseBase { public: + explicit XPUMultiEncoderSliceLinkFuser(bool pre_ln = false) + : pre_ln_(pre_ln) {} void BuildPattern() override { auto* xpu_encoder = OpNode("xpu_encoder", "__xpu__multi_encoder"); auto* encoder_out = VarNode("encoder_out") ->assert_is_op_output("__xpu__multi_encoder", "Output") - ->assert_is_op_input("slice", "Input") - ->assert_only_one_output() - ->AsIntermediate(); + ->assert_only_one_output(); + PMNode* layer_norm = nullptr; + PMNode* layer_norm_out = nullptr; + auto* slice = OpNode("slice", "slice") ->assert_op_attr_satisfied>( "axes", @@ -45,13 +48,28 @@ class XPUMultiEncoderSliceLinkFuser : public FuseBase { return attr.size() == 1 && attr[0] == 0; }) ->assert_op_attr_satisfied>( - "ends", - [](const std::vector& attr) { + "ends", [](const std::vector& attr) { return attr.size() == 1 && attr[0] == 1; - }) - ->AsIntermediate(); + }); + if (pre_ln_) { + xpu_encoder->assert_op_attr("norm_before", true); + encoder_out->assert_is_op_input("layer_norm", "X"); + layer_norm = OpNode("layer_norm", "layer_norm"); + layer_norm_out = VarNode("layer_norm_out") + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("slice", "Input"); + } else { + xpu_encoder->assert_op_attr("norm_before", false); + encoder_out->assert_is_op_input("slice", "Input")->AsIntermediate(); + slice->AsIntermediate(); + } auto* slice_out = VarNode("slice_out")->assert_is_op_output("slice", "Out"); - *xpu_encoder >> *encoder_out >> *slice >> *slice_out; + if (pre_ln_) { + *xpu_encoder >> *encoder_out >> *layer_norm >> *layer_norm_out >> + *slice >> *slice_out; + } else { + *xpu_encoder >> *encoder_out >> *slice >> *slice_out; + } } void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { @@ -62,7 +80,9 @@ class XPUMultiEncoderSliceLinkFuser : public FuseBase { auto slice_op_desc = *slice_instruct->op_info(); std::string slice_out_name = matched.at("slice_out")->arg()->name; - encoder_op_desc.SetOutput("Output", {slice_out_name}); + if (!pre_ln_) { + encoder_op_desc.SetOutput("Output", {slice_out_name}); + } auto slice_axes = slice_op_desc.GetAttr>("axes"); encoder_op_desc.SetAttr("slice_axes", slice_axes); if (slice_op_desc.HasAttr("starts")) { @@ -79,8 +99,13 @@ class XPUMultiEncoderSliceLinkFuser : public FuseBase { encoder_op_desc.SetAttr("slice_decrease_axis", slice_decrease_axis); } encoder_instruct->ResetOp(encoder_op_desc, encoder_op->valid_places()); - DirectedLink(matched.at("xpu_encoder"), matched.at("slice_out")); + if (!pre_ln_) { + DirectedLink(matched.at("xpu_encoder"), matched.at("slice_out")); + } } + + private: + bool pre_ln_; }; } // namespace fusion @@ -88,8 
+113,11 @@ class XPUMultiEncoderSliceLinkFuser : public FuseBase { class XPUMultiEncoderSliceLinkFusePass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override { - fusion::XPUMultiEncoderSliceLinkFuser fuser; - fuser(graph.get()); + std::vector pre_lns{true, false}; + for (auto pre_ln : pre_lns) { + fusion::XPUMultiEncoderSliceLinkFuser fuser(pre_ln); + fuser(graph.get()); + } } }; diff --git a/lite/core/optimizer/mir/static_kernel_pick_pass.cc b/lite/core/optimizer/mir/static_kernel_pick_pass.cc index 92695aa9ed7..236173558d0 100644 --- a/lite/core/optimizer/mir/static_kernel_pick_pass.cc +++ b/lite/core/optimizer/mir/static_kernel_pick_pass.cc @@ -193,4 +193,5 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(static_kernel_pick_pass, paddle::lite::mir::StaticKernelPickPass) - .BindTargets({TARGET(kAny)}); + .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}); diff --git a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc index d5032333a57..8bfc85493d1 100644 --- a/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc +++ b/lite/core/optimizer/mir/xpu_memory_optimize_pass.cc @@ -140,27 +140,6 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice(SSAGraph* graph) { var_nodes.insert( var_nodes.end(), op_node->outlinks.begin(), op_node->outlinks.end()); TargetType target_type; - for (auto* var_node : var_nodes) { - CHECK(var_node->IsArg()); - auto& arg = var_node->AsArg(); - if (arg.is_weight || arg.is_persist) continue; - std::string var_name = arg.name; - VLOG(4) << "OP VAR NAME IS " << var_name; - if (var_name.find("_xpu_max") != std::string::npos) continue; - if (invalid_var_names.count(var_name)) continue; - target_type = arg.type->target(); - if (is_host(target_type)) target_type = TARGET(kHost); - - if (!lifecycles[TargetToStr(target_type)].count(var_name)) { - lifecycles[TargetToStr(target_type)].emplace( - var_name, std::make_pair(max_lifecycle_, max_lifecycle_)); - } else { - int cur_life = lifecycles[TargetToStr(target_type)][var_name].second; - lifecycles[TargetToStr(target_type)][var_name].second = - (std::max)(max_lifecycle_, cur_life); - } - } - ++max_lifecycle_; auto inplace_op_node = inplace_op_nodes.find(op_type); if (inplace_op_node != inplace_op_nodes.end()) { @@ -171,6 +150,8 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice(SSAGraph* graph) { if (inplace) { auto in_arg_name = op_info->Input("X")[0]; auto out_arg_name = op_info->Output("Out")[0]; + if (invalid_var_names.count(in_arg_name)) continue; + if (invalid_var_names.count(out_arg_name)) continue; bool reuse = false; int i = 0; for (const auto& reuse_var_names : inpalce_reuse_var_names) { @@ -190,6 +171,28 @@ void XPUMemoryOptimizePass::CollectLifeCycleByDevice(SSAGraph* graph) { } } } + + for (auto* var_node : var_nodes) { + CHECK(var_node->IsArg()); + auto& arg = var_node->AsArg(); + if (arg.is_weight || arg.is_persist) continue; + std::string var_name = arg.name; + VLOG(4) << "OP VAR NAME IS " << var_name; + if (var_name.find("_xpu_max") != std::string::npos) continue; + if (invalid_var_names.count(var_name)) continue; + target_type = arg.type->target(); + if (is_host(target_type)) target_type = TARGET(kHost); + + if (!lifecycles[TargetToStr(target_type)].count(var_name)) { + lifecycles[TargetToStr(target_type)].emplace( + var_name, std::make_pair(max_lifecycle_, max_lifecycle_)); + } else { + int cur_life = lifecycles[TargetToStr(target_type)][var_name].second; + 
lifecycles[TargetToStr(target_type)][var_name].second = + (std::max)(max_lifecycle_, cur_life); + } + } + ++max_lifecycle_; } } diff --git a/lite/core/optimizer/optimizer.cc b/lite/core/optimizer/optimizer.cc index 17a6a62ba47..68ba7e91107 100644 --- a/lite/core/optimizer/optimizer.cc +++ b/lite/core/optimizer/optimizer.cc @@ -14,6 +14,9 @@ #include "lite/core/optimizer/optimizer.h" #include +#ifdef LITE_WITH_XPU +#include "lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.h" +#endif #include "lite/core/optimizer/mir/static_kernel_pick_pass.h" #include "lite/core/optimizer/mir/type_target_cast_pass.h" #include "lite/model_parser/model_parser.h" @@ -49,7 +52,6 @@ std::unique_ptr Optimizer::Run(Program&& program) { graph->SetValidPlaces(valid_places_); graphs_.emplace_back(std::move(graph)); } - SpecifyKernelPickTactic(kernel_pick_factor_); InitTargetTypeTransformPass(); InitControlFlowOpUnusedInputsAndOutputsEliminatePass(); @@ -63,8 +65,12 @@ std::unique_ptr Optimizer::Run(Program&& program) { } void Optimizer::SpecifyKernelPickTactic(core::KernelPickFactor factor) { + std::string static_pick_name = "static_kernel_pick_pass"; +#ifdef LITE_WITH_XPU + static_pick_name = "__xpu__static_kernel_pick_pass"; +#endif auto* pass = mir::PassManager::Global().LookUp( - "static_kernel_pick_pass"); + static_pick_name); CHECK(pass); *pass->mutable_kernel_pick_factors() = factor; @@ -218,6 +224,9 @@ std::unique_ptr RunDefaultOptimizer( "fpga_concat_fuse_pass", "control_flow_op_unused_inputs_and_outputs_eliminate_pass", "static_kernel_pick_pass", // pick original kernel from graph +#ifdef LITE_WITH_XPU + "__xpu__static_kernel_pick_pass", // xpu pick original kernel from graph +#endif "remove_tf_redundant_ops_pass", "variable_place_inference_pass", // inference arg/var's diff --git a/lite/kernels/host/tile_compute.cc b/lite/kernels/host/tile_compute.cc index b1a61aebc41..11d4d013cc6 100644 --- a/lite/kernels/host/tile_compute.cc +++ b/lite/kernels/host/tile_compute.cc @@ -85,9 +85,10 @@ void TileCompute::Run() { int dst_stride = in_stride[i + 1] * right; for (int m = 0; m < num; m++) { for (int j = 0; j < bcast_dims[i]; j++) { - std::memcpy(tmp_dst + j * dst_stride / bcast_dims[i] + m * dst_stride, - tmp_src + m * dst_stride / bcast_dims[i], - dst_stride / bcast_dims[i] * sizeof(T)); + std::memcpy( + tmp_dst + j * (dst_stride / bcast_dims[i]) + m * dst_stride, + tmp_src + m * (dst_stride / bcast_dims[i]), + dst_stride / bcast_dims[i] * sizeof(T)); } } tmp_src_tensor.CopyDataFrom(tmp_dst_tensor); diff --git a/lite/kernels/x86/slice_compute.cc b/lite/kernels/x86/slice_compute.cc index 88194d5c8c0..3bd26fb4511 100644 --- a/lite/kernels/x86/slice_compute.cc +++ b/lite/kernels/x86/slice_compute.cc @@ -33,6 +33,25 @@ REGISTER_LITE_KERNEL(slice, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))}) .Finalize(); +REGISTER_LITE_KERNEL(slice, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SliceCompute, + array_def) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kX86), PRECISION(kFloat))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))}) + .Finalize(); + REGISTER_LITE_KERNEL(slice, kX86, 
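Regarding the reordered loop in xpu_memory_optimize_pass.cc above: the pass records, for every non-persistent variable, the first and last step at which any op touches it, and buffers whose [first, last] intervals do not overlap can later share memory. A minimal sketch of that bookkeeping with simplified types (the real pass additionally skips weights, *_xpu_max tensors and invalid names):

#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include <vector>

using Lifetime = std::pair<int, int>;  // (first step, last step)

std::map<std::string, Lifetime> CollectLifetimes(
    const std::vector<std::vector<std::string>>& vars_per_op) {
  std::map<std::string, Lifetime> lifetimes;
  for (int step = 0; step < static_cast<int>(vars_per_op.size()); ++step) {
    for (const auto& name : vars_per_op[step]) {
      auto it = lifetimes.find(name);
      if (it == lifetimes.end()) {
        lifetimes.emplace(name, Lifetime{step, step});           // first sighting
      } else {
        it->second.second = std::max(it->second.second, step);   // extend last use
      }
    }
  }
  return lifetimes;
}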
kFloat, @@ -52,6 +71,25 @@ REGISTER_LITE_KERNEL(slice, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) .Finalize(); +REGISTER_LITE_KERNEL(slice, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SliceCompute, + array_int32) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kX86), PRECISION(kInt32))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) + .Finalize(); + REGISTER_LITE_KERNEL(slice, kX86, kFloat, @@ -70,3 +108,22 @@ REGISTER_LITE_KERNEL(slice, {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) .Finalize(); + +REGISTER_LITE_KERNEL(slice, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SliceCompute, + array_int64) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kX86), PRECISION(kInt64))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/x86/slice_compute.h b/lite/kernels/x86/slice_compute.h index 1de0c368c68..4d448f47049 100644 --- a/lite/kernels/x86/slice_compute.h +++ b/lite/kernels/x86/slice_compute.h @@ -28,6 +28,47 @@ namespace lite { namespace kernels { namespace x86 { +void DealTensorArray(const std::vector* XTensorList, + std::vector* OutTensorList, + lite::Tensor* Out, + const std::vector& starts, + const std::vector& ends, + bool out_is_array) { + auto in_array = XTensorList; + // If the input is LoDTensorArray, the rank of input is 1. + int64_t in_size = in_array->size(); + int64_t start = starts[0] < 0 ? (starts[0] + in_size) : starts[0]; + int64_t end = ends[0] < 0 ? 
(ends[0] + in_size) : ends[0]; + + start = std::max(start, static_cast(0)); + end = std::max(end, static_cast(0)); + end = std::min(end, in_size); + + CHECK_GT(end, start) << "end should greater than start"; + int64_t out_size = end - start; + + if (out_is_array) { + auto out_array = OutTensorList; + out_array->resize(out_size); + for (int i = 0; i < out_size; ++i) { + auto* out_tensor = &out_array->at(i); + auto in_tensor = in_array->at(i + start); + out_tensor->set_lod(in_tensor.lod()); + if (in_tensor.memory_size() > 0) { + out_tensor->CopyDataFrom(in_tensor); + } else { + VLOG(4) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << i << "]."; + } + } + } else { + auto out_tensor = Out; + auto in_tensor = in_array->at(start); + out_tensor->CopyDataFrom(in_tensor); + } +} + inline std::vector GetIntDataFromTensorList( const std::vector& list_tensor) { std::vector vec_data; @@ -219,6 +260,8 @@ void slice_compute(const lite::Tensor* in, template void slice_compute_(const lite::Tensor* Input, lite::Tensor* Out, + const std::vector* XTensorList, + std::vector* OutTensorList, std::vector axes, std::vector starts, std::vector ends, @@ -228,6 +271,38 @@ void slice_compute_(const lite::Tensor* Input, std::vector StartsTensorList, std::vector EndsTensorList, std::vector infer_flags) { + if (Input == nullptr && XTensorList != nullptr) { + bool need_infer = false; + if (StartsTensor || EndsTensor) { + need_infer = true; + } + if (StartsTensorList.size() > 0 || EndsTensorList.size() > 0) { + need_infer = true; + } + if (need_infer) { + if (StartsTensor) { + starts = GetIntDataFromTensor(StartsTensor); + } else if (StartsTensorList.size() > 0) { + starts = GetIntDataFromTensorList(StartsTensorList); + } + CHECK_EQ(starts.size(), axes.size()) + << "The size of starts must be equal to the size of axes."; + if (EndsTensor) { + ends = GetIntDataFromTensor(EndsTensor); + } else if (EndsTensorList.size() > 0) { + ends = GetIntDataFromTensorList(EndsTensorList); + } + CHECK_EQ(ends.size(), axes.size()) + << "The size of starts must be equal to the size of axes."; + } + DealTensorArray(XTensorList, + OutTensorList, + Out, + starts, + ends, + (Out == nullptr && OutTensorList != nullptr)); + return; + } int rank = Input->dims().size(); switch (rank) { case 1: @@ -320,6 +395,8 @@ class SliceCompute : public KernelLite { auto& param = *param_.get_mutable(); slice_compute_(param.X, param.Out, + param.XTensorList, + param.OutTensorList, param.axes, param.starts, param.ends, diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt index 29266862dea..1efed16ac05 100644 --- a/lite/kernels/xpu/CMakeLists.txt +++ b/lite/kernels/xpu/CMakeLists.txt @@ -30,6 +30,7 @@ add_kernel(gru_compute_xpu XPU basic SRCS gru_compute.cc) add_kernel(gru_unit_compute_xpu XPU basic SRCS gru_unit_compute.cc) add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc) add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc) +add_kernel(tile_compute_xpu XPU basic SRCS tile_compute.cc) add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc) add_kernel(sequence_topk_avg_pooling_compute_xpu XPU basic SRCS sequence_topk_avg_pooling_compute.cc) add_kernel(concat_compute_xpu XPU basic SRCS concat_compute.cc) diff --git a/lite/kernels/xpu/__xpu__bigru_compute.cc b/lite/kernels/xpu/__xpu__bigru_compute.cc index 0fb9e3c3fe2..780f904e525 100644 --- a/lite/kernels/xpu/__xpu__bigru_compute.cc +++ b/lite/kernels/xpu/__xpu__bigru_compute.cc @@ -55,13 +55,14 @@ 
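The DealTensorArray helper above slices a LoDTensorArray along its single axis; the only subtle part is normalizing negative starts/ends and clamping them to the array size. A standalone sketch of that normalization (the function name is illustrative): with start = -2, end = 5 and an array of size 5 it yields the range [3, 5).

#include <algorithm>
#include <cstdint>

void NormalizeSliceRange(int64_t size, int64_t* start, int64_t* end) {
  if (*start < 0) *start += size;  // negative indices count from the back
  if (*end < 0) *end += size;
  *start = std::max<int64_t>(*start, 0);
  *end = std::min(std::max<int64_t>(*end, 0), size);  // clamp to [0, size]
}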
void XPUBiGRUCompute::PrepareBiasForRun(bool forward) { void XPUBiGRUCompute::PrepareMulWeightForRun(bool forward) { auto& mul_quant_weight_ = forward ? fw_mul_quant_weight_ : bw_mul_quant_weight_; + auto& ctx = this->ctx_->template As(); auto& param = this->template Param(); auto* weight = forward ? param.fw_mul_w : param.bw_mul_w; auto weight_ptr = weight->data(); auto weight_dims = weight->dims(); mul_quant_weight_ = TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - weight_ptr, weight_dims, true); + weight_ptr, weight_dims, true, ctx.GetRawContext()->max_ptr_size()); } void XPUBiGRUCompute::PrepareGRUWeightForRun(bool forward) { diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.cc b/lite/kernels/xpu/__xpu__conv2d_compute.cc index cad9a4fd691..8c267843f8b 100644 --- a/lite/kernels/xpu/__xpu__conv2d_compute.cc +++ b/lite/kernels/xpu/__xpu__conv2d_compute.cc @@ -22,36 +22,12 @@ namespace lite { namespace kernels { namespace xpu { -template -bool QuantFilter(const float* filter_on_host, - T* quant_res, - float max, - int64_t len) { - return false; -} - -template <> -bool QuantFilter(const float* filter_on_host, - int16_t* quant_res, - float max, - int64_t len) { - paddle::lite::xpu::math::ConvertFP32ToInt16( - filter_on_host, quant_res, max, len); - return true; -} - -template <> -bool QuantFilter(const float* filter_on_host, - int8_t* quant_res, - float max, - int64_t len) { - paddle::lite::xpu::math::ConvertFP32ToInt8( - filter_on_host, quant_res, max, len); - return true; -} - -template -void XPUConv2dCompute::PrepareForRun() { +template +void XPUConv2dCompute::PrepareForRun() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int max_ptr_size = ctx.GetRawContext()->max_ptr_size(); @@ -60,12 +36,16 @@ void XPUConv2dCompute::PrepareForRun() { auto filter_dims = param.filter->dims(); xpu_quant_filter_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - filter_ptr, filter_dims, false); + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + filter_ptr, filter_dims, false, max_ptr_size); } -template -void XPUConv2dCompute::Run() { +template +void XPUConv2dCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -86,8 +66,8 @@ void XPUConv2dCompute::Run() { param.output_max->template mutable_data(TARGET(kXPU)); const auto* bias = param.has_bias ? param.bias->template data() : nullptr; - const float* branch = - param.has_branch ? param.branch->template data() : nullptr; + const DY* branch = + param.has_branch ? param.branch->template data() : nullptr; const float* input_max = param.input_max ? 
param.input_max->template data() : nullptr; xdnn::Activation_t act((xdnn::Activation_t::act_enum)act_type); @@ -101,15 +81,15 @@ void XPUConv2dCompute::Run() { CHECK_EQ(act_type, 0); if (branch_broadcast_guard_.get() == nullptr) { branch_broadcast_guard_ = TargetWrapperXPU::MallocScratchPad( - param.output->numel() * sizeof(float)); + param.output->numel() * sizeof(DY)); } else { - branch_broadcast_guard_->Reserve(param.output->numel() * sizeof(float)); + branch_broadcast_guard_->Reserve(param.output->numel() * sizeof(DY)); } - int r = xdnn::conv2d_fusion( + int r = xdnn::conv2d_fusion( ctx.GetRawContext(), - param.input->template data(), - reinterpret_cast(xpu_quant_filter_.data_ptr_), - reinterpret_cast(branch_broadcast_guard_->addr_), + param.input->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), + reinterpret_cast(branch_broadcast_guard_->addr_), batch, img_c, img_h, @@ -139,21 +119,21 @@ void XPUConv2dCompute::Run() { if (branch_shape > conv_out_shape) { param.output->Resize(lite::DDim(branch_shape)); } - float* output = param.output->template mutable_data(TARGET(kXPU)); - r = xdnn::broadcast_add( + DY* output = param.output->template mutable_data(TARGET(kXPU)); + r = xdnn::broadcast_add( ctx.GetRawContext(), - reinterpret_cast(branch_broadcast_guard_->addr_), + reinterpret_cast(branch_broadcast_guard_->addr_), branch, output, xshape, yshape); CHECK_EQ(r, 0); } else { - float* output = param.output->template mutable_data(TARGET(kXPU)); - int r = xdnn::conv2d_fusion( + DY* output = param.output->template mutable_data(TARGET(kXPU)); + int r = xdnn::conv2d_fusion( ctx.GetRawContext(), - param.input->template data(), - reinterpret_cast(xpu_quant_filter_.data_ptr_), + param.input->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), output, batch, img_c, @@ -182,11 +162,27 @@ void XPUConv2dCompute::Run() { } // namespace paddle namespace xpu = paddle::lite::kernels::xpu; -using XPUConv2dFp32 = xpu::XPUConv2dCompute; -using XPUConv2dInt8 = xpu::XPUConv2dCompute; +using XPUConv2dFP32 = + xpu::XPUConv2dCompute; + +using XPUConv2d_FP16_FP32_FP32 = + xpu::XPUConv2dCompute; + +using XPUConv2dFp16 = + xpu::XPUConv2dCompute; + +using XPUConv2d_FP16_FP16_FP32 = + xpu::XPUConv2dCompute; -REGISTER_LITE_KERNEL(__xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2dFp32, def) +using XPUConv2d_FP16_FP32_FP16 = + xpu::XPUConv2dCompute; + +using XPUConv2dInt8_FP32_FP32 = + xpu::XPUConv2dCompute; + +REGISTER_LITE_KERNEL( + __xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2d_FP16_FP32_FP32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) @@ -196,7 +192,71 @@ REGISTER_LITE_KERNEL(__xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2dFp32, def) .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); -REGISTER_LITE_KERNEL(__xpu__conv2d, kXPU, kInt8, kNCHW, XPUConv2dInt8, def) +REGISTER_LITE_KERNEL( + __xpu__conv2d, kXPU, kFloat, kNCHW, XPUConv2dFP32, XPU_Real_kFloat) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Branch", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + 
__xpu__conv2d, kXPU, kFP16, kNCHW, XPUConv2dFp16, XPU_FP16_FP16__FP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Branch", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kFP16, + kNCHW, + XPUConv2d_FP16_FP16_FP32, + XPU_FP16_FP16__FP32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Branch", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kFP16, + kNCHW, + XPUConv2d_FP16_FP32_FP16, + XPU_FP16_FP32__FP16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Branch", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__conv2d, + kXPU, + kInt8, + kNCHW, + XPUConv2dInt8_FP32_FP32, + XPU_Int8_FP32_FP32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.h b/lite/kernels/xpu/__xpu__conv2d_compute.h index 69a9aec69c8..c3c31d94743 100644 --- a/lite/kernels/xpu/__xpu__conv2d_compute.h +++ b/lite/kernels/xpu/__xpu__conv2d_compute.h @@ -21,8 +21,11 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -template +template class XPUConv2dCompute : public KernelLite { public: using param_t = operators::XPUBlockFuseParam; diff --git a/lite/kernels/xpu/__xpu__fc_compute.cc b/lite/kernels/xpu/__xpu__fc_compute.cc index 4cd429a836e..e3e465da5d2 100644 --- a/lite/kernels/xpu/__xpu__fc_compute.cc +++ b/lite/kernels/xpu/__xpu__fc_compute.cc @@ -24,75 +24,102 @@ namespace lite { namespace kernels { namespace xpu { -void XPUFcCompute::PrepareForRun() { +template +void XPUFcCompute::PrepareForRun() { auto& ctx = this->ctx_->template As(); auto& param = this->template Param(); - auto w_ptr = param.w->data(); + auto w_ptr = param.w->template data(); auto weight_dims = param.w->dims(); - bool quant_int8 = false; - if (param.quant_w_max > 0.f) { - quant_int8 = true; - } + bool w_trans = param.transpose_w; + enable_int8_ = param.enable_int8; + per_channel_ = param.per_channel; // max int max_ptr_size = ctx.GetRawContext()->max_ptr_size(); - input_max_guard_ = - TargetWrapperXPU::MallocScratchPad(max_ptr_size * sizeof(float)); - if 
(quant_int8) { // for paddle slim int8 quant + if (enable_int8_) { // for paddle slim int8 quant + input_max_guard_ = + TargetWrapperXPU::MallocScratchPad(max_ptr_size * sizeof(float)); xpu_quant_weight_ = TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - reinterpret_cast(w_ptr), weight_dims, true); - std::vector cpu_w_max(max_ptr_size, param.quant_w_max); + reinterpret_cast(w_ptr), + weight_dims, + w_trans, + per_channel_ ? param.weight_max.size() : max_ptr_size); CHECK(xpu_quant_weight_.max_ptr_ != nullptr) << "slim int8 quant xpu_quant_weight_max_ptr should't be null"; - lite::TargetWrapperXPU::MemcpySync(xpu_quant_weight_.max_ptr_, - cpu_w_max.data(), - sizeof(float) * max_ptr_size, - IoDirection::HtoD); std::vector cpu_input_max(max_ptr_size, param.quant_input_max); lite::TargetWrapperXPU::MemcpySync(input_max_guard_->addr_, cpu_input_max.data(), sizeof(float) * max_ptr_size, IoDirection::HtoD); + if (per_channel_) { + lite::TargetWrapperXPU::MemcpySync( + xpu_quant_weight_.max_ptr_, + param.weight_max.data(), + sizeof(float) * param.weight_max.size(), + IoDirection::HtoD); + } else { + VLOG(3) << "set weight max :" << max_ptr_size + << ", param.weight_max[0]:" << param.weight_max[0]; + std::vector cpu_w_max(max_ptr_size, param.weight_max[0]); + lite::TargetWrapperXPU::MemcpySync(xpu_quant_weight_.max_ptr_, + cpu_w_max.data(), + sizeof(float) * max_ptr_size, + IoDirection::HtoD); + } return; - } - - if (param.precision == "int31") { - xpu_quant_weight_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - w_ptr, weight_dims, true); - CHECK(xpu_quant_weight_.max_ptr_ == nullptr) - << "int31 weight max should be null"; - } else if (param.precision == "int16") { - xpu_quant_weight_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - w_ptr, weight_dims, true); - } else if (param.precision == "int8") { + } else { xpu_quant_weight_ = - TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - w_ptr, weight_dims, true); + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + w_ptr, weight_dims, w_trans, max_ptr_size); + if (std::is_same::value) { + VLOG(6) + << "If fc compute precision is int31,must check weight max should " + "be null "; + CHECK(xpu_quant_weight_.max_ptr_ == nullptr) + << "int31 weight max should be null"; + } } } - -void XPUFcCompute::Run() { +template +void XPUFcCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); auto input_dims = param.input->dims(); + if (param.in_num_col_dims == -1) { + param.in_num_col_dims += input_dims.size(); + } auto in_mat_dims = input_dims.Flatten2D(param.in_num_col_dims); int m = in_mat_dims[0]; int k = in_mat_dims[1]; int n = param.w->dims()[1]; - bool quant_int8 = param.quant_w_max > 0.f; int max_ptr_size = ctx.GetRawContext()->max_ptr_size(); param.output_max->Resize({max_ptr_size}); - float* output_max = quant_int8 - ? nullptr - : param.output_max->mutable_data(TARGET(kXPU)); - const auto* bias = param.has_bias ? param.bias->data() : nullptr; + bool x_trans = param.transpose_x; + bool w_trans = param.transpose_w; + int ldx = (x_trans ? m : k); + int ldw = (w_trans ? k : n); + int ldy = n; + + float* output_max = + enable_int8_ + ? nullptr + : param.output_max->template mutable_data(TARGET(kXPU)); + const auto* bias = + param.has_bias ? param.bias->template data() : nullptr; const float* input_max = - quant_int8 ? reinterpret_cast(input_max_guard_->addr_) - : (param.input_max ? param.input_max->data() : nullptr); + enable_int8_ ? 
reinterpret_cast(input_max_guard_->addr_) + : (param.input_max ? param.input_max->template data() + : nullptr); xdnn::Activation_t act((xdnn::Activation_t::act_enum)param.act_type); if (param.act_type == 5) { act.leaky_alpha = param.act_param; @@ -101,82 +128,53 @@ void XPUFcCompute::Run() { act.hard_sigmoid_slope = param.act_param; } // TODO(weihaoji): remove fc_int31 and fc_int16 after xpu fc wrapper refactor - if (param.precision == "int31") { - int r = xdnn::fc_fusion( - ctx.GetRawContext(), // ctx - param.input->data(), // x - reinterpret_cast(xpu_quant_weight_.data_ptr_), // w - param.output->mutable_data(TARGET(kXPU)), // y - m, // m - n, // n - k, // k - false, // x_trans - true, // w_trans - input_max, // x_maxptr - reinterpret_cast(xpu_quant_weight_.max_ptr_), // w_maxptr - output_max, // y_maxptr - k, // ldx - k, // ldw - n, // ldy - 1.0f, // alpha - 0.0f, // beta - bias, // bias + int r = 0; + if (per_channel_) { + r = xdnn::fc_fusion_pc( + ctx.GetRawContext(), // ctx + param.input->template data(), // x + reinterpret_cast(xpu_quant_weight_.data_ptr_), // w + param.output->template mutable_data(TARGET(kXPU)), // y + m, // m + n, // n + k, // k + x_trans, // x_trans + w_trans, // w_trans + input_max, // x_maxptr + nullptr, // w_maxptr + output_max, // y_maxptr + ldx, // ldx + ldw, // ldw + ldy, // ldy + 1.0f, // alpha + 0.0f, // beta + bias, // bias + reinterpret_cast( + xpu_quant_weight_.max_ptr_), // per channel weight_max act); - CHECK_EQ(r, 0); - } else if (param.precision == "int16") { - int r = 0; - r = xdnn::fc_fusion( - ctx.GetRawContext(), // ctx - param.input->data(), // x - reinterpret_cast(xpu_quant_weight_.data_ptr_), // w - param.output->mutable_data(TARGET(kXPU)), // y - m, // m - n, // n - k, // k - false, // x_trans - true, // w_trans + } else { + r = xdnn::fc_fusion( + ctx.GetRawContext(), // ctx + param.input->template data(), // x + reinterpret_cast(xpu_quant_weight_.data_ptr_), // w + param.output->template mutable_data(TARGET(kXPU)), // y + m, // m + n, // n + k, // k + x_trans, // x_trans + w_trans, // w_trans input_max, // x_maxptr reinterpret_cast(xpu_quant_weight_.max_ptr_), // w_maxptr output_max, // y_maxptr - k, // ldx - k, // ldw - n, // ldy + ldx, // ldx + ldw, // ldw + ldy, // ldy 1.0f, // alpha 0.0f, // beta bias, // bias - act); // act - - CHECK_EQ(r, 0); - } else if (param.precision == "int8") { - bool x_trans = false; - bool w_trans = true; - int ldx = (x_trans ? m : k); - int ldw = (w_trans ? 
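The per-channel branch above hands the FC kernel one maximum per output channel instead of a single scalar. For intuition, here is a minimal sketch of per-channel int8 weight quantization, assuming a row-major [n, k] weight with one row per output channel; this is not the XPU quantizer itself, only the underlying arithmetic:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

void QuantizeWeightPerChannel(const float* w, int n, int k,
                              std::vector<int8_t>* q,
                              std::vector<float>* channel_max) {
  q->resize(static_cast<size_t>(n) * k);
  channel_max->resize(n);
  for (int row = 0; row < n; ++row) {
    float m = 0.0f;  // max-abs of this output channel
    for (int col = 0; col < k; ++col) m = std::max(m, std::abs(w[row * k + col]));
    (*channel_max)[row] = m;
    const float scale = (m > 0.0f) ? 127.0f / m : 0.0f;
    for (int col = 0; col < k; ++col) {
      (*q)[static_cast<size_t>(row) * k + col] =
          static_cast<int8_t>(std::lround(w[row * k + col] * scale));
    }
  }
}

Per-channel scales recover more precision when weight magnitudes vary strongly across output channels, which is why the fuse pass keeps the full Y0_scale vector instead of folding it into one number.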
k : n); - int ldy = n; - int r = xdnn::fc_fusion( - ctx.GetRawContext(), /* context */ - param.input->data(), /* x */ - reinterpret_cast(xpu_quant_weight_.data_ptr_), - param.output->mutable_data(TARGET(kXPU)), /* y */ - m, /* m */ - n, /* n */ - k, /* k */ - x_trans, /* x_trans */ - w_trans, /* w_trans */ - input_max, /* x_max */ - reinterpret_cast(xpu_quant_weight_.max_ptr_), /* w_max */ - output_max, /* y_max */ - ldx, /* ldx */ - ldw, /* ldw */ - ldy, /* ldy */ - 1.0f, /* alpha */ - 0.0f, /* beta */ - bias, /* bias */ - act); /* act_type */ - CHECK_EQ(r, 0); - } else { - LOG(FATAL) << "Unsupport XPUFC Precision: " << param.precision; + act); } + CHECK_EQ(r, 0); } } // namespace xpu @@ -184,12 +182,28 @@ void XPUFcCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(__xpu__fc, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::XPUFcCompute, - def) +namespace xpu = paddle::lite::kernels::xpu; + +using XPUFC_FP32 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP32_FP32 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP16_FP16 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP32_FP16 = + xpu::XPUFcCompute; + +using XPUFC_FP16_FP16_FP32 = + xpu::XPUFcCompute; + +using XPUFC_Int8_FP32_FP32 = + xpu::XPUFcCompute; + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFloat, kNCHW, XPUFC_FP32, XPU_Real_kFloat) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) @@ -197,3 +211,58 @@ REGISTER_LITE_KERNEL(__xpu__fc, .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__fc, kXPU, kFloat, kNCHW, XPUFC_FP16_FP32_FP32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFP16, kNCHW, XPUFC_FP16_FP16_FP16, XPUFC_FP16_FP16_FP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFP16, kNCHW, XPUFC_FP16_FP32_FP16, XPUFC_FP16_FP32_FP16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFP16, kNCHW, XPUFC_FP16_FP16_FP32, XPUFC_FP16_FP16_FP32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", 
{LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + __xpu__fc, kXPU, kFloat, kNCHW, XPUFC_Int8_FP32_FP32, XPU_Int8_FP32_FP32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("InputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("OutputMax", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__fc_compute.h b/lite/kernels/xpu/__xpu__fc_compute.h index 687f8d5e9c1..ffb17c8abe2 100644 --- a/lite/kernels/xpu/__xpu__fc_compute.h +++ b/lite/kernels/xpu/__xpu__fc_compute.h @@ -20,8 +20,12 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -class XPUFcCompute : public KernelLite { +template +class XPUFcCompute : public KernelLite { public: using param_t = operators::XPUFcParam; @@ -32,10 +36,10 @@ class XPUFcCompute : public KernelLite { virtual ~XPUFcCompute() = default; private: - // TODO(weihaoji): remove cpu w_max after xpu fc wrapper refactor - float w_max; XPUScratchPadGuard input_max_guard_; XPUQuantData xpu_quant_weight_; + bool per_channel_; + bool enable_int8_; }; } // namespace xpu diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.cc b/lite/kernels/xpu/__xpu__multi_encoder_compute.cc index 82444bbec2f..e96f7121a11 100644 --- a/lite/kernels/xpu/__xpu__multi_encoder_compute.cc +++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.cc @@ -91,7 +91,51 @@ void XPUMultiEncoderCompute::prepare_quant_max( } return; } - +void XPUMultiEncoderCompute::prepare_weight_max( + int n_layers, + bool per_channel, + const lite::Tensor* weight_max, + int max_ptr_len, + const std::vector& fc_channels, + std::vector& max_xpu_ptrs) { + // prepare weight_max + int max_ext_times = max_ptr_len; + int total_channels = 0; + if (per_channel) { + max_ext_times = 1; + CHECK_EQ(fc_channels.size(), n_layers * 6) << fc_channels.size(); + for (auto channel : fc_channels) { + total_channels += channel; + } + CHECK_EQ(weight_max->numel(), total_channels) + << "weight_max->numel: " << weight_max->numel() + << ", total_channels: " << total_channels; + } + int len = weight_max->numel() * max_ext_times * sizeof(float); + weight_max_guard_ = TargetWrapperXPU::MallocScratchPad(len); + float* weight_max_ptr = reinterpret_cast(weight_max_guard_->addr_); + if (per_channel) { + lite::TargetWrapperXPU::MemcpySync( + weight_max_ptr, weight_max->data(), len, IoDirection::HtoD); + float* cur_ptr = weight_max_ptr; + for (int i = 0; i < fc_channels.size(); ++i) { + max_xpu_ptrs.push_back(cur_ptr); + cur_ptr += fc_channels[i]; + } + CHECK_EQ(cur_ptr - weight_max_ptr, total_channels) + << weight_max_ptr << ", cur_ptr:" << cur_ptr; + } else { + for (int i = 0; i < weight_max->numel(); i++) { + float* cur_weight_max_ptr = weight_max_ptr + i * max_ptr_len; + std::vector cpu_max(max_ptr_len, weight_max->data()[i]); + lite::TargetWrapperXPU::MemcpySync(cur_weight_max_ptr, + cpu_max.data(), + sizeof(float) * max_ptr_len, + IoDirection::HtoD); + max_xpu_ptrs.push_back(cur_weight_max_ptr); + } + } +} void XPUMultiEncoderCompute::PrepareForRun() { auto& ctx = this->ctx_->template As(); 
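prepare_weight_max above stages the weight maxima on the host before copying them to the device: per-tensor maxima are each replicated max_ptr_len times (the XPU max-pointer granularity), while per-channel maxima are stored contiguously and each FC's pointer is derived from the running sum of fc_channels. A simplified host-side sketch that returns offsets instead of device pointers (all names here are hypothetical):

#include <cstddef>
#include <vector>

std::vector<size_t> LayoutWeightMax(const std::vector<float>& maxima,
                                    const std::vector<int>& fc_channels,
                                    bool per_channel, int max_ptr_len,
                                    std::vector<float>* host_buf) {
  std::vector<size_t> offsets;  // offset of each FC's max block in host_buf
  host_buf->clear();
  if (per_channel) {
    *host_buf = maxima;  // one value per channel, already back to back
    size_t off = 0;
    for (int c : fc_channels) {
      offsets.push_back(off);
      off += static_cast<size_t>(c);
    }
  } else {
    for (size_t i = 0; i < maxima.size(); ++i) {
      offsets.push_back(host_buf->size());
      host_buf->insert(host_buf->end(), static_cast<size_t>(max_ptr_len),
                       maxima[i]);  // replicate the scalar max
    }
  }
  return offsets;
}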
auto& param = this->template Param(); @@ -115,23 +159,15 @@ void XPUMultiEncoderCompute::PrepareForRun() { } else if (param.precision == "int31") { arg_fc_weight_fp32_ = prepare_weight(param.fc_weight); } + const int n_layers = param.fc_weight.size() / 6; const int XPU_QUANT_SCALE_NUM = ctx.GetRawContext()->max_ptr_size(); - // prepare weight_max - weight_max_guard_ = TargetWrapperXPU::MallocScratchPad( - param.fc_weight_max->numel() * XPU_QUANT_SCALE_NUM * sizeof(float)); - float* weight_max_ptr = reinterpret_cast(weight_max_guard_->addr_); - for (int i = 0; i < param.fc_weight_max->numel(); i++) { - float* cur_weight_max_ptr = weight_max_ptr + i * XPU_QUANT_SCALE_NUM; - std::vector cpu_max(XPU_QUANT_SCALE_NUM, - param.fc_weight_max->data()[i]); - lite::TargetWrapperXPU::MemcpySync(cur_weight_max_ptr, - cpu_max.data(), - sizeof(float) * XPU_QUANT_SCALE_NUM, - IoDirection::HtoD); - fc_weight_max_.push_back(cur_weight_max_ptr); - } + prepare_weight_max(n_layers, + param.per_channel, + param.weight_max, + XPU_QUANT_SCALE_NUM, + param.fc_channels, + fc_weight_max_); // prepare quant max, mul&matmul input/output max - const int n_layers = param.fc_weight.size() / 6; prepare_quant_max( param.input_max, n_layers, XPU_QUANT_SCALE_NUM, fc_input_max_); // prepare act_type @@ -170,7 +206,9 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) { slice_idx, true /* qkv fusion */, max_pad_seqlen, - param.hidden_dim); + param.hidden_dim, + param.norm_before, /*is_pre_norm*/ + param.per_channel); if (std::is_same::value) { CHECK_GT(fc_input_max_.size(), 0); } @@ -202,7 +240,8 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) { qkv_act, slice_idx, true, - param.hidden_dim); + param.hidden_dim, + param.norm_before); int r = xdnn::transformer_encoder( ctx.GetRawContext(), in, diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.h b/lite/kernels/xpu/__xpu__multi_encoder_compute.h index a32a413ff74..08ce7645eb8 100644 --- a/lite/kernels/xpu/__xpu__multi_encoder_compute.h +++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.h @@ -56,6 +56,12 @@ class XPUMultiEncoderCompute int n_layers, int max_ptr_len, std::vector &max_xpu_ptrs); + void prepare_weight_max(int n_layers, + bool per_channel, + const lite::Tensor *weight_max, + int max_ptr_len, + const std::vector &fc_channels, + std::vector &max_xpu_ptrs); template void run_encoder(const T *in, T *out); }; diff --git a/lite/kernels/xpu/__xpu__squeeze_excitation_compute.cc b/lite/kernels/xpu/__xpu__squeeze_excitation_compute.cc index 10a789b8428..3c51c269970 100644 --- a/lite/kernels/xpu/__xpu__squeeze_excitation_compute.cc +++ b/lite/kernels/xpu/__xpu__squeeze_excitation_compute.cc @@ -25,6 +25,7 @@ namespace xpu { void XPUSqueezeExcitationCompute::PrepareForRun() { auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); auto weight_ptr = param.filter->data(); auto weight_len = param.filter->numel(); auto weight1_len = weight_len / 2; @@ -33,12 +34,13 @@ void XPUSqueezeExcitationCompute::PrepareForRun() { auto weight2_dims = paddle::lite::DDimLite(); weight1_dims.ConstructFrom({weight1_len}); weight2_dims.ConstructFrom({weight2_len}); + auto max_ptr_len = ctx.GetRawContext()->max_ptr_size(); quant_weight1_ = TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - weight_ptr, weight1_dims, false); + weight_ptr, weight1_dims, false, max_ptr_len); quant_weight2_ = TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( - weight_ptr + weight1_len, weight2_dims, false); + weight_ptr + weight1_len, weight2_dims, false, 
max_ptr_len); } void XPUSqueezeExcitationCompute::Run() { diff --git a/lite/kernels/xpu/activation_compute.cc b/lite/kernels/xpu/activation_compute.cc index 867acb68205..bb92854f0b8 100644 --- a/lite/kernels/xpu/activation_compute.cc +++ b/lite/kernels/xpu/activation_compute.cc @@ -21,13 +21,14 @@ namespace lite { namespace kernels { namespace xpu { -void ReluCompute::Run() { +template +void ReluCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int r = xdnn::relu(ctx.GetRawContext(), - param.X->data(), - param.Out->mutable_data(TARGET(kXPU)), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), param.X->numel()); CHECK_EQ(r, 0); } @@ -54,24 +55,26 @@ void GeluCompute::Run() { CHECK_EQ(r, 0); } -void TanhCompute::Run() { +template +void TanhCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int r = xdnn::tanh(ctx.GetRawContext(), - param.X->data(), - param.Out->mutable_data(TARGET(kXPU)), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), param.X->numel()); CHECK_EQ(r, 0); } -void SigmoidCompute::Run() { +template +void SigmoidCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); int r = xdnn::sigmoid(ctx.GetRawContext(), - param.X->data(), - param.Out->mutable_data(TARGET(kXPU)), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), param.X->numel()); CHECK_EQ(r, 0); } @@ -205,13 +208,13 @@ void HardSigmoidCompute::Run() { CHECK_EQ(r, 0); } -void LeakyReluCompute::Run() { +template +void LeakyReluCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); - int r = xdnn::leaky_relu(ctx.GetRawContext(), - param.X->data(), - param.Out->mutable_data(TARGET(kXPU)), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), param.X->numel(), param.Leaky_relu_alpha); CHECK_EQ(r, 0); @@ -274,12 +277,20 @@ void PReluCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - relu, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ReluCompute, def) +using reluFP32 = + paddle::lite::kernels::xpu::ReluCompute; +using reluFP16 = + paddle::lite::kernels::xpu::ReluCompute; +REGISTER_LITE_KERNEL(relu, kXPU, kFloat, kNCHW, reluFP32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(relu, kXPU, kFP16, kNCHW, reluFP16, reluFP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL( relu6, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Relu6Compute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) @@ -292,21 +303,31 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); -REGISTER_LITE_KERNEL( - tanh, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::TanhCompute, def) +using tanhFP32 = + paddle::lite::kernels::xpu::TanhCompute; +using tanhFP16 = + paddle::lite::kernels::xpu::TanhCompute; +REGISTER_LITE_KERNEL(tanh, kXPU, kFloat, kNCHW, tanhFP32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(tanh, kXPU, kFP16, kNCHW, tanhFP16, tanhFP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", 
{LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); -REGISTER_LITE_KERNEL(sigmoid, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::SigmoidCompute, - def) +using sigmoidFP32 = + paddle::lite::kernels::xpu::SigmoidCompute; +using sigmoidFP16 = + paddle::lite::kernels::xpu::SigmoidCompute; +REGISTER_LITE_KERNEL(sigmoid, kXPU, kFloat, kNCHW, sigmoidFP32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(sigmoid, kXPU, kFP16, kNCHW, sigmoidFP16, sigmoidFP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); REGISTER_LITE_KERNEL( abs, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::AbsCompute, def) @@ -386,16 +407,21 @@ REGISTER_LITE_KERNEL(hard_swish, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); -REGISTER_LITE_KERNEL(leaky_relu, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::LeakyReluCompute, - def) +using leaky_reluFP32 = + paddle::lite::kernels::xpu::LeakyReluCompute; +using leaky_reluFP16 = + paddle::lite::kernels::xpu::LeakyReluCompute; +REGISTER_LITE_KERNEL(leaky_relu, kXPU, kFloat, kNCHW, leaky_reluFP32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL( + leaky_relu, kXPU, kFP16, kNCHW, leaky_reluFP16, leaky_reluFP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(softsign, kXPU, kFloat, diff --git a/lite/kernels/xpu/activation_compute.h b/lite/kernels/xpu/activation_compute.h index 057d527ef89..ab47e5ed580 100644 --- a/lite/kernels/xpu/activation_compute.h +++ b/lite/kernels/xpu/activation_compute.h @@ -20,7 +20,8 @@ namespace lite { namespace kernels { namespace xpu { -class ReluCompute : public KernelLite { +template +class ReluCompute : public KernelLite { public: using param_t = operators::ActivationParam; @@ -47,7 +48,8 @@ class GeluCompute : public KernelLite { virtual ~GeluCompute() = default; }; -class TanhCompute : public KernelLite { +template +class TanhCompute : public KernelLite { public: using param_t = operators::ActivationParam; @@ -56,7 +58,8 @@ class TanhCompute : public KernelLite { virtual ~TanhCompute() = default; }; -class SigmoidCompute : public KernelLite { +template +class SigmoidCompute : public KernelLite { public: using param_t = operators::ActivationParam; @@ -164,7 +167,8 @@ class HardSigmoidCompute : public KernelLite { virtual ~HardSigmoidCompute() = default; }; -class LeakyReluCompute : public KernelLite { +template +class LeakyReluCompute : public KernelLite { public: using param_t = operators::ActivationParam; diff --git a/lite/kernels/xpu/calib_compute.cc b/lite/kernels/xpu/calib_compute.cc index 34a6fb53d72..dc134fde02a 100644 --- a/lite/kernels/xpu/calib_compute.cc +++ b/lite/kernels/xpu/calib_compute.cc @@ -29,6 +29,9 @@ void CalibCompute::Run() { int numel = param.input->numel(); const auto* in_data = param.input->template data(); auto* out_data = param.output->template mutable_data(TARGET(kXPU)); + if (numel == 0) { + return; + } int r = xdnn::cast_v2( ctx.GetRawContext(), in_data, out_data, numel); CHECK_EQ(r, 0); @@ -43,31 +46,69 @@ using xpu_calib_int64_to_int32 = paddle::lite::kernels::xpu::CalibCompute; using 
xpu_calib_int32_to_int64 = paddle::lite::kernels::xpu::CalibCompute; +using xpu_calib_fp32_to_fp16 = + paddle::lite::kernels::xpu::CalibCompute; +using xpu_calib_fp16_to_fp32 = + paddle::lite::kernels::xpu::CalibCompute; REGISTER_LITE_KERNEL( - calib, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, int64_to_int32) + calib, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, calib_int64_to_int32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); REGISTER_LITE_KERNEL( - calib, kXPU, kFloat, kNCHW, xpu_calib_int32_to_int64, int32_to_int64) + calib, kXPU, kFloat, kNCHW, xpu_calib_int32_to_int64, calib_int32_to_int64) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); REGISTER_LITE_KERNEL( - calib_once, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, int64_to_int32) + calib, kXPU, kFloat, kNCHW, xpu_calib_fp32_to_fp16, calib_fp32_to_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib, kXPU, kFloat, kNCHW, xpu_calib_fp16_to_fp32, calib_fp16_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL(calib_once, + kXPU, + kFloat, + kNCHW, + xpu_calib_int64_to_int32, + calib_int64_to_int32) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); -REGISTER_LITE_KERNEL( - calib_once, kXPU, kFloat, kNCHW, xpu_calib_int32_to_int64, int32_to_int64) +REGISTER_LITE_KERNEL(calib_once, + kXPU, + kFloat, + kNCHW, + xpu_calib_int32_to_int64, + calib_int32_to_int64) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kXPU, kFloat, kNCHW, xpu_calib_fp32_to_fp16, calib_fp32_to_fp16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, kXPU, kFloat, kNCHW, xpu_calib_fp16_to_fp32, calib_fp16_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); diff --git a/lite/kernels/xpu/compare_compute.cc b/lite/kernels/xpu/compare_compute.cc index 51b92cc092a..d4cf45714fa 100644 --- a/lite/kernels/xpu/compare_compute.cc +++ b/lite/kernels/xpu/compare_compute.cc @@ -46,6 +46,18 @@ struct EqualFunctor { } }; +template +struct GreaterThanFunctor { + inline int operator()(xdnn::Context* ctx, + const T* x, + const T* y, + bool* z, + const std::vector& xshape, + const std::vector& yshape) const { + return xdnn::broadcast_greater_than(ctx, x, y, z, xshape, yshape); + } +}; + template void CompareCompute::Run() { auto& param = this->template Param(); @@ -224,3 +236,65 @@ REGISTER_LITE_KERNEL(equal, kXPU, kFloat, kAny, euqal_int64, int64) DATALAYOUT(kAny))}) .BindPaddleOpVersion("equal", 1) .Finalize(); + +using greater_than_float = 
paddle::lite::kernels::xpu::CompareCompute< + PRECISION(kFloat), + float, + paddle::lite::kernels::xpu::GreaterThanFunctor>; +REGISTER_LITE_KERNEL(greater_than, kXPU, kFloat, kAny, greater_than_float, def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kBool), + DATALAYOUT(kAny))}) + .BindPaddleOpVersion("greater_than", 1) + .Finalize(); + +using greater_than_int32 = paddle::lite::kernels::xpu::CompareCompute< + PRECISION(kFloat), + int, + paddle::lite::kernels::xpu::GreaterThanFunctor>; +REGISTER_LITE_KERNEL( + greater_than, kXPU, kFloat, kAny, greater_than_int32, int32) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kInt32), + DATALAYOUT(kAny))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kInt32), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kBool), + DATALAYOUT(kAny))}) + .BindPaddleOpVersion("greater_than", 1) + .Finalize(); + +using greater_than_int64 = paddle::lite::kernels::xpu::CompareCompute< + PRECISION(kFloat), + int64_t, + paddle::lite::kernels::xpu::GreaterThanFunctor>; +REGISTER_LITE_KERNEL( + greater_than, kXPU, kFloat, kAny, greater_than_int64, int64) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kInt64), + DATALAYOUT(kAny))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kInt64), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kXPU), + PRECISION(kBool), + DATALAYOUT(kAny))}) + .BindPaddleOpVersion("greater_than", 1) + .Finalize(); diff --git a/lite/kernels/xpu/concat_compute.cc b/lite/kernels/xpu/concat_compute.cc index e3fc5ef554d..9eceace16f5 100644 --- a/lite/kernels/xpu/concat_compute.cc +++ b/lite/kernels/xpu/concat_compute.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "lite/kernels/xpu/concat_compute.h" + #include #include + #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -23,8 +25,8 @@ namespace lite { namespace kernels { namespace xpu { -template -void ConcatCompute::Run() { +template +void ConcatCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -34,7 +36,7 @@ void ConcatCompute::Run() { ? 
param.axis + static_cast(ins[0]->dims().size()) : param.axis; - std::vector x_list; + std::vector x_list; std::vector> xdims_list; for (int i = 0; i < ins.size(); i++) { if (ins[i]->numel() > 0) { @@ -46,14 +48,14 @@ void ConcatCompute::Run() { xdims_list[i].back() = xdims_list[i].back() * 2; } x_list.push_back( - reinterpret_cast(ins[i]->template data())); + reinterpret_cast(ins[i]->template data())); } } if (x_list.size() > 1) { - int r = xdnn::concat( + int r = xdnn::concat( ctx.GetRawContext(), x_list, - reinterpret_cast( + reinterpret_cast( out->template mutable_data(TARGET(kXPU))), xdims_list, axis); @@ -75,37 +77,45 @@ void ConcatCompute::Run() { } // namespace kernels } // namespace lite } // namespace paddle - -REGISTER_LITE_KERNEL(concat, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::ConcatCompute, - def) +using concatfp32 = + paddle::lite::kernels::xpu::ConcatCompute; +using concatfp16 = + paddle::lite::kernels::xpu::ConcatCompute; +using concati16 = + paddle::lite::kernels::xpu::ConcatCompute; +using concati32 = + paddle::lite::kernels::xpu::ConcatCompute; +using concati64 = + paddle::lite::kernels::xpu::ConcatCompute; +REGISTER_LITE_KERNEL(concat, kXPU, kFloat, kNCHW, concatfp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .BindInput("AxisTensor", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .Finalize(); -REGISTER_LITE_KERNEL(concat, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::ConcatCompute, - concat_i32) +REGISTER_LITE_KERNEL(concat, kXPU, kFP16, kNCHW, concatfp16, concat_FP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL(concat, kXPU, kInt16, kNCHW, concati16, concat_INT16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt16))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt16))}) + .Finalize(); + +REGISTER_LITE_KERNEL(concat, kXPU, kInt32, kNCHW, concati32, concat_INT32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("AxisTensor", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); -REGISTER_LITE_KERNEL(concat, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::ConcatCompute, - concat_i64) +REGISTER_LITE_KERNEL(concat, kXPU, kInt64, kNCHW, concati64, concat_INT64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindInput("AxisTensor", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) diff --git a/lite/kernels/xpu/concat_compute.h b/lite/kernels/xpu/concat_compute.h index 218c4704557..964f94f8194 100644 --- a/lite/kernels/xpu/concat_compute.h +++ b/lite/kernels/xpu/concat_compute.h @@ -21,8 +21,8 @@ namespace lite { namespace kernels { namespace xpu { -template -class ConcatCompute : public KernelLite { +template +class ConcatCompute : public KernelLite { public: using param_t = operators::ConcatParam; diff --git a/lite/kernels/xpu/conv2d_transpose_compute.cc b/lite/kernels/xpu/conv2d_transpose_compute.cc index 7949b193c56..0ec8532b4bc 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.cc +++ 
b/lite/kernels/xpu/conv2d_transpose_compute.cc @@ -22,6 +22,23 @@ namespace lite { namespace kernels { namespace xpu { +template <> +void Conv2dTransposeCompute::PrepareForRun() { + int cur_dev_idx = 0; + + XPU_CALL(xpu_current_device(&cur_dev_idx)); + XPU_CALL(xpu_device_get_attr(&cur_dev_attr_, XPUATTR_MODEL, cur_dev_idx)); + if (cur_dev_attr_ <= 1) { + VLOG(4) << "Current XPU device : XPU1"; + } else if (cur_dev_attr_ >= 2 && cur_dev_attr_ <= 299) { + VLOG(4) << "Current XPU device : XPU2"; + } else if (cur_dev_attr_ >= 300 && cur_dev_attr_ <= 599) { + VLOG(4) << "Current XPU device : XPU3"; + } else { + VLOG(4) << "invalid XPU device"; + } +} + template <> void Conv2dTransposeCompute::Run() { auto& param = this->template Param(); @@ -37,27 +54,53 @@ void Conv2dTransposeCompute::Run() { auto dilations = *param.dilations; if (param.output_padding.empty()) { - int ret = xdnn::conv2d_transpose( - ctx.GetRawContext(), - param.x->data(), - param.filter->data(), - param.output->mutable_data(TARGET(kXPU)), - in_dims[0], - in_dims[1], - in_dims[2], - in_dims[3], - out_dims[1], - std::vector{static_cast(w_dims[2]), - static_cast(w_dims[3])}, - strides, - paddings, - dilations, - groups, - nullptr, - nullptr, - nullptr, - true); - CHECK_EQ(ret, 0); + if (cur_dev_attr_ <= 1) { + int ret = xdnn::conv2d_transpose( + ctx.GetRawContext(), + param.x->data(), + param.filter->data(), + param.output->mutable_data(TARGET(kXPU)), + in_dims[0], + in_dims[1], + in_dims[2], + in_dims[3], + out_dims[1], + std::vector{static_cast(w_dims[2]), + static_cast(w_dims[3])}, + strides, + paddings, + dilations, + groups, + nullptr, + nullptr, + nullptr, + true); + CHECK_EQ(ret, 0); + } else { + int ret = xdnn::conv2d_transpose_fusion( + ctx.GetRawContext(), + param.x->data(), + param.filter->data(), + param.output->mutable_data(TARGET(kXPU)), + in_dims[0], + in_dims[1], + in_dims[2], + in_dims[3], + out_dims[1], + std::vector{static_cast(w_dims[2]), + static_cast(w_dims[3])}, + strides, + paddings, + dilations, + groups, + nullptr, + nullptr, + nullptr, + nullptr, + xdnn::Activation_t::LINEAR, + true); + CHECK_EQ(ret, 0); + } } else { int n = in_dims[0]; int yc = in_dims[1]; diff --git a/lite/kernels/xpu/conv2d_transpose_compute.h b/lite/kernels/xpu/conv2d_transpose_compute.h index 5a3d8714fd4..6e779fc42ad 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.h +++ b/lite/kernels/xpu/conv2d_transpose_compute.h @@ -28,9 +28,11 @@ class Conv2dTransposeCompute : public KernelLite { public: using param_t = operators::ConvParam; + void PrepareForRun() override; void Run() override; virtual ~Conv2dTransposeCompute() = default; + uint64_t cur_dev_attr_ = 0; }; } // namespace xpu diff --git a/lite/kernels/xpu/conv3d_compute.cc b/lite/kernels/xpu/conv3d_compute.cc index cc3ad389679..cd5b79c21fc 100644 --- a/lite/kernels/xpu/conv3d_compute.cc +++ b/lite/kernels/xpu/conv3d_compute.cc @@ -22,8 +22,27 @@ namespace lite { namespace kernels { namespace xpu { -template <> -void Conv3DCompute::Run() { +template +void Conv3DCompute::PrepareForRun() { + auto& ctx = this->ctx_->template As(); + auto& param = this->template Param(); + auto filter_ptr = param.filter->template data(); + auto filter_dims = param.filter->dims(); + xpu_quant_filter_ = + TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight( + filter_ptr, filter_dims, false, ctx.GetRawContext()->max_ptr_size()); +} + +template +void Conv3DCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -34,11 +53,11 @@ void Conv3DCompute::Run() { 
auto paddings = *param.paddings; auto dilations = *param.dilations; - int r = xdnn::conv3d( + int r = xdnn::conv3d( ctx.GetRawContext(), /* context */ - param.x->data(), - param.filter->data(), /* weight */ - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + reinterpret_cast(xpu_quant_filter_.data_ptr_), /* weight */ + param.output->template mutable_data(TARGET(kXPU)), x_dims[0], /* input_n */ x_dims[1], /* input_c */ x_dims[2], /* input_d */ @@ -53,7 +72,7 @@ void Conv3DCompute::Run() { dilations, groups, nullptr, - nullptr, + reinterpret_cast(xpu_quant_filter_.max_ptr_), nullptr, true /*is_ncdhw*/); CHECK_EQ(r, 0); @@ -65,11 +84,61 @@ void Conv3DCompute::Run() { } // namespace paddle namespace xpu = paddle::lite::kernels::xpu; -using Conv3dFp32 = xpu::Conv3DCompute; -REGISTER_LITE_KERNEL(conv3d, kXPU, kFloat, kNCHW, Conv3dFp32, def) +using XPUConv3dFP32 = + xpu::Conv3DCompute; + +using XPUConv3d_FP16_FP32_FP32 = + xpu::Conv3DCompute; + +using XPUConv3dFp16 = + xpu::Conv3DCompute; + +using XPUConv3d_FP16_FP16_FP32 = + xpu::Conv3DCompute; + +using XPUConv3d_FP16_FP32_FP16 = + xpu::Conv3DCompute; + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFloat, kNCHW, XPUConv3dFP32, XPU_Real_kFloat) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(conv3d, kXPU, kFloat, kNCHW, XPUConv3d_FP16_FP32_FP32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFP16, kNCHW, XPUConv3dFp16, XPU_FP16_FP16_FP16) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFP16, kNCHW, XPUConv3d_FP16_FP16_FP32, XPU_FP16_FP16_FP32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + conv3d, kXPU, kFP16, kNCHW, XPUConv3d_FP16_FP32_FP16, XPU_FP16_FP32_FP16) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/xpu/conv3d_compute.h b/lite/kernels/xpu/conv3d_compute.h index caadb82a1e8..4cd5fdaeca7 100644 --- a/lite/kernels/xpu/conv3d_compute.h +++ b/lite/kernels/xpu/conv3d_compute.h @@ -21,14 +21,22 @@ namespace lite { namespace kernels { namespace xpu { -template -class Conv3DCompute : public KernelLite { +template +class Conv3DCompute : public KernelLite { public: using param_t = operators::ConvParam; + void PrepareForRun() override; void Run() 
override; virtual ~Conv3DCompute() = default; + + private: + XPUQuantData xpu_quant_filter_; }; } // namespace xpu diff --git a/lite/kernels/xpu/elementwise_compute.cc b/lite/kernels/xpu/elementwise_compute.cc index aaf1c913209..4b8e0e158c5 100644 --- a/lite/kernels/xpu/elementwise_compute.cc +++ b/lite/kernels/xpu/elementwise_compute.cc @@ -132,10 +132,15 @@ void ElementwiseCompute::Run() { namespace xpu = paddle::lite::kernels::xpu; using AddFloat32 = xpu::ElementwiseCompute>; +using AddFloat16 = xpu::ElementwiseCompute>; using AddInt32 = xpu::ElementwiseCompute>; using AddInt64 = xpu::ElementwiseCompute>; + using SubFloat32 = xpu::ElementwiseCompute>; + using MulFloat32 = xpu::ElementwiseCompute>; +using MulFloat16 = xpu::ElementwiseCompute>; + using MulInt64 = xpu::ElementwiseCompute>; using DivFloat32 = xpu::ElementwiseCompute>; using MaxFloat32 = xpu::ElementwiseCompute>; @@ -147,6 +152,13 @@ REGISTER_LITE_KERNEL(elementwise_add, kXPU, kFloat, kNCHW, AddFloat32, def) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL( + elementwise_add, kXPU, kFloat, kNCHW, AddFloat16, DISABLE_XPU1_AddFloat16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(elementwise_add, kXPU, kFloat, kNCHW, AddInt32, int32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) @@ -171,6 +183,13 @@ REGISTER_LITE_KERNEL(elementwise_mul, kXPU, kFloat, kNCHW, MulFloat32, def) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL( + elementwise_mul, kXPU, kFloat, kNCHW, MulFloat16, DISABLE_XPU1_MulFloat16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(elementwise_mul, kXPU, kFloat, kNCHW, MulInt64, int64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) diff --git a/lite/kernels/xpu/gather_compute.cc b/lite/kernels/xpu/gather_compute.cc index f3eafc878fb..697204689d9 100644 --- a/lite/kernels/xpu/gather_compute.cc +++ b/lite/kernels/xpu/gather_compute.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "lite/kernels/xpu/gather_compute.h" + #include + #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -22,8 +24,8 @@ namespace lite { namespace kernels { namespace xpu { -template -void GatherCompute::Run() { +template +void GatherCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -46,88 +48,16 @@ void GatherCompute::Run() { axis += x_dims.size(); } - if (param.X->precision() == PrecisionType::kInt64 && - param.Index->precision() == PrecisionType::kInt64) { - auto* index_int64 = param.Index->template data(); - int size = param.Index->dims().production(); - XPUScratchPadGuard index_xpu_guard_ = - TargetWrapperXPU::MallocScratchPad(size * sizeof(int)); - int* index_int32_device = reinterpret_cast(index_xpu_guard_->addr_); - - int r0 = xdnn::cast_v2( - ctx.GetRawContext(), index_int64, index_int32_device, index->numel()); - CHECK_EQ(r0, 0); + int r = xdnn::gather( + ctx.GetRawContext(), + x->template data(), + index->template data(), + out->template mutable_data(TARGET(kXPU)), + x_dims, + index->numel(), + axis); - int r1 = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index_int32_device, - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r1, 0); - } else if (param.X->precision() == PrecisionType::kInt64 && - param.Index->precision() == PrecisionType::kInt32) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kInt32 && - param.Index->precision() == PrecisionType::kInt32) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kInt32 && - param.Index->precision() == PrecisionType::kInt64) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kFloat && - param.Index->precision() == PrecisionType::kInt32) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else if (param.X->precision() == PrecisionType::kFloat && - param.Index->precision() == PrecisionType::kInt64) { - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); - CHECK_EQ(r, 0); - } else { - LOG(FATAL) << "Unsupported gather op with x dtype: " - << lite_api::PrecisionToStr(param.X->precision()) - << " and index dtype: " - << lite_api::PrecisionToStr(param.Index->precision()); - } + CHECK_EQ(r, 0); } } // namespace xpu @@ -141,10 +71,21 @@ REGISTER_LITE_KERNEL(gather, kXPU, kFloat, kNCHW, GatherXPUFloatInt32, def) {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Axis", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .Finalize(); + REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, 
GatherXPUFloatInt64, gather_float_i64) + gather, kXPU, kFP16, kNCHW, GatherXPUkFP16Int32, gather_FP16_Int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindInput("Axis", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + gather, kXPU, kFloat, kNCHW, GatherXPUFloatInt64, gather_FP32_INT64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) @@ -153,7 +94,7 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt32Int32, gather_i32_i32) + gather, kXPU, kInt32, kNCHW, GatherXPUInt32Int32, gather_INT32_INT32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) @@ -162,7 +103,7 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt32Int64, gather_i32_i64) + gather, kXPU, kInt32, kNCHW, GatherXPUInt32Int64, gather_INT32_INT64) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) @@ -171,7 +112,7 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt64Int32, gather_i64_i32) + gather, kXPU, kInt64, kNCHW, GatherXPUInt64Int32, gather_INT64_INT32) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .BindInput("Index", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) @@ -179,12 +120,3 @@ REGISTER_LITE_KERNEL( {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); -REGISTER_LITE_KERNEL( - gather, kXPU, kFloat, kNCHW, GatherXPUInt64Int64, gather_i64_i64) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) - .BindInput("Index", - {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) - .BindInput("Axis", - {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) - .Finalize(); diff --git a/lite/kernels/xpu/gather_compute.h b/lite/kernels/xpu/gather_compute.h index a78be677d09..2363e8651ca 100644 --- a/lite/kernels/xpu/gather_compute.h +++ b/lite/kernels/xpu/gather_compute.h @@ -21,8 +21,8 @@ namespace lite { namespace kernels { namespace xpu { -template -class GatherCompute : public KernelLite { +template +class GatherCompute : public KernelLite { public: using param_t = operators::GatherParam; @@ -36,15 +36,27 @@ class GatherCompute : public KernelLite { } // namespace lite } // namespace paddle -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt32Int32; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt32Int64; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUFloatInt32; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef 
paddle::lite::kernels::xpu::GatherCompute + GatherXPUkFP16Int32; +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUFloatInt64; -typedef paddle::lite::kernels::xpu::GatherCompute +typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt64Int32; -typedef paddle::lite::kernels::xpu::GatherCompute - GatherXPUInt64Int64; diff --git a/lite/kernels/xpu/pool_compute.cc b/lite/kernels/xpu/pool_compute.cc index 9df03bc3c48..8211de7e438 100644 --- a/lite/kernels/xpu/pool_compute.cc +++ b/lite/kernels/xpu/pool_compute.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "lite/kernels/xpu/pool_compute.h" + #include #include + #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -22,8 +24,8 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -void Pool2DCompute::Run() { +template +void Pool2DCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -55,8 +57,8 @@ void Pool2DCompute::Run() { if (param.pooling_type == "avg") { int r = xdnn::adaptive_avg_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), x_dims[0], x_dims[1], x_dims[2], @@ -68,8 +70,8 @@ void Pool2DCompute::Run() { } else { int r = xdnn::adaptive_max_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), nullptr, x_dims[0], x_dims[1], @@ -82,10 +84,10 @@ void Pool2DCompute::Run() { } } else { if (param.pooling_type == "avg") { - int r = xdnn::avg_pool2d( + int r = xdnn::avg_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), x_dims[0], x_dims[1], x_dims[2], @@ -98,10 +100,10 @@ void Pool2DCompute::Run() { CHECK_EQ(r, 0); } else { if (param.pad_zero == true) { - int r = xdnn::max_pool2d( + int r = xdnn::max_pool2d( ctx.GetRawContext(), - param.x->data(), - param.output->mutable_data(TARGET(kXPU)), + param.x->template data(), + param.output->template mutable_data(TARGET(kXPU)), nullptr, x_dims[0], x_dims[1], @@ -113,7 +115,7 @@ void Pool2DCompute::Run() { true); CHECK_EQ(r, 0); } else { - const float* xpu_x_padded = nullptr; + const InType* xpu_x_padded = nullptr; std::vector xpu_x_padded_dims{static_cast(x_dims[0]), static_cast(x_dims[1]), static_cast(x_dims[2]), @@ -121,7 +123,7 @@ void Pool2DCompute::Run() { XPUScratchPadGuard xpu_x_padded_guard_; if (paddings[0] == 0 && paddings[1] == 0 && paddings[2] == 0 && paddings[3] == 0) { - xpu_x_padded = param.x->data(); + xpu_x_padded = param.x->template data(); } else { std::vector pad_left{0, 0, paddings[0], paddings[2]}; std::vector pad_right{0, 0, paddings[1], paddings[3]}; @@ -130,25 +132,25 @@ void Pool2DCompute::Run() { xpu_x_padded_dims[3] = xpu_x_padded_dims[3] + paddings[2] + paddings[3]; xpu_x_padded_guard_ = TargetWrapperXPU::MallocScratchPad( - sizeof(float) * xpu_x_padded_dims[0] * xpu_x_padded_dims[1] * + sizeof(InType) * xpu_x_padded_dims[0] * xpu_x_padded_dims[1] * xpu_x_padded_dims[2] * xpu_x_padded_dims[3]); - xpu_x_padded = reinterpret_cast(xpu_x_padded_guard_->addr_); - int r = xdnn::pad(ctx.GetRawContext(), - param.x->data(), - const_cast(xpu_x_padded), - {static_cast(x_dims[0]), - static_cast(x_dims[1]), - static_cast(x_dims[2]), - static_cast(x_dims[3])}, - pad_left, - pad_right, - 
-9999999.0f); + xpu_x_padded = reinterpret_cast(xpu_x_padded_guard_->addr_); + int r = xdnn::pad(ctx.GetRawContext(), + param.x->template data(), + const_cast(xpu_x_padded), + {static_cast(x_dims[0]), + static_cast(x_dims[1]), + static_cast(x_dims[2]), + static_cast(x_dims[3])}, + pad_left, + pad_right, + -9999999.0f); CHECK_EQ(r, 0); } - int r = xdnn::max_pool2d( + int r = xdnn::max_pool2d( ctx.GetRawContext(), xpu_x_padded, - param.output->mutable_data(TARGET(kXPU)), + param.output->template mutable_data(TARGET(kXPU)), nullptr, xpu_x_padded_dims[0], xpu_x_padded_dims[1], @@ -168,19 +170,29 @@ void Pool2DCompute::Run() { } // namespace kernels } // namespace lite } // namespace paddle +// (TODO:quwei) refactor pool2d + +using pool2d_fp32 = + paddle::lite::kernels::xpu::Pool2DCompute; +using pool2d_fp16 = + paddle::lite::kernels::xpu::Pool2DCompute; + +using max_pool2d_with_index_fp32 = + paddle::lite::kernels::xpu::Pool2DCompute; + +REGISTER_LITE_KERNEL(pool2d, kXPU, kFloat, kNCHW, pool2d_fp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); REGISTER_LITE_KERNEL( - pool2d, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::Pool2DCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + pool2d, kXPU, kFP16, kNCHW, pool2d_fp16, DISABLE_XPU1_pool2d_FP16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) .Finalize(); -REGISTER_LITE_KERNEL(max_pool2d_with_index, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::Pool2DCompute, - def) +REGISTER_LITE_KERNEL( + max_pool2d_with_index, kXPU, kFloat, kNCHW, max_pool2d_with_index_fp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kXPU))}) diff --git a/lite/kernels/xpu/pool_compute.h b/lite/kernels/xpu/pool_compute.h index 39e14f04a8c..c107b2877b1 100644 --- a/lite/kernels/xpu/pool_compute.h +++ b/lite/kernels/xpu/pool_compute.h @@ -20,8 +20,8 @@ namespace paddle { namespace lite { namespace kernels { namespace xpu { - -class Pool2DCompute : public KernelLite { +template +class Pool2DCompute : public KernelLite { public: using param_t = operators::PoolParam; diff --git a/lite/kernels/xpu/reduce_compute.cc b/lite/kernels/xpu/reduce_compute.cc index da2477d48ba..8563ec4d601 100644 --- a/lite/kernels/xpu/reduce_compute.cc +++ b/lite/kernels/xpu/reduce_compute.cc @@ -154,6 +154,8 @@ using ReduceAll = xpu::ReduceCompute>; using ReduceAny = xpu::ReduceCompute>; using ReduceMeanFloat32 = xpu::ReduceCompute>; +using ReduceMeanFloat16 = + xpu::ReduceCompute>; using ReduceSumFloat32 = xpu::ReduceCompute>; using ReduceProdFloat32 = @@ -178,6 +180,16 @@ REGISTER_LITE_KERNEL(reduce_mean, kXPU, kFloat, kNCHW, ReduceMeanFloat32, def) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(reduce_mean, + kXPU, + kFloat, + kNCHW, + ReduceMeanFloat16, + DISABLE_XPU1_ReduceMeanFloat16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(reduce_sum, kXPU, kFloat, kNCHW, ReduceSumFloat32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("Out", 
{LiteType::GetTensorTy(TARGET(kXPU))}) diff --git a/lite/kernels/xpu/reshape_compute.cc b/lite/kernels/xpu/reshape_compute.cc index 78359443991..c82e367e9eb 100644 --- a/lite/kernels/xpu/reshape_compute.cc +++ b/lite/kernels/xpu/reshape_compute.cc @@ -69,6 +69,21 @@ REGISTER_LITE_KERNEL(reshape2, .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); +REGISTER_LITE_KERNEL(reshape2, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ReshapeCompute, + float16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); + REGISTER_LITE_KERNEL(reshape2, kXPU, kFloat, @@ -113,6 +128,20 @@ REGISTER_LITE_KERNEL(reshape, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(reshape, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ReshapeCompute, + float16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(flatten, kXPU, kFloat, @@ -125,6 +154,18 @@ REGISTER_LITE_KERNEL(flatten, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .Finalize(); +REGISTER_LITE_KERNEL(flatten, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ReshapeCompute, + float16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .Finalize(); + REGISTER_LITE_KERNEL(flatten2, kXPU, kFloat, @@ -137,3 +178,16 @@ REGISTER_LITE_KERNEL(flatten2, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); + +REGISTER_LITE_KERNEL(flatten2, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::ReshapeCompute, + float16) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindInput("Shape", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); diff --git a/lite/kernels/xpu/slice_compute.cc b/lite/kernels/xpu/slice_compute.cc index 6353d6114e6..52a2669d083 100644 --- a/lite/kernels/xpu/slice_compute.cc +++ b/lite/kernels/xpu/slice_compute.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "lite/kernels/xpu/slice_compute.h" +#include +#include #include "lite/backends/xpu/xpu_header_sitter.h" #include "lite/core/op_registry.h" @@ -21,6 +23,62 @@ namespace lite { namespace kernels { namespace xpu { +template +void DealTensorArray(XPUContext ctx, + const operators::SliceParam& param, + const std::vector& starts, + const std::vector& ends, + bool out_is_array) { + auto in_array = param.XTensorList; + // If the input is LoDTensorArray, the rank of input is 1. + int64_t in_size = in_array->size(); + int64_t start = starts[0] < 0 ? 
(starts[0] + in_size) : starts[0]; + int64_t end = ends[0] < 0 ? (ends[0] + in_size) : ends[0]; + + start = std::max(start, static_cast(0)); + end = std::max(end, static_cast(0)); + end = std::min(end, in_size); + + CHECK_GT(end, start) << "end should greater than start"; + int64_t out_size = end - start; + + if (out_is_array) { + auto out_array = param.OutTensorList; + out_array->resize(out_size); + for (int i = 0; i < out_size; ++i) { + auto* out_tensor = &out_array->at(i); + auto in_tensor = in_array->at(i + start); + out_tensor->Resize(in_tensor.dims()); + out_tensor->set_lod(in_tensor.lod()); + out_tensor->set_precision(in_tensor.precision()); + if (in_tensor.memory_size() > 0) { + out_tensor->mutable_data(TARGET(kXPU), in_tensor.memory_size()); + int r = xdnn::copy(ctx.GetRawContext(), + in_tensor.template data(), + static_cast(out_tensor->raw_data()), + in_tensor.numel()); + CHECK_EQ(r, 0) << " write to array failed"; + } else { + VLOG(4) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << i << "]."; + } + } + } else { + auto out_tensor = param.Out; + auto in_tensor = in_array->at(start); + out_tensor->Resize(in_tensor.dims()); + out_tensor->set_lod(in_tensor.lod()); + out_tensor->set_precision(in_tensor.precision()); + out_tensor->mutable_data(TARGET(kXPU), in_tensor.memory_size()); + int r = xdnn::copy(ctx.GetRawContext(), + in_tensor.data(), + static_cast(out_tensor->raw_data()), + in_tensor.numel()); + CHECK_EQ(r, 0) << " write to array failed"; + } +} + inline std::vector GetIntDataFromTensorList( const std::vector& list_tensor) { std::vector vec_data; @@ -77,8 +135,6 @@ void SliceCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); - auto out = param.Out; - auto in = param.X; auto axes = param.axes; auto StartsTensor = param.StartsTensor; auto EndsTensor = param.EndsTensor; @@ -89,9 +145,6 @@ void SliceCompute::Run() { auto infer_flags = param.infer_flags; auto decrease_axis = param.decrease_axis; - auto out_dims = out->dims(); - auto in_dims = in->dims(); - bool need_infer = false; if (StartsTensor || EndsTensor) { need_infer = true; @@ -114,52 +167,69 @@ void SliceCompute::Run() { } CHECK_EQ(ends.size(), axes.size()) << "The size of ends must be equal to the size of axes."; - out_dims = in_dims; - int dim_value, start, end; - for (size_t i = 0; i < axes.size(); ++i) { - dim_value = out_dims[axes[i]]; - if (dim_value > 0) { - // when end = start + 1 and start == -1 - if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { - auto ret = - std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); - if (ret != decrease_axis.end()) { - ends[i] = 10000000; - } - } + } + // if slice input is tensor_array + if (param.X == nullptr && param.XTensorList != nullptr) { + DealTensorArray( + ctx, + param, + starts, + ends, + (param.Out == nullptr && param.OutTensorList != nullptr)); + return; + } - start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; - end = ends[i] < 0 ? 
(ends[i] + dim_value) : ends[i]; - start = (std::max)(start, 0); - end = (std::max)(end, 0); - end = (std::min)(end, dim_value); - CHECK_GT(end, start) << "end should greater than start"; - out_dims[axes[i]] = end - start; + auto out = param.Out; + auto in = param.X; + auto out_dims = out->dims(); + auto in_dims = in->dims(); + out_dims = in_dims; + int dim_value, start, end; + for (size_t i = 0; i < axes.size(); ++i) { + dim_value = out_dims[axes[i]]; + if (dim_value > 0) { + // when end = start + 1 and start == -1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = + std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = 10000000; + } } + + start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; + end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; + start = (std::max)(start, 0); + end = (std::max)(end, 0); + end = (std::min)(end, dim_value); + CHECK_GT(end, start) << "end should greater than start"; + out_dims[axes[i]] = end - start; } - out->Resize(out_dims); - // generate new shape - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - CHECK_EQ(out_dims[decrease_axis[i]], 1) << "decrease dim should be 1"; - out_dims[decrease_axis[i]] = 0; - } + } - for (size_t i = 0; i < out_dims.size(); ++i) { - if (out_dims[i] != 0) { - new_out_shape.push_back(out_dims[i]); - } - } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } + out->Resize(out_dims); + // generate new shape + if (decrease_axis.size() > 0) { + std::vector new_out_shape; + for (size_t i = 0; i < decrease_axis.size(); ++i) { + CHECK_EQ(out_dims[decrease_axis[i]], 1) << "decrease dim should be 1"; + out_dims[decrease_axis[i]] = 0; + } - DDim new_dims; - new_dims.ConstructFrom(new_out_shape); - out_dims = new_dims; + for (size_t i = 0; i < out_dims.size(); ++i) { + if (out_dims[i] != 0) { + new_out_shape.push_back(out_dims[i]); + } + } + if (new_out_shape.size() == 0) { + new_out_shape.push_back(1); } + + DDim new_dims; + new_dims.ConstructFrom(new_out_shape); + out_dims = new_dims; } + auto x_shape = in_dims.Vectorize(); std::vector x_shape_(x_shape.begin(), x_shape.end()); std::vector x_dim_begin_(in_dims.size(), 0); @@ -205,6 +275,21 @@ REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceFloat32, def) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) .Finalize(); +using SliceFloat32 = paddle::lite::kernels::xpu::SliceCompute; +REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceFloat32, array_def) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); + using SliceInt32 = paddle::lite::kernels::xpu::SliceCompute; REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceInt32, int32) .BindInput("Input", @@ -220,6 +305,21 @@ REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceInt32, int32) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) .Finalize(); +using SliceInt32 = paddle::lite::kernels::xpu::SliceCompute; 
+REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceInt32, array_int32) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .Finalize(); + using SliceInt64 = paddle::lite::kernels::xpu::SliceCompute; REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceInt64, int64) .BindInput("Input", @@ -234,3 +334,18 @@ REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceInt64, int64) {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); + +using SliceInt64 = paddle::lite::kernels::xpu::SliceCompute; +REGISTER_LITE_KERNEL(slice, kXPU, kFloat, kAny, SliceInt64, array_int64) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/xpu/stack_compute.cc b/lite/kernels/xpu/stack_compute.cc index 0960c05a63c..373f682b958 100644 --- a/lite/kernels/xpu/stack_compute.cc +++ b/lite/kernels/xpu/stack_compute.cc @@ -21,7 +21,8 @@ namespace lite { namespace kernels { namespace xpu { -void StackCompute::Run() { +template +void StackCompute::Run() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); @@ -39,15 +40,15 @@ void StackCompute::Run() { x_shape[axis] = 1; std::vector> xdims_list(n, x_shape); - std::vector x_list(n, nullptr); + std::vector x_list(n, nullptr); for (int i = 0; i < n; ++i) { - x_list[i] = param.X[i]->data(); + x_list[i] = param.X[i]->template data(); } - int r = xdnn::concat(ctx.GetRawContext(), - x_list, - param.Out->mutable_data(TARGET(kXPU)), - xdims_list, - axis); + int r = xdnn::concat(ctx.GetRawContext(), + x_list, + param.Out->template mutable_data(TARGET(kXPU)), + xdims_list, + axis); CHECK_EQ(r, 0); } @@ -56,8 +57,16 @@ void StackCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - stack, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::StackCompute, def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) - .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) +using stack_float = + paddle::lite::kernels::xpu::StackCompute; +REGISTER_LITE_KERNEL(stack, kXPU, kFloat, kNCHW, stack_float, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))}) + .Finalize(); + +using stack_int64 = + paddle::lite::kernels::xpu::StackCompute; +REGISTER_LITE_KERNEL(stack, kXPU, kFloat, kNCHW, stack_int64, int64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); diff --git 
a/lite/kernels/xpu/stack_compute.h b/lite/kernels/xpu/stack_compute.h index 00f01b9466a..3e6dd033de6 100644 --- a/lite/kernels/xpu/stack_compute.h +++ b/lite/kernels/xpu/stack_compute.h @@ -23,7 +23,8 @@ namespace lite { namespace kernels { namespace xpu { -class StackCompute : public KernelLite { +template +class StackCompute : public KernelLite { public: using param_t = operators::StackParam; diff --git a/lite/kernels/xpu/tile_compute.cc b/lite/kernels/xpu/tile_compute.cc new file mode 100644 index 00000000000..79007b85dcb --- /dev/null +++ b/lite/kernels/xpu/tile_compute.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/tile_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +void TileCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto repeat_times = param.repeat_times; + if (param.RepeatTimes) { + auto repeat_times_size = param.RepeatTimes->data_size(); + for (int64_t i = 0; i < repeat_times_size; i++) { + repeat_times.push_back(param.RepeatTimes->template data()[i]); + } + } else if (param.repeat_times_tensor.size() != 0) { + for (int i = 0; i < param.repeat_times_tensor.size(); i++) { + auto temp = param.repeat_times_tensor[i]; + repeat_times.push_back(*(temp->template data())); + } + } + auto in_dims = param.X->dims(); + auto vec_in_dims = in_dims.Vectorize(); + // broadcast for vec_in_dims.size() equal to repeat_times.size() + if (repeat_times.size() < vec_in_dims.size()) { + int diff = vec_in_dims.size() - repeat_times.size(); + repeat_times.insert(repeat_times.begin(), diff, 1); + } else { + int diff = repeat_times.size() - vec_in_dims.size(); + vec_in_dims.insert(vec_in_dims.begin(), diff, 1); + } + + std::vector new_in_dims(vec_in_dims.begin(), vec_in_dims.end()); + std::vector out_dims(param.Out->dims().data().begin(), + param.Out->dims().data().end()); + int r = xdnn::broadcast(ctx.GetRawContext(), + param.X->template data(), + param.Out->template mutable_data(TARGET(kXPU)), + new_in_dims, + out_dims); + + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +using tile_float = + paddle::lite::kernels::xpu::TileCompute; +REGISTER_LITE_KERNEL(tile, kXPU, kFloat, kNCHW, tile_float, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("RepeatTimes", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindInput("repeat_times_tensor", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/tile_compute.h b/lite/kernels/xpu/tile_compute.h new file mode 100644 index 00000000000..9b6329fa17c --- /dev/null +++ b/lite/kernels/xpu/tile_compute.h @@ -0,0 +1,36 @@ +// 
Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +class TileCompute : public KernelLite { + public: + using param_t = operators::TileParam; + + virtual void Run(); + + virtual ~TileCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/transpose_compute.cc b/lite/kernels/xpu/transpose_compute.cc index d1c9553ba71..19441de2849 100644 --- a/lite/kernels/xpu/transpose_compute.cc +++ b/lite/kernels/xpu/transpose_compute.cc @@ -75,6 +75,18 @@ REGISTER_LITE_KERNEL(transpose2, .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); +REGISTER_LITE_KERNEL(transpose2, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::TransposeCompute, + def_int32) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("XShape", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .Finalize(); + REGISTER_LITE_KERNEL(transpose2, kXPU, kFloat, diff --git a/lite/operators/__xpu__embedding_with_eltwise_add_op.cc b/lite/operators/__xpu__embedding_with_eltwise_add_op.cc index 13819d61046..d3c1b7e1f30 100644 --- a/lite/operators/__xpu__embedding_with_eltwise_add_op.cc +++ b/lite/operators/__xpu__embedding_with_eltwise_add_op.cc @@ -33,7 +33,12 @@ bool XPUEmbeddingWithEltwiseAddOp::CheckShape() const { } } if (param_.Mask != nullptr) { - CHECK_EQ(id_rank, param_.Mask->dims().size()); + if (id_rank != param_.Mask->dims().size()) { + CHECK(id_rank == 2 && param_.Mask->dims().size() == 3 && + param_.Mask->dims()[2] == 1) + << "unsupported id_rank: " << id_rank + << "mask_dims_size: " << param_.Mask->dims().size(); + } for (size_t j = 0; j < id_rank; j++) { CHECK_EQ(ids_dim[j], param_.Mask->dims()[j]); } diff --git a/lite/operators/__xpu__fc_op.cc b/lite/operators/__xpu__fc_op.cc index 21f6faebcb5..71357bf6338 100644 --- a/lite/operators/__xpu__fc_op.cc +++ b/lite/operators/__xpu__fc_op.cc @@ -107,16 +107,13 @@ bool XPUFcOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { param_.input_max = scope->FindVar(op_desc.Input("InputMax").front())->GetMutable(); } - if (op_desc.HasAttr("precision")) { - param_.precision = op_desc.GetAttr("precision"); - } + if (op_desc.HasAttr("enable_int8") && op_desc.GetAttr("enable_int8")) { - CHECK(param_.precision == "int8") << "enable_int8 precison:" - << param_.precision; + param_.enable_int8 = op_desc.GetAttr("enable_int8"); param_.quant_input_max = 127 * op_desc.GetAttr>("X0_scale")[0]; - param_.quant_w_max = - 127 * op_desc.GetAttr>("Y0_scale")[0]; + param_.weight_max = op_desc.GetAttr>("Y0_max"); + param_.per_channel = op_desc.GetAttr("per_channel"); } return true; } diff --git a/lite/operators/__xpu__multi_encoder_op.cc 
b/lite/operators/__xpu__multi_encoder_op.cc index e97b3a26f6d..397a1b154d1 100644 --- a/lite/operators/__xpu__multi_encoder_op.cc +++ b/lite/operators/__xpu__multi_encoder_op.cc @@ -58,7 +58,11 @@ bool XPUMultiEncoderOp::InferShapeImpl() const { new_dims.ConstructFrom(new_out_shape); out_dims = new_dims; } - param_.output->Resize(out_dims); + if (param_.norm_before) { + param_.output->Resize({batch_size, 1, head_num}); + } else { + param_.output->Resize(out_dims); + } } else { param_.output->Resize({batch_size, seq_len, head_num}); } @@ -69,7 +73,7 @@ bool XPUMultiEncoderOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { param_.input = const_cast( &scope->FindVar(op_desc.Input("Input").front())->Get()); - param_.fc_weight_max = const_cast( + param_.weight_max = const_cast( &scope->FindVar(op_desc.Input("FCWeightMax").front()) ->Get()); param_.output = scope->FindVar(op_desc.Output("Output").front()) @@ -141,9 +145,12 @@ bool XPUMultiEncoderOp::AttachImpl(const cpp::OpDesc& op_desc, param_.enable_qkv_fusion = op_desc.GetAttr("enable_qkv_fusion"); param_.norm_before = op_desc.GetAttr("norm_before"); param_.adaptive_seqlen = op_desc.GetAttr("adaptive_seqlen"); + param_.per_channel = op_desc.GetAttr("per_channel"); + if (param_.per_channel) { + param_.fc_channels = op_desc.GetAttr>("fc_channels"); + } if (op_desc.HasAttr("enable_int8") && op_desc.GetAttr("enable_int8")) { param_.input_max = op_desc.GetAttr>("FCInputMax"); - param_.weight_max = op_desc.GetAttr>("FCWeightMax"); } if (op_desc.HasAttr("slice_axes")) { diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index bcbf480b564..f7028f9caba 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -1711,6 +1711,7 @@ struct XPUBlockFuseParam : ParamBase { struct XPUMultiEncoderParam : ParamBase { lite::Tensor* input{}; + const lite::Tensor* weight_max{nullptr}; std::vector fc_weight; std::vector fc_bias; std::vector ln_scale; @@ -1726,7 +1727,6 @@ struct XPUMultiEncoderParam : ParamBase { std::vector slice_ends{}; std::vector slice_decrease_axis{}; std::vector input_max{}; - std::vector weight_max{}; int n_layers{}; int head_num{}; int size_per_head{}; @@ -1736,6 +1736,8 @@ struct XPUMultiEncoderParam : ParamBase { bool enable_qkv_fusion{false}; bool norm_before{false}; bool adaptive_seqlen{false}; + bool per_channel{false}; + std::vector fc_channels{}; }; struct XPUEmbeddingWithEltwiseAddParam : ParamBase { @@ -1760,10 +1762,14 @@ struct XPUFcParam : ParamBase { int act_type; float act_param; float quant_input_max{0.f}; - float quant_w_max{0.f}; + std::vector weight_max{}; std::string precision{}; bool has_bias{false}; int in_num_col_dims{1}; + bool transpose_x{false}; + bool transpose_w{true}; + bool enable_int8{false}; + bool per_channel{false}; }; struct XPUResNetCbamParam : ParamBase { diff --git a/lite/operators/tile_op.cc b/lite/operators/tile_op.cc index 042afa692df..45d3c74e5fe 100644 --- a/lite/operators/tile_op.cc +++ b/lite/operators/tile_op.cc @@ -118,6 +118,7 @@ bool TileOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { } else if (opdesc.HasInput("repeat_times_tensor") && (opdesc.Input("repeat_times_tensor").size() != 0)) { auto temp = opdesc.Input("repeat_times_tensor"); + param_.repeat_times_tensor.clear(); for (auto var : temp) { param_.repeat_times_tensor.push_back( scope->FindVar(var)->GetMutable()); diff --git a/lite/tests/api/test_resnet50_fp32_baidu_xpu.cc b/lite/tests/api/test_resnet50_fp32_baidu_xpu.cc index 93b4308f102..d54c7088452 100644 --- 
a/lite/tests/api/test_resnet50_fp32_baidu_xpu.cc +++ b/lite/tests/api/test_resnet50_fp32_baidu_xpu.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include "lite/api/paddle_api.h" #include "lite/api/paddle_use_kernels.h" @@ -35,13 +36,13 @@ namespace paddle { namespace lite { TEST(resnet50, test_resnet50_fp32_baidu_xpu) { + setenv("XPU_CONV_AUTOTUNE", "5", 1); lite_api::CxxConfig config; config.set_model_dir(FLAGS_model_dir); config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); config.set_xpu_l3_cache_method(16773120, false); - config.set_xpu_conv_autotune(true); auto predictor = lite_api::CreatePaddlePredictor(config); std::string raw_data_dir = FLAGS_data_dir + std::string("/raw_data"); diff --git a/lite/tests/kernels/tile_compute_test.cc b/lite/tests/kernels/tile_compute_test.cc index 5bf48aa880c..07e11039a12 100644 --- a/lite/tests/kernels/tile_compute_test.cc +++ b/lite/tests/kernels/tile_compute_test.cc @@ -199,6 +199,9 @@ TEST(tile, precision) { #else return; #endif +#elif defined(LITE_WITH_XPU) + place = TARGET(kXPU); + alias = "def"; #elif defined(LITE_WITH_ARM) || defined(LITE_WITH_X86) place = TARGET(kHost); #else
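With set_xpu_conv_autotune() deprecated, the updated resnet50 test above drives conv autotune through the XPU_CONV_AUTOTUNE environment variable and otherwise keeps the usual CxxConfig flow. The sketch below is illustrative only and not part of this patch: it combines the calls shown in the test with the kXPU/kFP16 kernel registrations added above; the helper name RunXpuResnet50Fp16 and the assumption that listing the kFP16 place first makes the kernel picker prefer the FP16 kernels are mine, not an API documented here.

#include <cstdlib>  // setenv
#include <string>
#include "lite/api/paddle_api.h"

// Illustrative sketch (assumption): run a model on XPU with env-var driven
// conv autotune and the FP16 kernels registered in this patch.
void RunXpuResnet50Fp16(const std::string& model_dir) {
  // Replaces the deprecated CxxConfig::set_xpu_conv_autotune() call.
  setenv("XPU_CONV_AUTOTUNE", "5", 1);

  paddle::lite_api::CxxConfig config;
  config.set_model_dir(model_dir);
  // kFP16 listed first (assumed to be preferred where FP16 kernels exist),
  // with kFloat places kept as the fallback.
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFP16)},
       paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
       paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
       paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
  config.set_xpu_l3_cache_method(16773120, false);

  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  // Fill inputs and call predictor->Run() as in the test above.
}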