[XPU][cherry-pick] Support K200 per-channel quant & R200 mul/matmul quant
* [XPU] Fixed the tile op bug with large inputs and added an XPU implementation. (#9102)

* [XPU] Fixed a bug where reshape2's output was reused in xpu_memory_optimize_pass (#9178)

* [x86][XPU] Added TensorArray support for slice on x86 and XPU (#9134)

* [XPU] Fixed the stack op binding error for float. (#9204)

* [XPU] Stopped supporting XPU conv autotune config through the Paddle Lite C++ API. (#9316)

* [XPU] Support pre-LN encoder (#9159)

* [XPU] Support fp16 data precision (#9080)

* [XPU] Deleted the kernel.precision() == float check (#9189)

* [XPU] Support fp16 data precision (#9228)

* [XPU] Support fc per-channel quant (#9323)

Co-authored-by: wbn <66299196+wbn520@users.noreply.github.com>
Co-authored-by: Jinchen Han <zealoct@hotmail.com>
Co-authored-by: TingShenXD <99321958+TingShenXD@users.noreply.github.com>
Co-authored-by: quwei03 <32065370+xiuxin121@users.noreply.github.com>
5 people committed Aug 29, 2022
1 parent 8f09eb2 commit 7ca3e00
Showing 59 changed files with 2,888 additions and 724 deletions.
2 changes: 1 addition & 1 deletion cmake/backends/xpu.cmake
@@ -23,7 +23,7 @@ set (XPU_DOWNLOAD_DIR ${XPU_SOURCE_DIR}/download)
set (XPU_INSTALL_DIR ${THIRD_PARTY_PATH}/install)

if (NOT XPU_SDK_URL)
set (XPU_SDK_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20220519")
set (XPU_SDK_URL "https://baidu-kunlun-product.su.bcebos.com/klx-sdk/search/20220825")
endif ()

if (NOT XPU_SDK_ENV)
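Because the default URL is guarded by if (NOT XPU_SDK_URL), a build can still pin a different SDK snapshot by defining the cache variable at configure time. A minimal sketch, assuming a hypothetical mirror URL:

    cmake .. -DXPU_SDK_URL="https://your-mirror.example.com/klx-sdk/search/20220825"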
8 changes: 6 additions & 2 deletions lite/api/paddle_api.cc
@@ -626,8 +626,12 @@ void CxxConfig::set_xpu_multi_encoder_method(const std::string &precision,
void CxxConfig::set_xpu_conv_autotune(bool autotune,
const std::string &autotune_file) {
#ifdef LITE_WITH_XPU
-lite::TargetWrapperXPU::conv_autotune = autotune;
-lite::TargetWrapperXPU::conv_autotune_file = autotune_file;
+LOG(WARNING)
+<< "This function "
+"'set_xpu_conv_autotune' is deprecated, "
+"if you want to use autotune, please refer to "
+"http://agroup.baidu.com/share/md/f9233d84df11452488a1fdd4f859647f";
+
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_conv_autotune' is ignored, please "
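For context, a minimal caller-side sketch of the now-deprecated path; the config flow follows the public Paddle Lite C++ API shown above, and the file name is illustrative:

    #include "paddle_api.h"

    void ConfigureXpu() {
      paddle::lite_api::CxxConfig config;
      // After this commit the flags are no longer forwarded to TargetWrapperXPU;
      // the call only emits the deprecation warning shown above.
      config.set_xpu_conv_autotune(true, "conv_autotune.conf");
    }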
2 changes: 2 additions & 0 deletions lite/api/paddle_api.h
@@ -472,6 +472,8 @@ class LITE_API CxxConfig : public ConfigBase {

void set_xpu_gm_workspace_method(size_t gm_size);

+// **DEPRECATED**, use environment variable to enable autotune
+// check http://agroup.baidu.com/share/md/f9233d84df11452488a1fdd4f859647f
void set_xpu_conv_autotune(bool autotune = true,
const std::string& autotune_file = "");

1 change: 1 addition & 0 deletions lite/api/paddle_use_passes.h
@@ -107,6 +107,7 @@ USE_MIR_PASS(__xpu__bigru_fuse_pass);
USE_MIR_PASS(__xpu__dynamic_lstm_fuse_pass);
USE_MIR_PASS(__xpu__multi_softmax_fuse_pass);
USE_MIR_PASS(__xpu__max_pooling_pad_zero_detect_fuse_pass);
+USE_MIR_PASS(__xpu__static_kernel_pick_pass);
USE_MIR_PASS(x86_int8_attribute_pass);
USE_MIR_PASS(fill_range_fuse_pass);
USE_MIR_PASS(range_calc_offline_pass);
19 changes: 10 additions & 9 deletions lite/backends/xpu/target_wrapper.cc
@@ -43,9 +43,13 @@ void TargetWrapperXPU::MemcpySync(void* dst,

template <typename Tcpu, typename Txpu>
XPUQuantData TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight(
-const Tcpu* cpu_data, const DDimLite& dims, bool data_transpose) {
+const Tcpu* cpu_data,
+const DDimLite& dims,
+bool data_transpose,
+size_t max_ptr_len) {
CHECK(quantizer_.get());
-return quantizer_->quant<Tcpu, Txpu>(cpu_data, dims, data_transpose);
+return quantizer_->quant<Tcpu, Txpu>(
+cpu_data, dims, data_transpose, max_ptr_len);
}

void TargetWrapperXPU::ScatterL3Cache(
@@ -145,16 +149,16 @@ void TargetWrapperXPU::FreeL3Cache() {

template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, float>(
-const float*, const DDimLite&, bool);
+const float*, const DDimLite&, bool, size_t);
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, int16_t>(
-const float*, const DDimLite&, bool);
+const float*, const DDimLite&, bool, size_t);
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, int8_t>(
-const float*, const DDimLite&, bool);
+const float*, const DDimLite&, bool, size_t);
template XPUQuantData
TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<int8_t, int8_t>(
-const int8_t*, const DDimLite&, bool);
+const int8_t*, const DDimLite&, bool, size_t);

// xpu context
LITE_THREAD_LOCAL std::shared_ptr<xdnn::Context> TargetWrapperXPU::tls_raw_ctx_{
@@ -165,9 +169,6 @@ LITE_THREAD_LOCAL std::shared_ptr<void> TargetWrapperXPU::xpu_stream_{nullptr};
LITE_THREAD_LOCAL std::string
TargetWrapperXPU::multi_encoder_precision; // NOLINT
LITE_THREAD_LOCAL bool TargetWrapperXPU::multi_encoder_adaptive_seqlen{false};
-// conv autotune config
-LITE_THREAD_LOCAL bool TargetWrapperXPU::conv_autotune{false};
-LITE_THREAD_LOCAL std::string TargetWrapperXPU::conv_autotune_file;
// l3 cache config
LITE_THREAD_LOCAL bool TargetWrapperXPU::need_l3_mutex{false};
LITE_THREAD_LOCAL size_t TargetWrapperXPU::local_l3_size{
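Every instantiation now threads the max-pointer length through explicitly instead of letting the quantizer query it internally. A hedged sketch of an updated call site, with illustrative shapes and namespaces assumed from the surrounding code:

    // Illustrative only: quantize a 256x128 FP32 weight to int16 for XPU.
    std::vector<float> cpu_weight(256 * 128, 0.5f);
    paddle::lite::DDimLite dims(std::vector<int64_t>({256, 128}));
    size_t max_ptr_len = paddle::lite::XPUMemory::get_max_ptr_size();
    paddle::lite::XPUQuantData qd =
        paddle::lite::TargetWrapperXPU::ConvertCPUWeightToXPUQuantWeight<float, int16_t>(
            cpu_weight.data(), dims, /*data_transpose=*/false, max_ptr_len);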
14 changes: 2 additions & 12 deletions lite/backends/xpu/target_wrapper.h
@@ -69,7 +69,8 @@ class TargetWrapper<TARGET(kXPU)> {
template <typename Tcpu, typename Txpu>
static XPUQuantData ConvertCPUWeightToXPUQuantWeight(const Tcpu* cpu_data,
const DDimLite& dims,
-bool data_transpose);
+bool data_transpose,
+size_t max_ptr_len);

static xdnn::Context* GetRawContext() {
if (tls_raw_ctx_.get() == nullptr) {
@@ -111,14 +112,6 @@ class TargetWrapper<TARGET(kXPU)> {
quantizer_.reset(new XPUQuantizer());
}
CHECK(quantizer_.get());
-if (conv_autotune) {
-tls_raw_ctx_->_xpu1_conv_selector.set_autotune_loop(true);
-tls_raw_ctx_->_xpu1_conv_selector.set_inference_mode(true);
-}
-if (!conv_autotune_file.empty()) {
-tls_raw_ctx_->_xpu1_conv_selector.set_autotune_file(
-conv_autotune_file.c_str());
-}
int devid = -1;
uint64_t max_l3_size = 0;
XPU_CALL(xpu_current_device(&devid));
@@ -173,9 +166,6 @@ class TargetWrapper<TARGET(kXPU)> {
// multi encoder config
static LITE_THREAD_LOCAL std::string multi_encoder_precision; // NOLINT
static LITE_THREAD_LOCAL bool multi_encoder_adaptive_seqlen;
-// conv autotune config
-static LITE_THREAD_LOCAL bool conv_autotune;
-static LITE_THREAD_LOCAL std::string conv_autotune_file; // NOLINT
// l3 cache config
static LITE_THREAD_LOCAL bool need_l3_mutex; // model level l3 size
static LITE_THREAD_LOCAL size_t local_l3_size; // model level l3 size
34 changes: 22 additions & 12 deletions lite/backends/xpu/xpu_quantizer.cc
@@ -112,7 +112,8 @@ template <
void XPUQuantizer::ConvertWithQuant(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key) {
+size_t hashed_key,
+size_t max_ptr_len) {
LOG(FATAL) << "Not support for Tcpu is " << CppTypeToString<Tcpu>();
}

@@ -123,7 +124,8 @@ template <
void XPUQuantizer::ConvertWithQuant(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key) {
+size_t hashed_key,
+size_t max_ptr_len) {
// transpose
const Tcpu* cpu_ptr = nullptr;
int numel = dims.production();
@@ -140,7 +142,7 @@ void XPUQuantizer::ConvertWithQuant(const Tcpu* cpu_data,
XPUScratchPadGuard weight_max_guard;
XPUScratchPadGuard quant_weight_guard;
float max_val = paddle::lite::xpu::math::FindMaxAbs(cpu_ptr, numel);
-int max_ptr_size = XPUMemory::get_max_ptr_size();
+size_t max_ptr_size = max_ptr_len;
std::vector<float> max_vec(max_ptr_size, max_val);
weight_max_guard =
std::move(XPUMemory::MallocScratchPad(max_ptr_size * sizeof(float)));
@@ -162,11 +164,12 @@ template <typename T>
void XPUQuantizer::ConvertWithoutQuant(const T* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key) {
+size_t hashed_key,
+size_t max_ptr_len) {
// transpose
const T* cpu_ptr = nullptr;
int numel = dims.production();
-int max_ptr_size = XPUMemory::get_max_ptr_size();
+size_t max_ptr_size = max_ptr_len;
std::vector<T> transpose_data(numel, 0);
if (data_transpose) {
CHECK(dims.size() == 2) << "Not support: dims.size = " << dims.size();
@@ -178,8 +181,9 @@ void XPUQuantizer::ConvertWithoutQuant(const T* cpu_data,
}
// copy to XPU
XPUScratchPadGuard weight_max_guard(new XPUScratchPad(nullptr, 0));
-if (std::is_same<T, int8_t>::value) {
+if (std::is_same<T, int8_t>::value || std::is_same<T, int16_t>::value) {
// prepare max_w space for slim int8 quant
+// just allocate buffer, set max value in kernel
weight_max_guard =
std::move(XPUMemory::MallocScratchPad(max_ptr_size * sizeof(float)));
}
@@ -196,7 +200,8 @@ void XPUQuantizer::ConvertWithoutQuant(const T* cpu_data,
template <typename Tcpu, typename Txpu>
XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data,
const DDimLite& dims,
-bool data_transpose) {
+bool data_transpose,
+size_t max_ptr_len) {
int numel = dims.production();
const std::string cpu_dtype = CppTypeToString<Tcpu>();
const std::string xpu_dtype = CppTypeToString<Txpu>();
@@ -206,7 +211,8 @@ XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data,
<< ", precision=" << precision << ", transpose=" << data_transpose
<< ", hashed_key=" << hashed_key;
if (weight_cache_.find(hashed_key) == weight_cache_.end()) {
-ConvertWrapper<Tcpu, Txpu>(cpu_data, dims, data_transpose, hashed_key);
+ConvertWrapper<Tcpu, Txpu>(
+cpu_data, dims, data_transpose, hashed_key, max_ptr_len);
}

float* max_ptr =
@@ -218,15 +224,19 @@ XPUQuantData XPUQuantizer::quant(const Tcpu* cpu_data,

template XPUQuantData XPUQuantizer::quant<float, float>(const float*,
const DDimLite&,
-bool);
+bool,
+size_t);
template XPUQuantData XPUQuantizer::quant<float, int16_t>(const float*,
const DDimLite&,
-bool);
+bool,
+size_t);
template XPUQuantData XPUQuantizer::quant<float, int8_t>(const float*,
const DDimLite&,
-bool);
+bool,
+size_t);
template XPUQuantData XPUQuantizer::quant<int8_t, int8_t>(const int8_t*,
const DDimLite&,
-bool);
+bool,
+size_t);
} // namespace lite
} // namespace paddle
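For intuition, ConvertWithQuant computes one abs-max over the whole tensor and replicates it max_ptr_len times so each max-pointer slot holds its own copy. A self-contained sketch of that layout, with FindMaxAbs paraphrased rather than the real helper:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Builds the replicated max buffer uploaded alongside the quantized
    // weights: every slot holds the same tensor-wide absolute maximum.
    std::vector<float> BuildMaxVec(const float* data, int numel, size_t max_ptr_len) {
      float max_val = 0.0f;
      for (int i = 0; i < numel; ++i) {
        max_val = std::max(max_val, std::fabs(data[i]));
      }
      return std::vector<float>(max_ptr_len, max_val);
    }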
24 changes: 16 additions & 8 deletions lite/backends/xpu/xpu_quantizer.h
@@ -36,7 +36,8 @@ class XPUQuantizer {
template <typename Tcpu, typename Txpu>
XPUQuantData quant(const Tcpu* cpu_data,
const DDimLite& dims,
-bool data_transpose);
+bool data_transpose,
+size_t max_ptr_len);

private:
template <typename T>
@@ -49,7 +50,8 @@ class XPUQuantizer {
void ConvertWithoutQuant(const T* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key);
+size_t hashed_key,
+size_t max_ptr_len);

template <typename Tcpu,
typename Txpu,
@@ -58,7 +60,8 @@ class XPUQuantizer {
void ConvertWithQuant(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key);
+size_t hashed_key,
+size_t max_ptr_len);

template <typename Tcpu,
typename Txpu,
@@ -67,7 +70,8 @@ class XPUQuantizer {
void ConvertWithQuant(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key);
+size_t hashed_key,
+size_t max_ptr_len);

template <typename Tcpu,
typename Txpu,
@@ -76,8 +80,10 @@ void ConvertWrapper(const Tcpu* cpu_data,
void ConvertWrapper(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key) {
-ConvertWithQuant<Tcpu, Txpu>(cpu_data, dims, data_transpose, hashed_key);
+size_t hashed_key,
+size_t max_ptr_len) {
+ConvertWithQuant<Tcpu, Txpu>(
+cpu_data, dims, data_transpose, hashed_key, max_ptr_len);
}

template <typename Tcpu,
@@ -87,8 +93,10 @@ void ConvertWrapper(const Tcpu* cpu_data,
void ConvertWrapper(const Tcpu* cpu_data,
const DDimLite& dims,
bool data_transpose,
-size_t hashed_key) {
-ConvertWithoutQuant<Tcpu>(cpu_data, dims, data_transpose, hashed_key);
+size_t hashed_key,
+size_t max_ptr_len) {
+ConvertWithoutQuant<Tcpu>(
+cpu_data, dims, data_transpose, hashed_key, max_ptr_len);
}

// cpu data to xpu quant data
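The two ConvertWrapper overloads are selected at compile time by enable_if predicates carried in the elided template headers above. A generic, hedged sketch of the dispatch pattern; the real predicates are not visible here, so the same-type condition is an assumption for illustration:

    #include <type_traits>

    // Mismatched types (e.g. float -> int16_t): quantization is required,
    // so this overload would forward to the ConvertWithQuant path.
    template <typename Tcpu, typename Txpu,
              typename std::enable_if<!std::is_same<Tcpu, Txpu>::value, int>::type = 0>
    void ConvertWrapperSketch() {}

    // Matching types (e.g. int8_t -> int8_t): data is copied as-is,
    // so this overload would forward to the ConvertWithoutQuant path.
    template <typename Tcpu, typename Txpu,
              typename std::enable_if<std::is_same<Tcpu, Txpu>::value, int>::type = 0>
    void ConvertWrapperSketch() {}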
(Diff of the remaining 51 changed files not shown.)