[XPU] add dynreshape for MaskOCR, add fp16 for bn, add gelu for __xpu… #9942

Merged · 2 commits · Jan 31, 2023
Changes from 1 commit
2 changes: 2 additions & 0 deletions lite/core/optimizer/mir/fusion/__xpu__conv2d_fuse_pass.cc
@@ -436,6 +436,7 @@ class XPUConv2dFuser : public FuseBase {
{"relu", 1},
{"sigmoid", 2},
{"tanh", 3},
{"gelu", 4},
{"leaky_relu", 5},
{"hard_swish", 14},
{"hard_sigmoid", 15},
@@ -603,6 +604,7 @@ class XPUConv2dFusePass : public ProgramPass {
for (auto act_type : {"relu",
"sigmoid",
"tanh",
"gelu",
"leaky_relu",
"hard_swish",
"hard_sigmoid",
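Both hunks register the same new activation. As a minimal sketch of the mapping this extends (the helper name is hypothetical; the ids mirror the table inside XPUConv2dFuser, and "gelu" now resolves to id 4):

#include <map>
#include <string>

// Hypothetical helper illustrating the fuser's act-type lookup after this
// change; only the entries visible in the hunk above are included.
int ActTypeToXpuActId(const std::string& act_type) {
  static const std::map<std::string, int> act_map{{"relu", 1},
                                                  {"sigmoid", 2},
                                                  {"tanh", 3},
                                                  {"gelu", 4},
                                                  {"leaky_relu", 5},
                                                  {"hard_swish", 14},
                                                  {"hard_sigmoid", 15}};
  auto it = act_map.find(act_type);
  return it == act_map.end() ? -1 : it->second;  // -1: unsupported act type
}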
197 changes: 165 additions & 32 deletions lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc
@@ -945,14 +945,16 @@ class XPUSingleEncoderV2Fuser : public FuseBase {
const std::string& matmul_type = "matmul",
const std::string& mul_type = "mul",
bool with_fusion_qkv_bias = false,
bool norm_before = false)
bool norm_before = false,
bool with_dyn_reshape = false)
: act_type_(act_type),
input_pos_(input_pos),
qkv_ln_2_out_pos_(qkv_ln_2_out_pos),
matmul_type_(matmul_type),
mul_type_(mul_type),
with_fusion_qkv_bias_(with_fusion_qkv_bias),
norm_before_(norm_before) {}
norm_before_(norm_before),
with_dyn_reshape_(with_dyn_reshape) {}

void BuildPattern() override {
PMNode* ln_before_scale = nullptr;
@@ -1008,6 +1010,107 @@ class XPUSingleEncoderV2Fuser : public FuseBase {
->assert_is_op_output("elementwise_add", "Out")
->AsIntermediate();
}
// dyn reshape
PMNode* shape = nullptr;
PMNode* shape_out = nullptr;
PMNode* shape_slice = nullptr;
PMNode* shape_slice_out = nullptr;
PMNode* fill_constant1 = nullptr;
PMNode* fill_constant1_out = nullptr;
PMNode* fill_constant2 = nullptr;
PMNode* fill_constant2_out = nullptr;
PMNode* fill_constant3 = nullptr;
PMNode* fill_constant3_out = nullptr;
PMNode* fill_constant4 = nullptr;
PMNode* fill_constant4_out = nullptr;

PMNode* fill_constant5 = nullptr;
PMNode* fill_constant5_out = nullptr;
PMNode* fill_constant6 = nullptr;
PMNode* fill_constant6_out = nullptr;

if (with_dyn_reshape_) {
shape = OpNode("shape", "shape")->AsIntermediate();
shape_out = VarNode("shape_out")
->assert_is_op_output("shape", "Out")
->assert_is_op_input("slice", "Input")
->AsIntermediate();

shape_slice = OpNode("shape_slice", "slice")
->assert_op_attr_satisfied<std::vector<int>>(
"axes",
[](const std::vector<int>& attr) {
return attr.size() == 1 && attr[0] == 0;
})
->assert_op_attr_satisfied<std::vector<int>>(
"starts",
[](const std::vector<int>& attr) {
return attr.size() == 1 && attr[0] == 1;
})
->assert_op_attr_satisfied<std::vector<int>>(
"ends",
[](const std::vector<int>& attr) {
return attr.size() == 1 && attr[0] == 2;
})
->AsIntermediate();
shape_slice_out =
VarNode("shape_slice_out")
->assert_is_op_output("slice", "Out")
->assert_is_op_nth_input("reshape2", "ShapeTensor", 1)
->AsIntermediate();

fill_constant1 = OpNode("fill_constant1", "fill_constant")
->assert_op_attr_satisfied<float>(
"value", [](float value) { return value == -1; })
->AsIntermediate();

fill_constant1_out =
VarNode("fill_constant1_out")
->assert_is_op_output("fill_constant", "Out")
->assert_is_op_nth_input("reshape2", "ShapeTensor", 0)
->AsIntermediate();
fill_constant2 = OpNode("fill_constant2", "fill_constant")
->assert_op_attr_satisfied<float>(
"value", [](float value) { return value == 3; })
->AsIntermediate();

fill_constant2_out =
VarNode("fill_constant2_out")
->assert_is_op_output("fill_constant", "Out")
->assert_is_op_nth_input("reshape2", "ShapeTensor", 2)
->AsIntermediate();
fill_constant3 =
OpNode("fill_constant3", "fill_constant")->AsIntermediate();
fill_constant3_out =
VarNode("fill_constant3_out")
->assert_is_op_output("fill_constant", "Out")
->assert_is_op_nth_input("reshape2", "ShapeTensor", 3)
->AsIntermediate();
fill_constant4 =
OpNode("fill_constant4", "fill_constant")->AsIntermediate();
fill_constant4_out =
VarNode("fill_constant4_out")
->assert_is_op_output("fill_constant", "Out")
->assert_is_op_nth_input("reshape2", "ShapeTensor", 4)
->AsIntermediate();

fill_constant5 = OpNode("fill_constant5", "fill_constant")
->assert_op_attr_satisfied<float>(
"value", [](float value) { return value == -1; })
->AsIntermediate();
fill_constant5_out =
VarNode("fill_constant5_out")
->assert_is_op_output("fill_constant", "Out")
->assert_is_op_nth_input("reshape2", "ShapeTensor", 0)
->AsIntermediate();
fill_constant6 =
OpNode("fill_constant6", "fill_constant")->AsIntermediate();
fill_constant6_out =
VarNode("fill_constant6_out")
->assert_is_op_output("fill_constant", "Out")
->assert_is_op_nth_input("reshape2", "ShapeTensor", 2)
->AsIntermediate();
}
// reshape2
auto* fc_qkv_reshape2 =
OpNode("fc_qkv_reshape2", "reshape2")->AsIntermediate();
@@ -1270,22 +1373,41 @@ class XPUSingleEncoderV2Fuser : public FuseBase {
}

// use links here
// ln, qkv
// ln
if (norm_before_) {
ln_before->LinksFrom({input, ln_before_bias, ln_before_scale})
.LinksTo({ln_before_out, ln_before_mean, ln_before_var});
fc_qkv->LinksFrom({ln_before_out, fc_qkv_y}).LinksTo({fc_qkv_out});
} else {
fc_qkv->LinksFrom({input, fc_qkv_y}).LinksTo({fc_qkv_out});
ln_before_out = input;
}
// fusion_qkv
fc_qkv->LinksFrom({ln_before_out, fc_qkv_y}).LinksTo({fc_qkv_out});
// bias and reshape
if (with_fusion_qkv_bias_) {
fc_qkv_add->LinksFrom({fc_qkv_out, fc_qkv_add_y})
.LinksTo({fc_qkv_add_out});
fc_qkv_reshape2->LinksFrom({fc_qkv_add_out})
} else {
fc_qkv_add_out = fc_qkv_out;
}

if (with_dyn_reshape_) {
shape->LinksFrom({ln_before_out}).LinksTo({shape_out});
shape_slice->LinksFrom({shape_out}).LinksTo({shape_slice_out});

*fill_constant1 >> *fill_constant1_out >> *fc_qkv_reshape2;
*fill_constant2 >> *fill_constant2_out >> *fc_qkv_reshape2;
*fill_constant3 >> *fill_constant3_out >> *fc_qkv_reshape2;
*fill_constant4 >> *fill_constant4_out >> *fc_qkv_reshape2;
fc_qkv_reshape2
->LinksFrom({fc_qkv_add_out,
fill_constant1_out,
shape_slice_out,
fill_constant2_out,
fill_constant3_out,
fill_constant4_out})
.LinksTo({fc_qkv_reshape2_out, fc_qkv_reshape2_xshape});
} else {
fc_qkv_reshape2->LinksFrom({fc_qkv_out})
fc_qkv_reshape2->LinksFrom({fc_qkv_add_out})
.LinksTo({fc_qkv_reshape2_out, fc_qkv_reshape2_xshape});
}
// transpose
@@ -1307,8 +1429,20 @@
.LinksTo({qkv_matmul_out});
qkv_transpose2->LinksFrom({qkv_matmul_out})
.LinksTo({qkv_transpose2_out, qkv_transpose2_xshape});
qkv_reshape2->LinksFrom({qkv_transpose2_out})
.LinksTo({qkv_reshape2_out, qkv_reshape2_xshape});

if (with_dyn_reshape_) {
*fill_constant5 >> *fill_constant5_out >> *qkv_reshape2;
*fill_constant6 >> *fill_constant6_out >> *qkv_reshape2;
qkv_reshape2
->LinksFrom({qkv_transpose2_out,
fill_constant5_out,
shape_slice_out,
fill_constant6_out})
.LinksTo({qkv_reshape2_out, qkv_reshape2_xshape});
} else {
qkv_reshape2->LinksFrom({qkv_transpose2_out})
.LinksTo({qkv_reshape2_out, qkv_reshape2_xshape});
}
// qkv_fc1
qkv_mul->LinksFrom({qkv_reshape2_out, qkv_mul_y}).LinksTo({qkv_mul_out});
qkv_add->LinksFrom({qkv_mul_out, qkv_add_y}).LinksTo({qkv_add_out});
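A companion sketch for qkv_reshape2 in the dynamic-reshape case, which flattens the attention output back to three dimensions (hidden_dim is an illustrative name; fill_constant6 carries no value assertion):

#include <cstdint>
#include <vector>

// Runtime target shape for qkv_reshape2's ShapeTensor inputs; slot 1
// reuses the same shape_slice_out sequence length as above.
std::vector<int64_t> QkvReshapeTarget(int64_t seq_len, int64_t hidden_dim) {
  return {-1,           // slot 0: fill_constant5, asserted value == -1
          seq_len,      // slot 1: shape_slice_out
          hidden_dim};  // slot 2: fill_constant6 (value unconstrained)
}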
@@ -1505,6 +1639,7 @@ class XPUSingleEncoderV2Fuser : public FuseBase {
std::string mul_type_;
bool with_fusion_qkv_bias_;
bool norm_before_;
bool with_dyn_reshape_;
// quant_info: mul input_max, output_max * 6 + matmul x_max:y_max, output_max
void set_quant_info(Scope* scope,
const key2nodes_t& matched,
@@ -1517,8 +1652,8 @@ class XPUSingleEncoderV2Fuser : public FuseBase {
"qkv_mul_4"};
const std::vector<std::string> matmul_ops = {"qk_matmul", "qkv_matmul"};

bool mul_quant = false;
bool matmul_quant = false;
// bool mul_quant = false;
// bool matmul_quant = false;
const int ops_size = quant_mul_ops.size() + matmul_ops.size();
std::vector<std::string> op_quant_types(ops_size, "not_quantized");
std::vector<std::string> weight_max_tensor_name(quant_mul_ops.size());
@@ -1532,21 +1667,16 @@ class XPUSingleEncoderV2Fuser : public FuseBase {
get_weight_max_tensor_name(fc_weight_names[i]);
auto op_info = matched.at(quant_mul_ops[i])->stmt()->op_info();
if (is_int8_quantized_op(op_info) || is_int16_quantized_op(op_info)) {
mul_quant = true;
break;
CHECK(false) << "mul quantized will be supported later";
}
}
for (size_t i = 0; i < matmul_ops.size(); ++i) {
auto op_info = matched.at(matmul_ops[i])->stmt()->op_info();
if (is_int8_quantized_op(op_info) || is_int16_quantized_op(op_info)) {
matmul_quant = true;
break;
CHECK(false) << "matmul quantized will be supported later";
}
}
// quant is not supported in XPUSingleEncoderV2Fuser
if (mul_quant || matmul_quant) {
CHECK(false) << "mul matmul quantized will be supported later";
}
op_desc->SetAttr<std::vector<std::string>>("quant_types", op_quant_types);
op_desc->SetAttr<std::vector<std::string>>("Y0_max",
weight_max_tensor_name);
@@ -2153,20 +2283,23 @@ class XPUMultiEncoderFusePass : public ProgramPass {
for (auto& matmul_type : matmul_types) {
for (auto& mul_type : mul_types) {
for (auto& fusion_qkv_bias : {true, false}) {
for (auto norm_before : {true}) {
fusion::XPUSingleEncoderV2Fuser single_encoder_fuser(
act_type,
input_pos,
qkv_ln_2_out_pos,
matmul_type,
mul_type,
fusion_qkv_bias,
norm_before);
single_encoder_fuser(graph.get());

fusion::XPUMultiEncoderFuser multi_encoder_fuser(
fc_precision, adaptive_seqlen, true);
multi_encoder_fuser(graph.get());
for (auto& with_dyn_reshape : {true, false}) {
for (auto norm_before : {true}) {
fusion::XPUSingleEncoderV2Fuser single_encoder_fuser(
act_type,
input_pos,
qkv_ln_2_out_pos,
matmul_type,
mul_type,
fusion_qkv_bias,
norm_before,
with_dyn_reshape);
single_encoder_fuser(graph.get());

fusion::XPUMultiEncoderFuser multi_encoder_fuser(
fc_precision, adaptive_seqlen, true);
multi_encoder_fuser(graph.get());
}
}
}
}
55 changes: 35 additions & 20 deletions lite/kernels/xpu/batch_norm_compute.cc
@@ -22,7 +22,8 @@ namespace lite {
namespace kernels {
namespace xpu {

void BatchNormCompute::Run() {
template <class T, PrecisionType PType>
void BatchNormCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();
float epsilon = param.epsilon;
@@ -37,19 +38,19 @@ void BatchNormCompute::Run() {
}

int r =
xdnn::batch_norm_infer<float>(ctx.GetRawContext(),
param.x->data<float>(),
param.y->mutable_data<float>(TARGET(kXPU)),
x_shape[0],
x_shape[1],
x_shape[2],
x_shape[3],
epsilon,
param.scale->data<float>(),
param.bias->data<float>(),
param.mean->data<float>(),
param.variance->data<float>(),
true);
xdnn::batch_norm_infer<T>(ctx.GetRawContext(),
param.x->template data<T>(),
param.y->template mutable_data<T>(TARGET(kXPU)),
x_shape[0],
x_shape[1],
x_shape[2],
x_shape[3],
epsilon,
param.scale->template data<float>(),
param.bias->template data<float>(),
param.mean->template data<float>(),
param.variance->template data<float>(),
true);

CHECK_EQ(r, 0);
}
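Note that Run's definition now sits in the .cc behind a template, so it is the REGISTER_LITE_KERNEL statements below that trigger instantiation. An equivalent explicit form (not part of this patch, shown only to make the template usage concrete) would be:

// Hypothetical explicit instantiations; the kernel registrations below
// make these unnecessary because they reside in the same translation unit.
template class paddle::lite::kernels::xpu::BatchNormCompute<float,
                                                            PRECISION(kFloat)>;
template class paddle::lite::kernels::xpu::BatchNormCompute<float16,
                                                            PRECISION(kFP16)>;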
Expand All @@ -59,12 +60,12 @@ void BatchNormCompute::Run() {
} // namespace lite
} // namespace paddle

REGISTER_LITE_KERNEL(batch_norm,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::BatchNormCompute,
def)
namespace xpu = paddle::lite::kernels::xpu;

using BatchNorm_FP32 = xpu::BatchNormCompute<float, PRECISION(kFloat)>;
using BatchNorm_FP16 = xpu::BatchNormCompute<float16, PRECISION(kFP16)>;

REGISTER_LITE_KERNEL(batch_norm, kXPU, kFloat, kNCHW, BatchNorm_FP32, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
@@ -76,3 +77,17 @@ REGISTER_LITE_KERNEL(batch_norm,
.BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();

REGISTER_LITE_KERNEL(
batch_norm, kXPU, kFP16, kNCHW, BatchNorm_FP16, DISABLE_XPU1_fp16)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Mean", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("Variance", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
.BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
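A minimal usage sketch, assuming the standard Paddle Lite CxxConfig flow (the header path and exact place list vary by build): listing the kXPU/kFP16 place first lets the optimizer pick the FP16 batch_norm kernel registered above when the surrounding graph runs in FP16.

#include "paddle_api.h"  // Paddle Lite C++ API header; path may differ

void SetXpuFp16Places(paddle::lite_api::CxxConfig* config) {
  config->set_valid_places({
      paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFP16)},
      paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
  });
}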
3 changes: 2 additions & 1 deletion lite/kernels/xpu/batch_norm_compute.h
@@ -21,7 +21,8 @@ namespace lite {
namespace kernels {
namespace xpu {

class BatchNormCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <class T, PrecisionType PType>
class BatchNormCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::BatchNormParam;
