From 97ea82826d4100a562d8edb30cb9ac3c4b468d0f Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini
Date: Tue, 3 Nov 2020 05:56:33 +0000
Subject: [PATCH] Improve AArch64 depthwise convolution through smlal/smlal2
 intrinsic (#6711)

* Improve depthwise convolution through smlal/smlal2 intrinsic

- Added an intrinsic to load a single int16x8 vector and produce two
  int32x4 output vectors through smlal/smlal2 instructions
- Changed the NHWC depthwise schedule to accommodate the aforementioned
  intrinsic

Change-Id: I347c3bf98fa8dd87057304dcda0d78e558424c57

* Address review comments

* Rebasing - 2

* Rebasing - 3

* Rebasing - 3

* Fix linting
---
 python/tvm/topi/arm_cpu/depthwise_conv2d.py    | 65 ++++++++++++--
 python/tvm/topi/arm_cpu/tensor_intrin.py       | 90 +++++++++++++++++++
 .../topi/python/test_topi_depthwise_conv2d.py  | 54 +++++++++++
 3 files changed, 200 insertions(+), 9 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
index 3c32d3e1f3f2..441b0a5a3688 100644
--- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -25,6 +25,8 @@
 from .. import nn
 from ..utils import traverse_inline, get_const_tuple, get_const_int
 from ..nn.utils import get_pad_tuple
+from .tensor_intrin import smlal_int16_int32
+from .arm_utils import is_aarch64_arm
 
 
 @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu")
@@ -222,7 +224,6 @@ def compute_depthwise_conv2d_nhwc(_, data, kernel, strides, padding, dilation, o
     output : tvm.te.Tensor
         4-D with shape [batch, out_height, out_width, out_channel]
     """
-
     out_dtype = out_dtype or data.dtype
 
     N, IH, IW, IC = get_const_tuple(data.shape)
@@ -288,10 +289,18 @@ def schedule_depthwise_conv2d_nhwc(cfg, outs):
 
     ##### space definition begin #####
     n, h, w, c = s[out].op.axis
+    # Split the number of input/output channels
    cfg.define_split("tile_c", c, num_outputs=2)
+    # Split the height of the convolution
     _, hi = cfg.define_split("tile_h", h, num_outputs=2)
+    # Split the width of the convolution
     _, wi = cfg.define_split("tile_w", w, num_outputs=2)
+    # Additional output (e.g., requantization, bias addition, etc.)
+    # 0: locate the output on the second-to-last axis of the main computation
+    # 1: locate the output closest to the main computation
     cfg.define_knob("locate_output", [0, 1])
+    # Determine if we should unroll the computation of the inner tile
+    cfg.define_knob("unroll_tile", [True, False])
 
     # fallback support
     if cfg.is_fallback:
@@ -299,10 +308,15 @@ def schedule_depthwise_conv2d_nhwc(cfg, outs):
         cfg["tile_h"] = SplitEntity([-1, 2])
         cfg["tile_w"] = SplitEntity([-1, 2])
         cfg["locate_output"] = OtherOptionEntity(1)
+        cfg["unroll_tile"] = OtherOptionEntity(True)
     ##### space definition end #####
 
     def schedule_conv(conv):
         conv_data = conv.op.input_tensors[0]
+        kernel_data = conv.op.input_tensors[1]
+        in_type = conv_data.dtype
+
+        _, _, IC, channel_multiplier = get_const_tuple(kernel_data.shape)
 
         n, w, h, c = conv.op.axis
         r_h, r_w = conv.op.reduce_axis
@@ -310,24 +324,53 @@ def schedule_conv(conv):
         wo, wi = cfg["tile_w"].apply(s, conv, w)
         co, ci = cfg["tile_c"].apply(s, conv, c)
 
+        split_val = cfg["tile_c"].size[-1]
+        use_tensorization = (
+            (in_type == "int16")
+            and (split_val == 8)
+            and (IC % split_val == 0)
+            and (channel_multiplier == 1)
+            and is_aarch64_arm()
+        )
+
+        data_pad_value = -1
         if conv_data.name == "data_pad":
             assert isinstance(conv_data.op, tvm.te.ComputeOp)
-            # Define a policy for padding computation
-            cfg.define_knob("data_pad_inline", [1, 2, 3])
+            # Define a strategy for padding computation
+            cfg.define_knob("data_pad_strategy", [1, 2, 3])
             if cfg.is_fallback:
-                cfg["data_pad_inline"] = OtherOptionEntity(3)
-            if cfg["data_pad_inline"].val == 1:
+                # We cannot inline padding when tensorizing.
+                # So, if we can tensorize, let's compute_at the closest axis
+                cfg["data_pad_strategy"] = (
+                    OtherOptionEntity(2) if use_tensorization else OtherOptionEntity(3)
+                )
+            # Compute padding on the third-to-last axis of the computation
+            if cfg["data_pad_strategy"].val == 1:
                 s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                 s[conv_data].compute_at(s[conv], ho)
-            if cfg["data_pad_inline"].val == 2:
+            # Compute padding on the second-to-last axis of the computation
+            if cfg["data_pad_strategy"].val == 2:
                 s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                 s[conv_data].compute_at(s[conv], wo)
-            if cfg["data_pad_inline"].val == 3:
+            # Inline padding during computation
+            if cfg["data_pad_strategy"].val == 3:
                 s[conv_data].compute_inline()
+            data_pad_value = cfg["data_pad_strategy"].val
+
+        if use_tensorization and data_pad_value != 3:
+            smlal = smlal_int16_int32()
+            s[conv].tensorize(ci, smlal)
+        else:
+            s[conv].vectorize(ci)
+
+        if cfg["unroll_tile"].val:
+            s[conv].unroll(r_h)
+            s[conv].unroll(r_w)
+            s[conv].unroll(wi)
+            s[conv].unroll(hi)
 
         s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci)
         fused_n_ho = s[conv].fuse(n, ho)
-        s[conv].vectorize(ci)
         return fused_n_ho
 
     def schedule_conv_out(out):
@@ -335,13 +378,17 @@ def schedule_conv_out(out):
         co, ci = cfg["tile_c"].apply(s, out, c)
         wo, wi = cfg["tile_w"].apply(s, out, w)
         ho, hi = cfg["tile_h"].apply(s, out, h)
-        s[out].reorder(n, ho, wo, co, hi, wi)
+        s[out].reorder(n, ho, wo, co, hi, wi, ci)
+        if cfg["unroll_tile"].val:
+            s[out].unroll(wi)
+            s[out].unroll(hi)
 
         if out.dtype in ["int8", "uint8"]:
             # In case of quantized convolution further split the channel in batches of 4 elements
             # so that we can use arm intrinsics to run fixed_point_multiplication
             ci_outer, ci_inner = s[out].split(ci, 4)
             s[out].vectorize(ci_inner)
+            s[out].unroll(ci_outer)
 
         fused_n_ho = s[out].fuse(n, ho)
         return hi, wi, fused_n_ho
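Note on the schedule change above: when the inner channel tile is tensorized, each inner iteration multiply-accumulates two int16x8 vectors into an int32x8 accumulator using an smlal/smlal2 pair (the intrinsic itself is added in the next file). A minimal NumPy sketch of that accumulation pattern, for reference only; the helper name and arrays below are illustrative and not part of the patch or of TVM:

    import numpy as np

    def smlal_smlal2_model(vec_c, vec_a, vec_b):
        # vec_a, vec_b: np.int16 arrays with 8 lanes; vec_c: np.int32 accumulator with 8 lanes
        a32 = vec_a.astype(np.int32)
        b32 = vec_b.astype(np.int32)
        vec_c[0:4] += a32[0:4] * b32[0:4]  # lower halves -> smlal
        vec_c[4:8] += a32[4:8] * b32[4:8]  # upper halves -> smlal2
        return vec_c
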
diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py
index 196f788e6b8c..1b999dfe4e80 100644
--- a/python/tvm/topi/arm_cpu/tensor_intrin.py
+++ b/python/tvm/topi/arm_cpu/tensor_intrin.py
@@ -879,6 +879,96 @@ def _instr(index):
     )
 
 
+def smlal_int16_int32():
+    """
+    Intrinsic to load two int16x8 vectors and multiply-accumulate them into
+    an int32x8 accumulator through a pair of smlal/smlal2 instructions. The
+    pseudo-code for the algorithm is as follows:
+
+    vec_a = vload(A, "int16x8")
+    vec_b = vload(B, "int16x8")
+
+    vec_c[0:4] += vec_a[0:4]*vec_b[0:4] // -> smlal instruction
+    vec_c[4:8] += vec_a[4:8]*vec_b[4:8] // -> smlal2 instruction
+
+    So we load two int16x8 vectors and accumulate their lower (0:4) and
+    higher (4:8) halves into the int32 accumulator separately.
+    """
+    int16_lanes = 8
+    A = te.placeholder((int16_lanes,), dtype="int16", name="A")
+    B = te.placeholder((int16_lanes, 1), dtype="int16", name="B")
+    C = te.compute(
+        (int16_lanes,),
+        lambda i: A[i].astype("int32") * B[i, 0].astype("int32"),
+        name="C",
+    )
+
+    a_buffer = tvm.tir.decl_buffer(
+        A.shape, dtype="int16", name="a_buffer", offset_factor=1, strides=[1]
+    )
+    b_buffer = tvm.tir.decl_buffer(
+        B.shape,
+        dtype="int16",
+        name="b_buffer",
+        offset_factor=1,
+        strides=[te.var("sb"), 1],
+    )
+    c_buffer = tvm.tir.decl_buffer(
+        C.shape,
+        dtype="int32",
+        name="c_buffer",
+        offset_factor=1,
+        strides=[1],
+    )
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            ib = tvm.tir.ir_builder.create()
+            if index == 1:
+                ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x8")))
+                return ib.get()
+
+            vec_a = ins[0].vload([0], "int16x8")
+            vec_b = ins[1].vload([0, 0], "int16x8")
+            inst = "llvm.aarch64.neon.smull"
+
+            # Higher part of the vector
+            vec_c_h = outs[0].vload([4], "int32x4")
+            vec_a_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_a)
+            vec_b_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_b)
+            vmull_h = tvm.tir.call_llvm_pure_intrin(
+                "int32x4", inst, tvm.tir.const(2, "uint32"), vec_a_h, vec_b_h
+            )
+            vec_out_h = vec_c_h + vmull_h
+
+            # Lower part of the vector
+            vec_c_l = outs[0].vload([0], "int32x4")
+            vec_a_l = tvm.tir.call_intrin("int16x4", "tir.vectorlow", vec_a)
+            vec_b_l = tvm.tir.call_intrin("int16x4", "tir.vectorlow", vec_b)
+            vmull_l = tvm.tir.call_llvm_pure_intrin(
+                "int32x4", inst, tvm.tir.const(2, "uint32"), vec_a_l, vec_b_l
+            )
+            vec_out_l = vec_c_l + vmull_l
+
+            # Combine higher and lower part in a single int32x8 vector to store
+            # (this will require two different store instructions, since the
+            # length of a NEON vector is fixed at 128 bits)
+            vec_out = tvm.tir.call_intrin("int32x8", "tir.vectorcombine", vec_out_l, vec_out_h)
+            ib.emit(outs[0].vstore(0, vec_out))
+            return ib.get()
+
+        # body, reset, update
+        return _instr(0), _instr(1), _instr(2)
+
+    buffer_params = {"offset_factor": 1}
+    return te.decl_tensor_intrin(
+        C.op,
+        _intrin_func,
+        binds={A: a_buffer, B: b_buffer, C: c_buffer},
+        default_buffer_params=buffer_params,
+    )
+
+
 def _q_multiply_shift_arm(op):
     """
     Implementation of q_multiply_shift_arm through arm intrinsics
diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py
index d5fbef98593c..55d2fe0c4e52 100644
--- a/tests/python/topi/python/test_topi_depthwise_conv2d.py
+++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py
@@ -56,6 +56,55 @@
 }
 
 
+def compile_depthwise_NHWC_int8_arm(
+    batch,
+    in_channel,
+    in_size,
+    kernel,
+    depth_multiplier,
+    stride,
+    padding,
+    add_bias=False,
+    dilation=1,
+):
+    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
+    padding_sum = pad_top + pad_left + pad_bottom + pad_right
+
+    in_height = in_width = in_size
+    A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int16")
+    W = te.placeholder((kernel, kernel, in_channel, depth_multiplier), name="W", dtype="int16")
+    bias = te.placeholder((in_channel * depth_multiplier,), name="bias", dtype="int32")
+    dtype = "int32"
+
+    device = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"
+    compute = topi.arm_cpu.compute_depthwise_conv2d_nhwc
+    schedule = topi.arm_cpu.schedule_depthwise_conv2d_nhwc
+
+    if not tvm.testing.device_enabled(device):
+        print("Skip because %s is not enabled" % device)
+        return
+
+    print("Compiling on arm AArch64 target: %s" % device)
+    with tvm.target.Target(device):
+        assert topi.arm_cpu.arm_utils.is_aarch64_arm(), "AArch64 target not recognized"
+
+        C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype)
+        if add_bias:
+            C += bias
+            ins_outs = [A, W, bias, C]
+        else:
+            ins_outs = [A, W, C]
+
+        s = schedule([C])
+
+        func = tvm.build(
+            s,
+            ins_outs,
+            device,
+            name="depthwise_conv2d",
+        )
+
+
 def depthwise_conv2d_with_workload_nchw(
     batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1
 ):
@@ -478,6 +527,7 @@ def test_depthwise_conv2d():
     depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "SAME")
     depthwise_conv2d_with_workload_nhwc(1, 728, 32, 1, 3, 1, "VALID")
     depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "VALID")
+
     # dilation = 2
     # disabled because it uses too large shared memory on cuda
     # depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2)
@@ -487,6 +537,10 @@ def test_depthwise_conv2d():
     depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME")
     depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID")
 
+    # Test compilation on arm devices
+    compile_depthwise_NHWC_int8_arm(1, 728, 32, 1, 3, 1, "SAME")
+    compile_depthwise_NHWC_int8_arm(1, 728, 32, 1, 1, 1, "SAME", True)
+
 
 if __name__ == "__main__":
     test_depthwise_conv2d()
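
As a usage reference, the new compute/schedule pair can be exercised outside the test suite roughly as follows. This is a minimal sketch based on compile_depthwise_NHWC_int8_arm above; it assumes a TVM build with LLVM, reuses the same AArch64 target string as the test, and the workload shape is illustrative only:

    import tvm
    from tvm import te, topi

    target = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"

    # int16 NHWC input and a (KH, KW, IC, channel_multiplier) kernel with
    # channel_multiplier == 1; with IC divisible by 8, the fallback tile_c of 8
    # should let the schedule pick the smlal/smlal2 intrinsic.
    A = te.placeholder((1, 32, 32, 728), name="A", dtype="int16")
    W = te.placeholder((3, 3, 728, 1), name="W", dtype="int16")

    with tvm.target.Target(target):
        C = topi.arm_cpu.compute_depthwise_conv2d_nhwc(A, W, (1, 1), "SAME", (1, 1), "int32")
        s = topi.arm_cpu.schedule_depthwise_conv2d_nhwc([C])
        func = tvm.build(s, [A, W, C], target, name="depthwise_conv2d")
        # func.get_source("asm") can then be inspected for smlal/smlal2 instructions.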