From 97ea82826d4100a562d8edb30cb9ac3c4b468d0f Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini
Date: Tue, 3 Nov 2020 05:56:33 +0000
Subject: [PATCH] Improve AArch64 depthwise convolution through smlal/smlal2
 intrinsic (#6711)

* Improve depthwise convolution through smlal/smlal2 intrinsic

- Added an intrinsic to load a single int16x8 vector and produce two
  int32x4 output vectors through smlal/smlal2 instructions
- Changed the NHWC depthwise schedule to accommodate the aforementioned
  intrinsic

Change-Id: I347c3bf98fa8dd87057304dcda0d78e558424c57

* Address review comments

* Rebasing - 2

* Rebasing - 3

* Rebasing - 3

* Fix linting
---
 python/tvm/topi/arm_cpu/depthwise_conv2d.py    | 65 ++++++++++++--
 python/tvm/topi/arm_cpu/tensor_intrin.py       | 90 +++++++++++++++++++
 .../topi/python/test_topi_depthwise_conv2d.py  | 54 +++++++++++
 3 files changed, 200 insertions(+), 9 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
index 3c32d3e1f3f2..441b0a5a3688 100644
--- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py
+++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py
@@ -25,6 +25,8 @@
 from .. import nn
 from ..utils import traverse_inline, get_const_tuple, get_const_int
 from ..nn.utils import get_pad_tuple
+from .tensor_intrin import smlal_int16_int32
+from .arm_utils import is_aarch64_arm
 
 
 @autotvm.register_topi_compute("depthwise_conv2d_nchw.arm_cpu")
@@ -222,7 +224,6 @@ def compute_depthwise_conv2d_nhwc(_, data, kernel, strides, padding, dilation, o
     output : tvm.te.Tensor
         4-D with shape [batch, out_height, out_width, out_channel]
     """
-
     out_dtype = out_dtype or data.dtype
 
     N, IH, IW, IC = get_const_tuple(data.shape)
@@ -288,10 +289,18 @@ def schedule_depthwise_conv2d_nhwc(cfg, outs):
 
     ##### space definition begin #####
     n, h, w, c = s[out].op.axis
+    # Split the number of input/output channels
    cfg.define_split("tile_c", c, num_outputs=2)
+    # Split the height of the convolution
     _, hi = cfg.define_split("tile_h", h, num_outputs=2)
+    # Split the width of the convolution
     _, wi = cfg.define_split("tile_w", w, num_outputs=2)
+    # Additional output (e.g., requantization, bias addition, etc.)
+    # 0: locate the output on the second-to-last axis of the main computation
+    # 1: locate the output closest to the main computation
     cfg.define_knob("locate_output", [0, 1])
+    # Determine if we should unroll the computation of the inner tile
+    cfg.define_knob("unroll_tile", [True, False])
 
     # fallback support
     if cfg.is_fallback:
@@ -299,10 +308,15 @@ def schedule_depthwise_conv2d_nhwc(cfg, outs):
         cfg["tile_h"] = SplitEntity([-1, 2])
         cfg["tile_w"] = SplitEntity([-1, 2])
         cfg["locate_output"] = OtherOptionEntity(1)
+        cfg["unroll_tile"] = OtherOptionEntity(True)
     ##### space definition end #####
 
     def schedule_conv(conv):
         conv_data = conv.op.input_tensors[0]
+        kernel_data = conv.op.input_tensors[1]
+        in_type = conv_data.dtype
+
+        _, _, IC, channel_multiplier = get_const_tuple(kernel_data.shape)
 
         n, w, h, c = conv.op.axis
         r_h, r_w = conv.op.reduce_axis
@@ -310,24 +324,53 @@ def schedule_conv(conv):
         wo, wi = cfg["tile_w"].apply(s, conv, w)
         co, ci = cfg["tile_c"].apply(s, conv, c)
 
+        split_val = cfg["tile_c"].size[-1]
+        use_tensorization = (
+            (in_type == "int16")
+            and (split_val == 8)
+            and (IC % split_val == 0)
+            and (channel_multiplier == 1)
+            and is_aarch64_arm()
+        )
+
+        data_pad_value = -1
         if conv_data.name == "data_pad":
             assert isinstance(conv_data.op, tvm.te.ComputeOp)
-            # Define a policy for padding computation
-            cfg.define_knob("data_pad_inline", [1, 2, 3])
+            # Define a strategy for padding computation
+            cfg.define_knob("data_pad_strategy", [1, 2, 3])
             if cfg.is_fallback:
-                cfg["data_pad_inline"] = OtherOptionEntity(3)
-            if cfg["data_pad_inline"].val == 1:
+                # We cannot inline padding when tensorizing.
+                # So, if we can tensorize, let's compute_at the closest axis
+                cfg["data_pad_strategy"] = (
+                    OtherOptionEntity(2) if use_tensorization else OtherOptionEntity(3)
+                )
+            # Compute padding on the third-to-last axis of the computation
+            if cfg["data_pad_strategy"].val == 1:
                 s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                 s[conv_data].compute_at(s[conv], ho)
-            if cfg["data_pad_inline"].val == 2:
+            # Compute padding on the second-to-last axis of the computation
+            if cfg["data_pad_strategy"].val == 2:
                 s[conv_data].vectorize(list(s[conv_data].op.axis)[-1])
                 s[conv_data].compute_at(s[conv], wo)
-            if cfg["data_pad_inline"].val == 3:
+            # Inline padding during computation
+            if cfg["data_pad_strategy"].val == 3:
                 s[conv_data].compute_inline()
+            data_pad_value = cfg["data_pad_strategy"].val
+
+        if use_tensorization and data_pad_value != 3:
+            smlal = smlal_int16_int32()
+            s[conv].tensorize(ci, smlal)
+        else:
+            s[conv].vectorize(ci)
+
+        if cfg["unroll_tile"].val:
+            s[conv].unroll(r_h)
+            s[conv].unroll(r_w)
+            s[conv].unroll(wi)
+            s[conv].unroll(hi)
 
         s[conv].reorder(n, ho, wo, co, hi, wi, r_h, r_w, ci)
         fused_n_ho = s[conv].fuse(n, ho)
-        s[conv].vectorize(ci)
         return fused_n_ho
 
     def schedule_conv_out(out):
@@ -335,13 +378,17 @@ def schedule_conv_out(out):
         co, ci = cfg["tile_c"].apply(s, out, c)
         wo, wi = cfg["tile_w"].apply(s, out, w)
         ho, hi = cfg["tile_h"].apply(s, out, h)
-        s[out].reorder(n, ho, wo, co, hi, wi)
+        s[out].reorder(n, ho, wo, co, hi, wi, ci)
+        if cfg["unroll_tile"].val:
+            s[out].unroll(wi)
+            s[out].unroll(hi)
 
         if out.dtype in ["int8", "uint8"]:
             # In case of quantized convolution further split the channel in batches of 4 elements
             # so that we can use arm intrinsics to run fixed_point_multiplication
             ci_outer, ci_inner = s[out].split(ci, 4)
             s[out].vectorize(ci_inner)
+            s[out].unroll(ci_outer)
 
         fused_n_ho = s[out].fuse(n, ho)
         return hi, wi, fused_n_ho
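Note on the schedule change above: when the inner channel tile is tensorized, each inner iteration multiply-accumulates two int16x8 vectors into an int32x8 accumulator using an smlal/smlal2 pair (the intrinsic itself is added in the next file). A minimal NumPy sketch of that accumulation pattern, for reference only; the helper name and arrays below are illustrative and not part of the patch or of TVM:

    import numpy as np

    def smlal_smlal2_model(vec_c, vec_a, vec_b):
        # vec_a, vec_b: np.int16 arrays with 8 lanes; vec_c: np.int32 accumulator with 8 lanes
        a32 = vec_a.astype(np.int32)
        b32 = vec_b.astype(np.int32)
        vec_c[0:4] += a32[0:4] * b32[0:4]  # lower halves -> smlal
        vec_c[4:8] += a32[4:8] * b32[4:8]  # upper halves -> smlal2
        return vec_c
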
diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py
index 196f788e6b8c..1b999dfe4e80 100644
--- a/python/tvm/topi/arm_cpu/tensor_intrin.py
+++ b/python/tvm/topi/arm_cpu/tensor_intrin.py
@@ -879,6 +879,96 @@ def _instr(index):
     )
 
 
+def smlal_int16_int32():
+    """
+    Intrinsic to load two int16x8 vectors and multiply-accumulate them into
+    an int32x8 accumulator through a pair of smlal/smlal2 instructions. The
+    pseudo-code for the algorithm is as follows:
+
+    vec_a = vload(A, "int16x8")
+    vec_b = vload(B, "int16x8")
+
+    vec_c[0:4] += vec_a[0:4]*vec_b[0:4] // -> smlal instruction
+    vec_c[4:8] += vec_a[4:8]*vec_b[4:8] // -> smlal2 instruction
+
+    So we load two int16x8 vectors and accumulate their lower (0:4) and
+    higher (4:8) halves into the int32 accumulator separately.
+    """
+    int16_lanes = 8
+    A = te.placeholder((int16_lanes,), dtype="int16", name="A")
+    B = te.placeholder((int16_lanes, 1), dtype="int16", name="B")
+    C = te.compute(
+        (int16_lanes,),
+        lambda i: A[i].astype("int32") * B[i, 0].astype("int32"),
+        name="C",
+    )
+
+    a_buffer = tvm.tir.decl_buffer(
+        A.shape, dtype="int16", name="a_buffer", offset_factor=1, strides=[1]
+    )
+    b_buffer = tvm.tir.decl_buffer(
+        B.shape,
+        dtype="int16",
+        name="b_buffer",
+        offset_factor=1,
+        strides=[te.var("sb"), 1],
+    )
+    c_buffer = tvm.tir.decl_buffer(
+        C.shape,
+        dtype="int32",
+        name="c_buffer",
+        offset_factor=1,
+        strides=[1],
+    )
+
+    def _intrin_func(ins, outs):
+        def _instr(index):
+            ib = tvm.tir.ir_builder.create()
+            if index == 1:
+                ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x8")))
+                return ib.get()
+
+            vec_a = ins[0].vload([0], "int16x8")
+            vec_b = ins[1].vload([0, 0], "int16x8")
+            inst = "llvm.aarch64.neon.smull"
+
+            # Higher part of the vector
+            vec_c_h = outs[0].vload([4], "int32x4")
+            vec_a_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_a)
+            vec_b_h = tvm.tir.call_intrin("int16x4", "tir.vectorhigh", vec_b)
+            vmull_h = tvm.tir.call_llvm_pure_intrin(
+                "int32x4", inst, tvm.tir.const(2, "uint32"), vec_a_h, vec_b_h
+            )
+            vec_out_h = vec_c_h + vmull_h
+
+            # Lower part of the vector
+            vec_c_l = outs[0].vload([0], "int32x4")
+            vec_a_l = tvm.tir.call_intrin("int16x4", "tir.vectorlow", vec_a)
+            vec_b_l = tvm.tir.call_intrin("int16x4", "tir.vectorlow", vec_b)
+            vmull_l = tvm.tir.call_llvm_pure_intrin(
+                "int32x4", inst, tvm.tir.const(2, "uint32"), vec_a_l, vec_b_l
+            )
+            vec_out_l = vec_c_l + vmull_l
+
+            # Combine higher and lower part in a single int32x8 vector to store
+            # (this will require two different store instructions, since the
+            # length of a NEON vector is fixed at 128 bits)
+            vec_out = tvm.tir.call_intrin("int32x8", "tir.vectorcombine", vec_out_l, vec_out_h)
+            ib.emit(outs[0].vstore(0, vec_out))
+            return ib.get()
+
+        # body, reset, update
+        return _instr(0), _instr(1), _instr(2)
+
+    buffer_params = {"offset_factor": 1}
+    return te.decl_tensor_intrin(
+        C.op,
+        _intrin_func,
+        binds={A: a_buffer, B: b_buffer, C: c_buffer},
+        default_buffer_params=buffer_params,
+    )
+
+
 def _q_multiply_shift_arm(op):
     """
     Implementation of q_multiply_shift_arm through arm intrinsics
diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py
index d5fbef98593c..55d2fe0c4e52 100644
--- a/tests/python/topi/python/test_topi_depthwise_conv2d.py
+++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py
@@ -56,6 +56,55 @@
 }
 
 
+def compile_depthwise_NHWC_int8_arm(
+    batch,
+    in_channel,
+    in_size,
+    kernel,
+    depth_multiplier,
+    stride,
+    padding,
+    add_bias=False,
+    dilation=1,
+):
+    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel))
+    padding_sum = pad_top + pad_left + pad_bottom + pad_right
+
+    in_height = in_width = in_size
+    A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int16")
+    W = te.placeholder((kernel, kernel, in_channel, depth_multiplier), name="W", dtype="int16")
+    bias = te.placeholder((in_channel * depth_multiplier,), name="bias", dtype="int32")
+    dtype = "int32"
+
+    device = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"
+    compute = topi.arm_cpu.compute_depthwise_conv2d_nhwc
+    schedule = topi.arm_cpu.schedule_depthwise_conv2d_nhwc
+
+    if not tvm.testing.device_enabled(device):
+        print("Skip because %s is not enabled" % device)
+        return
+
+    print("Compiling on arm AArch64 target: %s" % device)
+    with tvm.target.Target(device):
+        assert topi.arm_cpu.arm_utils.is_aarch64_arm(), "AArch64 target not recognized"
+
+        C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype)
+        if add_bias:
+            C += bias
+            ins_outs = [A, W, bias, C]
+        else:
+            ins_outs = [A, W, C]
+
+        s = schedule([C])
+
+        func = tvm.build(
+            s,
+            ins_outs,
+            device,
+            name="depthwise_conv2d",
+        )
+
+
 def depthwise_conv2d_with_workload_nchw(
     batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1
 ):
@@ -478,6 +527,7 @@ def test_depthwise_conv2d():
     depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "SAME")
     depthwise_conv2d_with_workload_nhwc(1, 728, 32, 1, 3, 1, "VALID")
     depthwise_conv2d_with_workload_nhwc(4, 256, 64, 2, 5, 2, "VALID")
+
     # dilation = 2
     # disabled because it uses too large shared memory on cuda
     # depthwise_conv2d_with_workload_nhwc(1, 728, 64, 1, 3, 1, "SAME", dilation=2)
@@ -487,6 +537,10 @@ def test_depthwise_conv2d():
     depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "SAME")
     depthwise_conv2d_with_workload_NCHWc(1, 728, 32, 1, 3, 1, "VALID")
 
+    # Test compilation on arm devices
+    compile_depthwise_NHWC_int8_arm(1, 728, 32, 1, 3, 1, "SAME")
+    compile_depthwise_NHWC_int8_arm(1, 728, 32, 1, 1, 1, "SAME", True)
+
 
 if __name__ == "__main__":
     test_depthwise_conv2d()
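
As a usage reference, the new compute/schedule pair can be exercised outside the test suite roughly as follows. This is a minimal sketch based on compile_depthwise_NHWC_int8_arm above; it assumes a TVM build with LLVM, reuses the same AArch64 target string as the test, and the workload shape is illustrative only:

    import tvm
    from tvm import te, topi

    target = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu"

    # int16 NHWC input and a (KH, KW, IC, channel_multiplier) kernel with
    # channel_multiplier == 1; with IC divisible by 8, the fallback tile_c of 8
    # should let the schedule pick the smlal/smlal2 intrinsic.
    A = te.placeholder((1, 32, 32, 728), name="A", dtype="int16")
    W = te.placeholder((3, 3, 728, 1), name="W", dtype="int16")

    with tvm.target.Target(target):
        C = topi.arm_cpu.compute_depthwise_conv2d_nhwc(A, W, (1, 1), "SAME", (1, 1), "int32")
        s = topi.arm_cpu.schedule_depthwise_conv2d_nhwc([C])
        func = tvm.build(s, [A, W, C], target, name="depthwise_conv2d")
        # func.get_source("asm") can then be inspected for smlal/smlal2 instructions.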