Skip to content

Commit

Permalink
AArch64 base algorithm refactoring in LLVM (apache#6907)
Browse files Browse the repository at this point in the history
* AArch64 base algorithm refactoring in LLVM

- I refactored the assembly in arm_cpu/tensor_intrin.py to use LLVM+TIR
- Removed the `interleave` boolean parameter in the intrinsic to switch
between two different interleaving modes. LLVM will now take care of
interleaving the instructions
- Applied the changes accordingly to conv2d_gemm.py to call the right
intrinsic

Note: I found LLVM very sensitive to the choice of the `-mcpu`.
So, in order to preserve performance, it is important to specify the
right `-mcpu` when creating the LLVM target

* Fix linting

* Fix linting -2

* Fixing comments

* Address review comments

* Fix spaces around ':' in docstrings
  • Loading branch information
Giuseppe Rossini authored and Trevor Morris committed Dec 4, 2020
1 parent b302b76 commit c9a3efb
Show file tree
Hide file tree
Showing 2 changed files with 348 additions and 413 deletions.
15 changes: 3 additions & 12 deletions python/tvm/topi/arm_cpu/conv2d_gemm.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@
from ..utils import get_const_tuple, get_const_int
from ..nn.utils import get_pad_tuple
from .tensor_intrin import (
gemm_quantized,
gemm_quantized_impl,
gemm_4x4_int8_int8_int32,
gemm_acc_4x4_int8_int8_int32,
gemm_acc_nx16_int8_int8_int32,
gemm_acc_2x2_int8_int8_int32,
Expand All @@ -51,11 +50,8 @@ def configure_knobs(cfg, M, K):

if not is_dotprod_available():
cfg.define_knob("gemm_quantized_unroll", [True, False])
cfg.define_knob("gemm_quantized_interleave", [True, False])

if cfg.is_fallback:
cfg["gemm_quantized_unroll"] = OtherOptionEntity(False)
cfg["gemm_quantized_interleave"] = OtherOptionEntity(True)


# Compute function
Expand Down Expand Up @@ -361,14 +357,9 @@ def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out):
elif is_aarch64_arm():
s[C_interleaved].reorder(yi, xi)
K = A_interleaved_input.shape[2]
assert in_type in ["int8", "uint8"], "Only int8 and uint8 gemm are supported"
unroll = cfg["gemm_quantized_unroll"].val
interleave = cfg["gemm_quantized_interleave"].val
gemm = gemm_quantized(M, N, K, unroll, interleave, in_type, out_type)
s[C_interleaved].pragma(
b_outer_gemm_fused,
"import_llvm",
gemm_quantized_impl(M, N, K, unroll, interleave, in_type),
)
gemm = gemm_4x4_int8_int8_int32(M, N, K, unroll, in_type)
s[C_interleaved].tensorize(yi, gemm)

# Output transform
Expand Down
Loading

0 comments on commit c9a3efb

Please sign in to comment.