[microNPU] Fix layout transform matrix
One of the layout transforms currently causes the cascader to stripe
across the B16 axis (which is not allowed), so change that and deal with
the implications for get_valid_block_configs.

Change-Id: I04199f9f35fcc31618581567483cfb80d3b5aad2
ekalda committed Mar 25, 2022
1 parent 14084f4 commit aab1d7c
Showing 10 changed files with 88 additions and 61 deletions.
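
To see what the fix changes, here is an illustrative sketch (not code from the commit): the cascader propagates an output stripe, written in affine form, through each transform matrix to find the input region that stripe needs. The old NHCWB16-to-NHWC row [0, 0, 16, 0, 1, -16] derived the channel extent from the stripe's C and B16 extents, so a stripe that was partial in those axes propagated as a partial-depth read; the new row [0, 0, 0, 0, 0, ofm_channels] pins the channel extent to the full depth, so no stripe can split the packed B16 bricks. Assuming stripes multiply as [N, H, C, W, B, 1] vectors (ofm_channels = 32 is made up):

    # Illustrative only -- not code from the commit.
    import numpy as np

    ofm_channels = 32

    def nhcwb16_to_nhwc(channel_row):
        # Maps an NHCWB16 stripe [N, H, C, W, B, 1] (affine form) to NHWC.
        return np.array([
            [1, 0, 0, 0, 0, 0],  # N
            [0, 1, 0, 0, 0, 0],  # H
            [0, 0, 0, 1, 0, 0],  # W
            channel_row,         # NHWC channel extent
            [0, 0, 0, 0, 0, 1],  # affine constant
        ])

    old_row = [0, 0, 16, 0, 1, -16]          # channels = 16*C + B - 16
    new_row = [0, 0, 0, 0, 0, ofm_channels]  # channels pinned to the full depth

    # A stripe covering one of the two 16-wide channel bricks of a 32-channel OFM.
    stripe = np.array([1, 4, 1, 8, 16, 1])

    print(nhcwb16_to_nhwc(old_row) @ stripe)  # [1 4 8 16 1] -> a 16-channel slice
    print(nhcwb16_to_nhwc(new_row) @ stripe)  # [1 4 8 32 1] -> always the full depth
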
91 changes: 50 additions & 41 deletions python/tvm/contrib/ethosu/cascader/device_config.py
@@ -439,6 +439,23 @@ def is_partkernel(

        return part_kernel_first_utilization > depth_first_utilization or ifm_channels <= 8

+    def _get_input_banks(self, input_block_shape, input_bytewidth):
+        input_bytes = input_block_shape.area() * self._align(
+            input_block_shape.depth * input_bytewidth, 8
+        )
+        input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
+        input_banks = _round_up(input_banks, self._input_granularity)
+
+        return input_banks
+
+    def _get_accumulator_banks(self, output_block_shape, acc_bytewidth, depth):
+        acc_depth = _round_up(min(output_block_shape.depth, depth), 8)
+        acc_bytes = output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
+        acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
+        acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
+
+        return acc_banks
+
    def get_elementwise_block_config(
        self,
        ifm_propagator: Propagator,
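
For intuition, a worked example of the arithmetic the new _get_input_banks helper performs. This is a sketch: _align is treated as round-up-to-a-multiple, the ceiling-division definitions of _round_up_div/_round_up are assumed to match device_config.py, the bank size and granularity values are illustrative rather than hardware constants, and the factor of two presumably accounts for double buffering:

    def _round_up_div(a, b):
        # Ceiling division: whole banks needed to hold `a` bytes.
        return -(-a // b)

    def _round_up(a, b):
        # Round `a` up to the nearest multiple of `b`.
        return _round_up_div(a, b) * b

    # A 16x16x32 int8 input block with 1024-byte banks and an input
    # granularity of 4 banks -- illustrative numbers only.
    area, depth, bytewidth = 16 * 16, 32, 1
    bank_size_bytes, input_granularity = 1024, 4

    input_bytes = area * _round_up(depth * bytewidth, 8)           # 256 * 32 = 8192
    input_banks = _round_up_div(input_bytes, bank_size_bytes) * 2  # 8 banks, doubled to 16
    input_banks = _round_up(input_banks, input_granularity)        # already a multiple of 4

    print(input_banks)  # 16
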
@@ -533,16 +550,9 @@ def get_elementwise_block_config(
            input2_block.round_up(self._input_micro_block)

            # Banks required for input block
-            input_bytes = input_block.area() * self._align(input_block.depth * input_bytewidth, 8)
-            input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
-            input_banks = _round_up(input_banks, self._input_granularity)
-
+            input_banks = self._get_input_banks(input_block, input_bytewidth)
            # Banks required for input2 block
-            input2_bytes = input2_block.area() * self._align(
-                input2_block.depth * input_bytewidth, 8
-            )
-            input2_banks = _round_up_div(input2_bytes, self._bank_size_bytes) * 2
-            input2_banks = _round_up(input2_banks, self._input_granularity)
+            input2_banks = self._get_input_banks(input2_block, input_bytewidth)

            # Check whether or not both IFMs fit into SHRAM
            if (input_banks + input2_banks) <= banks_available:
@@ -561,6 +571,29 @@ def get_elementwise_block_config(

        return block_config

+    def _get_subkernel_propagator(
+        self, op_attrs, ifm_propagator, input_layout, output_layout, depth
+    ):
+        op_type = op_attrs.get("op")
+        stride_h = int(op_attrs.get("stride_h", 1))
+        stride_w = int(op_attrs.get("stride_w", 1))
+        transform = ifm_propagator.transform
+
+        if input_layout == "NHCWB16":
+            transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
+            transform[3][-1] = min(transform[3][-1], self._subkernel_limits[1] - stride_w)
+        else:
+            transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
+            transform[2][-1] = min(transform[2][-1], self._subkernel_limits[1] - stride_w)
+
+        if op_type in ("ethosu_pooling", "ethosu_depthwise_conv2d"):
+            if output_layout == "NHCWB16" and input_layout == "NHWC":
+                transform[3][-1] = depth
+            elif output_layout == "NHCWB16" and input_layout == "NHCWB16":
+                transform[2][-1] = depth // 16
+
+        return Propagator(transform, ifm_propagator.offset)
+
    def get_valid_block_configs(
        self,
        ifm_propagator: Propagator,
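
A small sketch of the clamping that _get_subkernel_propagator applies, with assumed values: the transform's last-column entry in a spatial row is taken to be roughly the extra input extent a block reads, on the order of kernel size minus stride, and the clamp caps the read window at one subkernel:

    # Assumed value: a subkernel height limit of 8 (self._subkernel_limits[0]).
    subkernel_limit_h = 8

    for kernel_h, stride_h in [(3, 1), (9, 1), (17, 2)]:
        const_term = kernel_h - stride_h  # stands in for transform[1][-1]
        clamped = min(const_term, subkernel_limit_h - stride_h)
        print(f"kernel_h={kernel_h}, stride_h={stride_h}: {const_term} -> {clamped}")
    # kernel_h=3, stride_h=1: 2 -> 2    (small kernels are unaffected)
    # kernel_h=9, stride_h=1: 8 -> 7    (reads capped at one subkernel)
    # kernel_h=17, stride_h=2: 15 -> 6
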
@@ -612,33 +645,13 @@ def get_valid_block_configs(
        op_type = op_attrs.get("op")
        op_str = op_attrs.get("op_str")
        activation = op_attrs.get("activation", "NONE")
-        stride_h = int(op_attrs.get("stride_h", 1))
-        stride_w = int(op_attrs.get("stride_w", 1))
        upscaling_factor = 1 if op_attrs.get("upscale", "NONE") == "NONE" else 2

-        subkernel_transform = ifm_propagator.transform
        if output_layout == "NHCWB16":
            output_shape = _Shape([1, ofm_shape[1], ofm_shape[3], ofm_channels])
        else:
            output_shape = _Shape(ofm_shape)

-        if input_layout == "NHCWB16":
-            subkernel_transform[1][-1] = min(
-                subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
-            )
-            subkernel_transform[3][-1] = min(
-                subkernel_transform[3][-1], self._subkernel_limits[1] - stride_w
-            )
-        else:
-            subkernel_transform[1][-1] = min(
-                subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
-            )
-            subkernel_transform[2][-1] = min(
-                subkernel_transform[2][-1], self._subkernel_limits[1] - stride_w
-            )
-
-        subkernel_propagator = Propagator(subkernel_transform, ifm_propagator.offset)
-
        # Define search space
        max_height = min(output_shape.height, self._max_block_shape.height)
        min_height = max(self._micro_block.height, upscaling_factor)
@@ -655,7 +668,7 @@ def get_valid_block_configs(
        if activation == "LUT" and not self._lut_reserved:
            banks_available -= 2

-        # Input block depth has additional limitations for Operators that require full input depth
+        # Input block depth has additional limitations for operators that require full input depth
        input_block_depth = 0
        is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
        if op_type == "ethosu_conv2d":
@@ -669,6 +682,10 @@ def get_valid_block_configs(
                # Block depth has to be less than full depth or a multiple of the split depth
                continue

+            subkernel_propagator = self._get_subkernel_propagator(
+                op_attrs, ifm_propagator, input_layout, output_layout, depth
+            )
+
            for width in range(min_width, max_width + min_width, min_width):
                for height in range(min_height, max_height + min_height, min_height):
                    if output_layout == "NHCWB16":
@@ -709,19 +726,11 @@ def get_valid_block_configs(
                    input_block_shape.depth = input_block_depth

                    # Banks required for input block
-                    input_bytes = input_block_shape.area() * self._align(
-                        input_block_shape.depth * input_bytewidth, 8
-                    )
-                    input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
-                    input_banks = _round_up(input_banks, self._input_granularity)
-
+                    input_banks = self._get_input_banks(input_block_shape, input_bytewidth)
                    # Banks required for accumulation
-                    acc_depth = _round_up(min(output_block_shape.depth, ofm_channels), 8)
-                    acc_bytes = (
-                        output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
-                    )
-                    acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
-                    acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
+                    acc_banks = self._get_accumulator_banks(
+                        output_block_shape, acc_bytewidth, depth
+                    )

                    if (input_banks + acc_banks) <= banks_available:
                        output_cycles = self._get_output_cycles(
2 changes: 1 addition & 1 deletion python/tvm/relay/backend/contrib/ethosu/te/convolution.py
@@ -187,7 +187,7 @@ def conv2d_compute(
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, ofm_channels],
        [0, 0, 0, 0, 0, 1],
    ]
    ifm_matrix = [
2 changes: 1 addition & 1 deletion python/tvm/relay/backend/contrib/ethosu/te/depthwise.py
@@ -181,7 +181,7 @@ def depthwise_conv2d_compute(
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, channels],
        [0, 0, 0, 0, 0, 1],
    ]
    ifm_matrix = [
2 changes: 1 addition & 1 deletion python/tvm/relay/backend/contrib/ethosu/te/pooling.py
@@ -169,7 +169,7 @@ def pooling_compute(
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, int(ofm_channels)],
        [0, 0, 0, 0, 0, 1],
    ]
    ifm_matrix = [
6 changes: 4 additions & 2 deletions src/contrib/ethosu/cascader/block_config.cc
@@ -37,15 +37,17 @@ void BlockConfigNode::VisitAttrs(AttrVisitor* v) {
  v->Visit("_input_shape", &tmp_arr);
  tmp_arr = make_array(output_shape_);
  v->Visit("_output_shape", &tmp_arr);
+  v->Visit("_compute_cycles", &compute_cycles_);
+  v->Visit("_output_cycles", &output_cycles_);
}

BlockConfig::BlockConfig(const std::vector<int>& input_shape, const std::vector<int>& output_shape,
                         int compute_cycles, int output_cycles) {
  auto n = make_object<BlockConfigNode>();
  n->input_shape_ = std::move(input_shape);
  n->output_shape_ = std::move(output_shape);
-  n->compute_cycles_ = compute_cycles;
-  n->output_cycles_ = output_cycles;
+  n->compute_cycles_ = std::move(compute_cycles);
+  n->output_cycles_ = std::move(output_cycles);
  data_ = std::move(n);
}

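With the two new VisitAttrs entries, compute_cycles_ and output_cycles_ become visible through TVM's object attribute machinery. A hedged sketch of reading them from Python, assuming the visited attrs surface under the underscore names registered above, as with the cascader's other wrappers:

    # Hypothetical helper, not in the commit: summarise a BlockConfig's cost.
    def describe_block_config(block_config) -> str:
        return (
            f"compute_cycles={int(block_config._compute_cycles)}, "
            f"output_cycles={int(block_config._output_cycles)}"
        )
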
12 changes: 10 additions & 2 deletions tests/python/contrib/test_ethosu/cascader/infra.py
@@ -64,7 +64,15 @@ def create_te_graph(func):
    return te_graph, consts

def make_matrices(
-    op_type, kernel, stride, padding, ifm_layout, ofm_layout, dilation=(1, 1), ifm_channels=1
+    op_type,
+    kernel,
+    stride,
+    padding,
+    ifm_layout,
+    ofm_layout,
+    dilation=(1, 1),
+    ifm_channels=1,
+    ofm_channels=1,
):
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
@@ -83,7 +91,7 @@ def make_matrices(
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, ofm_channels],
        [0, 0, 0, 0, 0, 1],
    ]
    if op_type == "ethosu_conv2d":
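A hedged usage sketch of the extended test helper (the argument values, including the padding shape, are made up; the six return values follow the call sites in the tests below):

    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
        op_type="ethosu_conv2d",
        kernel=(3, 3),
        stride=(1, 1),
        padding=(1, 1, 1, 1),
        ifm_layout="NHWC",
        ofm_layout="NHCWB16",
        dilation=(1, 1),
        ifm_channels=32,
        ofm_channels=16,  # now reaches the NHCWB16 transform's channel row
    )
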
@@ -164,7 +164,7 @@
        ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
        ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
        ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-        ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 16, 1, 4, 4)),
+        ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 10, 1, 6, 4)),
        ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
        # Depthwise Conv2D
        ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
@@ -182,7 +182,7 @@
        ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
        ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
        ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-        ((1, 10, 6, 8), (1, 16, 1, 4, 8)),
+        ((1, 10, 6, 8), (1, 10, 1, 6, 8)),
        ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
        # Depthwise Conv2D
        ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
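
The corrected expectations follow the plain NHWC-to-NHCWB16 packing, in which a block keeps its height and width and only the channel axis is split into bricks of at most 16. A quick sketch of that rule (my inference from the listed pairs, not code from the repository):

    import math

    def pack_nhcwb16(n, h, w, c):
        # NHWC -> NHCWB16-style 5-D block shape; depths under 16 keep their depth.
        return (n, h, math.ceil(c / 16), w, min(c, 16))

    print(pack_nhcwb16(1, 10, 6, 4))  # (1, 10, 1, 6, 4), the new expected value
    print(pack_nhcwb16(1, 10, 6, 8))  # (1, 10, 1, 6, 8), likewise
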
@@ -252,20 +252,22 @@ def test_best_block_config(
        [0, 0, 0, 0, 16],
        [0, 0, 0, 0, 1],
    ]
-    nhcwb16_to_nhwc = [
-        [1, 0, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0, 0],
-        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
-        [0, 0, 0, 0, 0, 1],
-    ]
-    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
-        op_type, kernel, stride, padding, layouts[0], layouts[1], dilation, in_shape[3]
-    )

    ofm_channels = out_shape[3]
    ifm_channels = in_shape[3]

+    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
+        op_type,
+        kernel,
+        stride,
+        padding,
+        layouts[0],
+        layouts[1],
+        dilation,
+        ifm_channels,
+        ofm_channels,
+    )
+
    if layouts[0] == "NHCWB16":
        in_shape = [
            int(math.ceil(n)) for n in np.matmul(nhwc_to_nhcwb16, in_shape + (1,)).tolist()[:-1]
@@ -321,9 +323,12 @@ def test_best_block_config(
    # Add tensors
    input_tensor = cs.Tensor(in_shape, "int8")
    part.set_input(0, input_tensor)
-    if op_type in ("ethosu_conv2d", "ethosu_depthwise_conv2d"):
+    if op_type == "ethosu_conv2d":
        weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], ifm_channels], "int8")
        part.set_input(1, weight_tensor)
+    elif op_type == "ethosu_depthwise_conv2d":
+        weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], 1], "int8")
+        part.set_input(1, weight_tensor)

    output_tensor = cs.Tensor(out_shape, "int8")
    part.set_output(output_tensor)
@@ -82,6 +82,7 @@ def test_ethosu_conv2d_matcher(
        ofm_layout,
        dilation,
        ifm_channels,
+        ofm_channels,
    )

    device_config = cs.EthosuDeviceConfig("ethos-u55-256")
@@ -83,6 +83,7 @@ def test_ethosu_depthwise2d_matcher(kernel, stride, dilation, padding, ifm_layou
        ifm_layout,
        ofm_layout,
        dilation,
+        ofm_channels=ofm_channels,
    )

    device_config = cs.EthosuDeviceConfig("ethos-u55-256")
@@ -66,6 +66,7 @@ def test_ethosu_pooling_matcher(pool_shape, stride, padding, ifm_layout, ofm_lay
        padding,
        ifm_layout,
        ofm_layout,
+        ofm_channels=ofm_channels,
    )

    device_config = cs.EthosuDeviceConfig("ethos-u55-256")
