[microNPU] Fix layout transform matrix
One of the layout transforms currently causes the cascader to stripe
across the B16 axis (which is not allowed), so change that and deal with
the implications for get_valid_block_configs.

Change-Id: I04199f9f35fcc31618581567483cfb80d3b5aad2
ekalda committed Mar 25, 2022
1 parent 14084f4 commit aab1d7c
Showing 10 changed files with 88 additions and 61 deletions.
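
To see what the fix changes, here is an illustrative sketch (not code from the commit): the cascader propagates an output stripe, written in affine form, through each transform matrix to find the input region that stripe needs. The old NHCWB16-to-NHWC row [0, 0, 16, 0, 1, -16] derived the channel extent from the stripe's C and B16 extents, so a stripe that was partial in those axes propagated as a partial-depth read; the new row [0, 0, 0, 0, 0, ofm_channels] pins the channel extent to the full depth, so no stripe can split the packed B16 bricks. Assuming stripes multiply as [N, H, C, W, B, 1] vectors (ofm_channels = 32 is made up):

    # Illustrative only -- not code from the commit.
    import numpy as np

    ofm_channels = 32

    def nhcwb16_to_nhwc(channel_row):
        # Maps an NHCWB16 stripe [N, H, C, W, B, 1] (affine form) to NHWC.
        return np.array([
            [1, 0, 0, 0, 0, 0],  # N
            [0, 1, 0, 0, 0, 0],  # H
            [0, 0, 0, 1, 0, 0],  # W
            channel_row,         # NHWC channel extent
            [0, 0, 0, 0, 0, 1],  # affine constant
        ])

    old_row = [0, 0, 16, 0, 1, -16]          # channels = 16*C + B - 16
    new_row = [0, 0, 0, 0, 0, ofm_channels]  # channels pinned to the full depth

    # A stripe covering one of the two 16-wide channel bricks of a 32-channel OFM.
    stripe = np.array([1, 4, 1, 8, 16, 1])

    print(nhcwb16_to_nhwc(old_row) @ stripe)  # [1 4 8 16 1] -> a 16-channel slice
    print(nhcwb16_to_nhwc(new_row) @ stripe)  # [1 4 8 32 1] -> always the full depth
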
91 changes: 50 additions & 41 deletions python/tvm/contrib/ethosu/cascader/device_config.py
@@ -439,6 +439,23 @@ def is_partkernel(

        return part_kernel_first_utilization > depth_first_utilization or ifm_channels <= 8

+    def _get_input_banks(self, input_block_shape, input_bytewidth):
+        input_bytes = input_block_shape.area() * self._align(
+            input_block_shape.depth * input_bytewidth, 8
+        )
+        input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
+        input_banks = _round_up(input_banks, self._input_granularity)
+
+        return input_banks
+
+    def _get_accumulator_banks(self, output_block_shape, acc_bytewidth, depth):
+        acc_depth = _round_up(min(output_block_shape.depth, depth), 8)
+        acc_bytes = output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
+        acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
+        acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
+
+        return acc_banks
+
    def get_elementwise_block_config(
        self,
        ifm_propagator: Propagator,
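
For intuition, a worked example of the arithmetic the new _get_input_banks helper performs. This is a sketch: _align is treated as round-up-to-a-multiple, the ceiling-division definitions of _round_up_div/_round_up are assumed to match device_config.py, the bank size and granularity values are illustrative rather than hardware constants, and the factor of two presumably accounts for double buffering:

    def _round_up_div(a, b):
        # Ceiling division: whole banks needed to hold `a` bytes.
        return -(-a // b)

    def _round_up(a, b):
        # Round `a` up to the nearest multiple of `b`.
        return _round_up_div(a, b) * b

    # A 16x16x32 int8 input block with 1024-byte banks and an input
    # granularity of 4 banks -- illustrative numbers only.
    area, depth, bytewidth = 16 * 16, 32, 1
    bank_size_bytes, input_granularity = 1024, 4

    input_bytes = area * _round_up(depth * bytewidth, 8)           # 256 * 32 = 8192
    input_banks = _round_up_div(input_bytes, bank_size_bytes) * 2  # 8 banks, doubled to 16
    input_banks = _round_up(input_banks, input_granularity)        # already a multiple of 4

    print(input_banks)  # 16
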
@@ -533,16 +550,9 @@ def get_elementwise_block_config(
            input2_block.round_up(self._input_micro_block)

            # Banks required for input block
-            input_bytes = input_block.area() * self._align(input_block.depth * input_bytewidth, 8)
-            input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
-            input_banks = _round_up(input_banks, self._input_granularity)
-
+            input_banks = self._get_input_banks(input_block, input_bytewidth)
            # Banks required for input2 block
-            input2_bytes = input2_block.area() * self._align(
-                input2_block.depth * input_bytewidth, 8
-            )
-            input2_banks = _round_up_div(input2_bytes, self._bank_size_bytes) * 2
-            input2_banks = _round_up(input2_banks, self._input_granularity)
+            input2_banks = self._get_input_banks(input2_block, input_bytewidth)

            # Check whether or not both IFMs fit into SHRAM
            if (input_banks + input2_banks) <= banks_available:
@@ -561,6 +571,29 @@ def get_elementwise_block_config(

        return block_config

+    def _get_subkernel_propagator(
+        self, op_attrs, ifm_propagator, input_layout, output_layout, depth
+    ):
+        op_type = op_attrs.get("op")
+        stride_h = int(op_attrs.get("stride_h", 1))
+        stride_w = int(op_attrs.get("stride_w", 1))
+        transform = ifm_propagator.transform
+
+        if input_layout == "NHCWB16":
+            transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
+            transform[3][-1] = min(transform[3][-1], self._subkernel_limits[1] - stride_w)
+        else:
+            transform[1][-1] = min(transform[1][-1], self._subkernel_limits[0] - stride_h)
+            transform[2][-1] = min(transform[2][-1], self._subkernel_limits[1] - stride_w)
+
+        if op_type in ("ethosu_pooling", "ethosu_depthwise_conv2d"):
+            if output_layout == "NHCWB16" and input_layout == "NHWC":
+                transform[3][-1] = depth
+            elif output_layout == "NHCWB16" and input_layout == "NHCWB16":
+                transform[2][-1] = depth // 16
+
+        return Propagator(transform, ifm_propagator.offset)
+
    def get_valid_block_configs(
        self,
        ifm_propagator: Propagator,
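
A small sketch of the clamping that _get_subkernel_propagator applies, with assumed values: the transform's last-column entry in a spatial row is taken to be roughly the extra input extent a block reads, on the order of kernel size minus stride, and the clamp caps the read window at one subkernel:

    # Assumed value: a subkernel height limit of 8 (self._subkernel_limits[0]).
    subkernel_limit_h = 8

    for kernel_h, stride_h in [(3, 1), (9, 1), (17, 2)]:
        const_term = kernel_h - stride_h  # stands in for transform[1][-1]
        clamped = min(const_term, subkernel_limit_h - stride_h)
        print(f"kernel_h={kernel_h}, stride_h={stride_h}: {const_term} -> {clamped}")
    # kernel_h=3, stride_h=1: 2 -> 2    (small kernels are unaffected)
    # kernel_h=9, stride_h=1: 8 -> 7    (reads capped at one subkernel)
    # kernel_h=17, stride_h=2: 15 -> 6
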
@@ -612,33 +645,13 @@ def get_valid_block_configs(
        op_type = op_attrs.get("op")
        op_str = op_attrs.get("op_str")
        activation = op_attrs.get("activation", "NONE")
-        stride_h = int(op_attrs.get("stride_h", 1))
-        stride_w = int(op_attrs.get("stride_w", 1))
        upscaling_factor = 1 if op_attrs.get("upscale", "NONE") == "NONE" else 2

-        subkernel_transform = ifm_propagator.transform
        if output_layout == "NHCWB16":
            output_shape = _Shape([1, ofm_shape[1], ofm_shape[3], ofm_channels])
        else:
            output_shape = _Shape(ofm_shape)

-        if input_layout == "NHCWB16":
-            subkernel_transform[1][-1] = min(
-                subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
-            )
-            subkernel_transform[3][-1] = min(
-                subkernel_transform[3][-1], self._subkernel_limits[1] - stride_w
-            )
-        else:
-            subkernel_transform[1][-1] = min(
-                subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
-            )
-            subkernel_transform[2][-1] = min(
-                subkernel_transform[2][-1], self._subkernel_limits[1] - stride_w
-            )
-
-        subkernel_propagator = Propagator(subkernel_transform, ifm_propagator.offset)
-
        # Define search space
        max_height = min(output_shape.height, self._max_block_shape.height)
        min_height = max(self._micro_block.height, upscaling_factor)
@@ -655,7 +668,7 @@ def get_valid_block_configs(
        if activation == "LUT" and not self._lut_reserved:
            banks_available -= 2

-        # Input block depth has additional limitations for Operators that require full input depth
+        # Input block depth has additional limitations for operators that require full input depth
        input_block_depth = 0
        is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
        if op_type == "ethosu_conv2d":
@@ -669,6 +682,10 @@ def get_valid_block_configs(
                # Block depth has to be less than full depth or a multiple of the split depth
                continue

+            subkernel_propagator = self._get_subkernel_propagator(
+                op_attrs, ifm_propagator, input_layout, output_layout, depth
+            )
+
            for width in range(min_width, max_width + min_width, min_width):
                for height in range(min_height, max_height + min_height, min_height):
                    if output_layout == "NHCWB16":
@@ -709,19 +726,11 @@ def get_valid_block_configs(
                    input_block_shape.depth = input_block_depth

                    # Banks required for input block
-                    input_bytes = input_block_shape.area() * self._align(
-                        input_block_shape.depth * input_bytewidth, 8
-                    )
-                    input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
-                    input_banks = _round_up(input_banks, self._input_granularity)
-
+                    input_banks = self._get_input_banks(input_block_shape, input_bytewidth)
                    # Banks required for accumulation
-                    acc_depth = _round_up(min(output_block_shape.depth, ofm_channels), 8)
-                    acc_bytes = (
-                        output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
-                    )
-                    acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
-                    acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
+                    acc_banks = self._get_accumulator_banks(
+                        output_block_shape, acc_bytewidth, depth
+                    )

                    if (input_banks + acc_banks) <= banks_available:
                        output_cycles = self._get_output_cycles(
2 changes: 1 addition & 1 deletion python/tvm/relay/backend/contrib/ethosu/te/convolution.py
@@ -187,7 +187,7 @@ def conv2d_compute(
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, ofm_channels],
        [0, 0, 0, 0, 0, 1],
    ]
    ifm_matrix = [
2 changes: 1 addition & 1 deletion python/tvm/relay/backend/contrib/ethosu/te/depthwise.py
@@ -181,7 +181,7 @@ def depthwise_conv2d_compute(
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, channels],
        [0, 0, 0, 0, 0, 1],
    ]
    ifm_matrix = [
2 changes: 1 addition & 1 deletion python/tvm/relay/backend/contrib/ethosu/te/pooling.py
@@ -169,7 +169,7 @@ def pooling_compute(
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, int(ofm_channels)],
        [0, 0, 0, 0, 0, 1],
    ]
    ifm_matrix = [
6 changes: 4 additions & 2 deletions src/contrib/ethosu/cascader/block_config.cc
@@ -37,15 +37,17 @@ void BlockConfigNode::VisitAttrs(AttrVisitor* v) {
  v->Visit("_input_shape", &tmp_arr);
  tmp_arr = make_array(output_shape_);
  v->Visit("_output_shape", &tmp_arr);
+  v->Visit("_compute_cycles", &compute_cycles_);
+  v->Visit("_output_cycles", &output_cycles_);
}

BlockConfig::BlockConfig(const std::vector<int>& input_shape, const std::vector<int>& output_shape,
                         int compute_cycles, int output_cycles) {
  auto n = make_object<BlockConfigNode>();
  n->input_shape_ = std::move(input_shape);
  n->output_shape_ = std::move(output_shape);
-  n->compute_cycles_ = compute_cycles;
-  n->output_cycles_ = output_cycles;
+  n->compute_cycles_ = std::move(compute_cycles);
+  n->output_cycles_ = std::move(output_cycles);
  data_ = std::move(n);
}

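With the two new VisitAttrs entries, compute_cycles_ and output_cycles_ become visible through TVM's object attribute machinery. A hedged sketch of reading them from Python, assuming the visited attrs surface under the underscore names registered above, as with the cascader's other wrappers:

    # Hypothetical helper, not in the commit: summarise a BlockConfig's cost.
    def describe_block_config(block_config) -> str:
        return (
            f"compute_cycles={int(block_config._compute_cycles)}, "
            f"output_cycles={int(block_config._output_cycles)}"
        )
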
12 changes: 10 additions & 2 deletions tests/python/contrib/test_ethosu/cascader/infra.py
@@ -64,7 +64,15 @@ def create_te_graph(func):
    return te_graph, consts

def make_matrices(
-    op_type, kernel, stride, padding, ifm_layout, ofm_layout, dilation=(1, 1), ifm_channels=1
+    op_type,
+    kernel,
+    stride,
+    padding,
+    ifm_layout,
+    ofm_layout,
+    dilation=(1, 1),
+    ifm_channels=1,
+    ofm_channels=1,
):
    kernel_h, kernel_w = kernel
    stride_h, stride_w = stride
@@ -83,7 +91,7 @@ def make_matrices(
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
+        [0, 0, 0, 0, 0, ofm_channels],
        [0, 0, 0, 0, 0, 1],
    ]
    if op_type == "ethosu_conv2d":
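A hedged usage sketch of the extended test helper (the argument values, including the padding shape, are made up; the six return values follow the call sites in the tests below):

    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
        op_type="ethosu_conv2d",
        kernel=(3, 3),
        stride=(1, 1),
        padding=(1, 1, 1, 1),
        ifm_layout="NHWC",
        ofm_layout="NHCWB16",
        dilation=(1, 1),
        ifm_channels=32,
        ofm_channels=16,  # now reaches the NHCWB16 transform's channel row
    )
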
@@ -164,7 +164,7 @@
        ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
        ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
        ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-        ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 16, 1, 4, 4)),
+        ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 10, 1, 6, 4)),
        ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
        # Depthwise Conv2D
        ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
@@ -182,7 +182,7 @@
        ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
        ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
        ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-        ((1, 10, 6, 8), (1, 16, 1, 4, 8)),
+        ((1, 10, 6, 8), (1, 10, 1, 6, 8)),
        ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
        # Depthwise Conv2D
        ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
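
The corrected expectations follow the plain NHWC-to-NHCWB16 packing, in which a block keeps its height and width and only the channel axis is split into bricks of at most 16. A quick sketch of that rule (my inference from the listed pairs, not code from the repository):

    import math

    def pack_nhcwb16(n, h, w, c):
        # NHWC -> NHCWB16-style 5-D block shape; depths under 16 keep their depth.
        return (n, h, math.ceil(c / 16), w, min(c, 16))

    print(pack_nhcwb16(1, 10, 6, 4))  # (1, 10, 1, 6, 4), the new expected value
    print(pack_nhcwb16(1, 10, 6, 8))  # (1, 10, 1, 6, 8), likewise
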
@@ -252,20 +252,22 @@ def test_best_block_config(
        [0, 0, 0, 0, 16],
        [0, 0, 0, 0, 1],
    ]
-    nhcwb16_to_nhwc = [
-        [1, 0, 0, 0, 0, 0],
-        [0, 1, 0, 0, 0, 0],
-        [0, 0, 0, 1, 0, 0],
-        [0, 0, 16, 0, 1, -16],
-        [0, 0, 0, 0, 0, 1],
-    ]
-    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
-        op_type, kernel, stride, padding, layouts[0], layouts[1], dilation, in_shape[3]
-    )

    ofm_channels = out_shape[3]
    ifm_channels = in_shape[3]

+    ifm_matrix, ifm_offset, weight_matrix, weight_offset, _, _ = make_matrices(
+        op_type,
+        kernel,
+        stride,
+        padding,
+        layouts[0],
+        layouts[1],
+        dilation,
+        ifm_channels,
+        ofm_channels,
+    )
+
    if layouts[0] == "NHCWB16":
        in_shape = [
            int(math.ceil(n)) for n in np.matmul(nhwc_to_nhcwb16, in_shape + (1,)).tolist()[:-1]
@@ -321,9 +323,12 @@ def test_best_block_config(
    # Add tensors
    input_tensor = cs.Tensor(in_shape, "int8")
    part.set_input(0, input_tensor)
-    if op_type in ("ethosu_conv2d", "ethosu_depthwise_conv2d"):
+    if op_type == "ethosu_conv2d":
        weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], ifm_channels], "int8")
        part.set_input(1, weight_tensor)
+    elif op_type == "ethosu_depthwise_conv2d":
+        weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], 1], "int8")
+        part.set_input(1, weight_tensor)

    output_tensor = cs.Tensor(out_shape, "int8")
    part.set_output(output_tensor)
@@ -82,6 +82,7 @@ def test_ethosu_conv2d_matcher(
        ofm_layout,
        dilation,
        ifm_channels,
+        ofm_channels,
    )

    device_config = cs.EthosuDeviceConfig("ethos-u55-256")
@@ -83,6 +83,7 @@ def test_ethosu_depthwise2d_matcher(kernel, stride, dilation, padding, ifm_layou
        ifm_layout,
        ofm_layout,
        dilation,
+        ofm_channels=ofm_channels,
    )

    device_config = cs.EthosuDeviceConfig("ethos-u55-256")
@@ -66,6 +66,7 @@ def test_ethosu_pooling_matcher(pool_shape, stride, padding, ifm_layout, ofm_lay
        padding,
        ifm_layout,
        ofm_layout,
+        ofm_channels=ofm_channels,
    )

    device_config = cs.EthosuDeviceConfig("ethos-u55-256")
