Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Hexagon] Implement avg_pool2d slice op #11417

Merged
merged 13 commits into from
Jun 15, 2022
22 changes: 22 additions & 0 deletions python/tvm/topi/hexagon/slice_ops/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

""" Computes and Schedules for Hexagon slice ops. """

# pylint: disable=wildcard-import

from .avg_pool2d import avg_pool2d_compute, avg_pool2d_schedule, avg_pool2d_STIR_schedule
198 changes: 198 additions & 0 deletions python/tvm/topi/hexagon/slice_ops/avg_pool2d.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from tvm.ir.module import IRModule
from tvm import te
from tvm import tir
from tvm.script import tir as T
from ..utils import apply_transform, get_layout_transform_fn


# The slice op implementation for avg_pool2d makes serveral assumptions:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this be moved from a comment to a docstring?

# 1) Both input and output are a multiple of croutons, and the input is already
# padded for a given output shape as per any crouton and non-crouton related
# padding.
# 2) The current implementation assumes 'count_include_pad' to be 'True'. It can
# modified to support 'False' but the element count for the pooling window must
# be pre-computed and provided as an input to reduce the run-time overhead.
# 3) 'padding' is also ignored. It must be handled outside of the sliced op.
# 4) Please note that this implementation will not work if the output was padded
# for the croutons. Since we loop over the logical output shape, this can result
# into out-of-bound access for the input.

def avg_pool2d_compute(A, out_shape, kernel, stride, dilation):
kh, kw = kernel
rh = te.reduce_axis((0, kh), name="rh")
rw = te.reduce_axis((0, kw), name="rw")
ob, oh, ow, oc = out_shape
sh, sw = stride
dh, dw = dilation
Area = float(1) / (kh * kw)

Lunderberg marked this conversation as resolved.
Show resolved Hide resolved
Sum = te.compute(
out_shape,
lambda b, h, w, c: te.sum(
A[b, h * sh + dh * rh, w * sw + dw * rw, c].astype("float32"), axis=[rh, rw]
),
name="sum",
)
Avg = te.compute(
out_shape, lambda b, h, w, c: (Sum[b, h, w, c] * Area).astype(A.dtype), name="avg"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: The name Area threw me a bit, as I initially thought Area should be the area of the kernel relative to a single value, rather than the area of a value relative to the kernel. Can we rename Area to either InvArea or NumValues?

)
return Avg


# Schedule for input and output layout nhwc-8h2w32c2w
def STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str):
func = te.create_prim_func([ins, outs])
s = tir.Schedule(func)
Sum = s.get_block("sum")
Avg = s.get_block("avg")

apply_transform(s, Sum, 0, "read", input_layout)
apply_transform(s, Avg, 0, "write", output_layout)

# Schedule 'Sum'
bn, bh, bw, bc, rx, ry = s.get_loops(Sum)
bho, bhi = s.split(bh, [None, 8])
bwo, bwi = s.split(bw, [None, 4])
bwio, bwii = s.split(bwi, [None, 2]) # Doesn't seem to be doing anything
bco, bci = s.split(bc, [None, 32])
s.reorder(bn, bho, bwo, bco, bhi, bwio, rx, ry, bci, bwii) # --- DOESN'T do anything
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you have before and after these lines? Running the test case test_avg_pool2d_slice.py::TestAvgPool2dSlice::test_avg_pool2d_slice[nhwc-8h2w32c2w-False-str ide0-kernel0-float16-dilation0-padding0-True-nhwc-8h2w32c2w-output_shape0-False] and using print(s.mod.script()), I can see the loopnest before this line to have extents T.grid(1, 1, 8, 2, 2, 2, 1, 32, 3, 3) and afterward to have extents T.grid(1, 1, 2, 1, 8, 2, 3, 3, 32, 2), so it does look like the reorder is having an effect.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right. I do see the loops getting reordered after this line. However, when I print it again after s.compute_at(Sum, hi), I don't see the reordered/fused loopnest anymore.

bci_wii = s.fuse(bci, bwii) # --- DOESN'T do anything
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question here, after fusing I see extents T.grid(1, 1, 2, 1, 8, 2, 3, 3, 64) and can't reproduce the lack of effect.

# s.vectorize(bci_wii) # --- DOESN'T WORK -- errors out

# Schedule 'Avg'
n, h, w, c = s.get_loops(Avg)
ho, hi = s.split(h, [None, 8])
wo, wi = s.split(w, [None, 4])
wio, wii = s.split(wi, [None, 2])
co, ci = s.split(c, [None, 32])
s.reorder(n, ho, wo, co, hi, wio, ci, wii)
ci_wii = s.fuse(ci, wii)
s.vectorize(ci_wii)

s.compute_at(Sum, hi)
return s


# Schedule for output layout: n11c-1024c, input layout: nhwc-8h2w32c2w
def STIR_schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str):
func = te.create_prim_func([ins, outs])
s = tir.Schedule(func)
Sum = s.get_block("sum")
Avg = s.get_block("avg")

apply_transform(s, Sum, 0, "read", input_layout)
apply_transform(s, Avg, 0, "write", output_layout)

bn, bh, bw, bc, rx, ry = s.get_loops(Sum)
bco, bci = s.split(bc, [None, 1024])
bcio, bcii = s.split(bci, [None, 64])
s.reorder(bn, bh, bw, bco, bcio, rx, ry, bcii) # --- DOESN'T do anything
# s.vectorize(bcii) # --- DOESN'T WORK -- errors out

n, h, w, c = s.get_loops(Avg)
co, ci = s.split(c, [None, 1024])
cio, cii = s.split(ci, [None, 64])
s.vectorize(cii)

s.compute_at(Sum, cio)
return s


# TIR based schedule
def avg_pool2d_STIR_schedule(outs, ins, output_layout: str, input_layout: str):
output_layout += "-1d"
input_layout += "-1d"
if output_layout == "nhwc-8h2w32c2w-1d":
return STIR_schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout)
if output_layout == "n11c-1024c-1d":
return STIR_schedule_n11c_1024c(outs, ins, output_layout, input_layout)
else:
raise RuntimeError(f"Unexpected layout '{output_layout}'")


# Schedule for input and output layout nhwc-8h2w32c2w
def schedule_nhwc_8h2w32c2w(outs, ins, output_layout: str, input_layout: str):
A = ins
M = outs
s = te.create_schedule([M.op])
B = s[M].op.input_tensors[0]

# Apply layout transformation
input_layout = get_layout_transform_fn(input_layout)
output_layout = get_layout_transform_fn(output_layout)
s[A].transform_layout(input_layout)
M_axis = s[M].transform_layout(output_layout)

# Schedule 'M'
m_inner = s[M].fuse(M_axis[7], M_axis[6])
s[M].vectorize(m_inner)

# Schedule 'B'
bn, bh, bw, bc = s[B].op.axis
rx, ry = s[B].op.reduce_axis
bwo, bwi = s[B].split(bw, factor=4)
bwio, bwii = s[B].split(bwi, factor=2)
bco, bci = s[B].split(bc, factor=32)
s[B].reorder(bn, bco, bh, bwo, bwio, ry, rx, bci, bwii)
b_inner = s[B].fuse(bci, bwii)
# s[B].vectorize(b_inner) # Doesn't work

s[B].compute_at(s[M], M_axis[5])
return s


# Schedule for output layout: n11c-1024c, input layout: nhwc-8h2w32c2w
def schedule_n11c_1024c(outs, ins, output_layout: str, input_layout: str):
A = ins
M = outs
s = te.create_schedule([M.op])
B = s[M].op.input_tensors[0]

# Apply layout transformation
input_layout = get_layout_transform_fn(input_layout)
output_layout = get_layout_transform_fn(output_layout)
s[A].transform_layout(input_layout)
M_axis = s[M].transform_layout(output_layout)

# Schedule 'M'
mco, mci = s[M].split(M_axis[4], factor=64)
s[M].vectorize(mci)

# Schedule 'B'
bn, bh, bw, bc = s[B].op.axis
rx, ry = s[B].op.reduce_axis
bco, bci = s[B].split(bc, factor=64)
s[B].reorder(bco, rx, ry, bci)
# s[B].vectorize(bci) # Doesn't work

s[B].compute_at(s[M], mco)
return s


# te based schedule
def avg_pool2d_schedule(outs, ins, output_layout: str, input_layout: str):
output_layout += "-2d"
input_layout += "-2d"
if output_layout == "nhwc-8h2w32c2w-2d":
return schedule_nhwc_8h2w32c2w(outs, ins, output_layout, input_layout)
if output_layout == "n11c-1024c-2d":
return schedule_n11c_1024c(outs, ins, output_layout, input_layout)
else:
raise RuntimeError(f"Unexpected layout '{output_layout}'")
75 changes: 75 additions & 0 deletions python/tvm/topi/hexagon/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from tvm import te


def n11c_1024c_2d(n, h, w, c):
return [n, h, w, c // 1024, te.AXIS_SEPARATOR, c % 1024]


def n11c_1024c_1d(n, h, w, c):
return [n, h, w, c // 1024, c % 1024]


def nhwc_8h2w32c2w_2d(n, h, w, c):
return [n, h // 8, w // 4, c // 32, te.AXIS_SEPARATOR, h % 8, (w % 4) // 2, c % 32, w % 2]


def nhwc_8h2w32c2w_1d(n, h, w, c):
return [n, h // 8, w // 4, c // 32, h % 8, (w % 4) // 2, c % 32, w % 2]
Copy link
Contributor

@cconvey cconvey May 25, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would these functions' purpose be clearer if their names started with something like get_shape_..., xform_layout_..., etc?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe. Although, I would prefer the current names.



def get_layout_transform_fn(layout):
if layout == "nhwc-8h2w32c2w-2d":
return nhwc_8h2w32c2w_2d
if layout == "nhwc-8h2w32c2w-1d":
return nhwc_8h2w32c2w_1d
elif layout == "n11c-1024c-2d":
return n11c_1024c_2d
elif layout == "n11c-1024c-1d":
return n11c_1024c_1d
else:
raise RuntimeError(f"Unexpected layout '{layout}'")


def apply_transform(s, block, block_index: int, buffer_type: str, layout: str):
"""Apply transform layout on a buffer

Parameters
----------
s: Schedule
block : BlockRV
The block that accesses the target buffer
buffer_index: int
The index of the buffer in block's read or write region
buffer_type : str
Type of the buffer index, "read" or "write"
layout : str
Layout of the buffer
"""
transform_fn = get_layout_transform_fn(layout)
if layout == "nhwc-8h2w32c2w-1d":
axis_separators = [4]
elif layout == "n11c-1024c-1d":
axis_separators = [2]
else:
raise RuntimeError(f"Unexpected layout '{layout}'")

s.transform_layout(block, block_index, buffer_type, transform_fn)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI, after #11269 lands, the calling layout_transform will also handle the call to set_axis_separators, so this function may become simpler or empty.

if axis_separators:
s.set_axis_separator(block, block_index, buffer_type, axis_separators)
18 changes: 16 additions & 2 deletions tests/python/contrib/test_hexagon/infrastructure.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,7 @@ def allocate_hexagon_array(
numpy.prod(tensor_shape[dim_i:dim_f])
for dim_i, dim_f in zip(boundaries[:-1], boundaries[1:])
]

arr = tvm.nd.empty(physical_shape, dtype=dtype, device=dev)
arr = tvm.nd.empty(physical_shape, dtype=dtype, device=dev, mem_scope=mem_scope)

if data is not None:
arr.copyfrom(data.reshape(physical_shape))
Expand Down Expand Up @@ -228,3 +227,18 @@ def compute(n, ho, wo, ko, hi, wi, ki):
)

return output_shape, compute


# Transpose and reshape numpy array according to the specified layout
def transform_numpy(arr_np, layout):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function seems to assume that the supplied layout is NHWC. Is that a safe assumption for all expected uses of the function?

If no, then should we put nhwc into the function name, or perhaps change its argument list to something like (arr_np, current_layout, new_layout)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right that this function is making an assumption about the supplied layout which can transform the input incorrectly. Thanks for the suggestion, @cconvey!

if layout == "nhwc":
return arr_np
elif layout == "nhwc-8h2w32c2w":
N, H, W, C = arr_np.shape
return arr_np.reshape([N, H // 8, 8, W // 4, 2, 2, C // 32, 32]).transpose(0, 1, 3, 6, 2, 4, 7, 5)
elif layout == "n11c-1024c":
N, H, W, C = arr_np.shape
assert (H == 1 and W == 1), "The size of H and W must be 1!"
return arr_np.reshape([N, C//1024, 1024]).transpose(0, 1, 2)
Lunderberg marked this conversation as resolved.
Show resolved Hide resolved
else:
raise RuntimeError(f"Unexpected layout '{layout}'")
Loading