[TOPI] Using x86 schedules for ARM conv2d.
anijain2305 committed Apr 14, 2020
1 parent f08d5d7 commit f6bfd8c
Showing 3 changed files with 151 additions and 9 deletions.
54 changes: 46 additions & 8 deletions python/tvm/relay/op/strategy/arm_cpu.py
@@ -54,10 +54,15 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
if groups == 1:
if layout == "NCHW":
if kernel_layout == "OIHW":
# Commenting the spatial pack as x86 NCHWc schedules perform better.
# strategy.add_implementation(
# wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
# wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
# name="conv2d_nchw_spatial_pack.arm_cpu")
strategy.add_implementation(
- wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
- wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
- name="conv2d_nchw_spatial_pack.arm_cpu")
+ wrap_compute_conv2d(topi.x86.conv2d_nchw),
+ wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
+ name="conv2d_nchw.x86")
# check if winograd algorithm is applicable
_, _, kh, kw = get_const_tuple(kernel.shape)
pt, pl, pb, pr = topi.nn.get_pad_tuple(padding, (kh, kw))
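As a quick sanity check (not part of this commit), a plain NCHW/OIHW conv2d compiled for an arm_cpu target should now lower through the x86 conv2d_nchw compute and schedule registered above. A minimal sketch with assumed shapes and an assumed target string (exact LLVM flags vary by TVM version and board):

import tvm
from tvm import relay

# Assumed 32 -> 64 channel 3x3 convolution in NCHW/OIHW.
data = relay.var("data", shape=(1, 32, 56, 56), dtype="float32")
weight = relay.var("weight", shape=(64, 32, 3, 3), dtype="float32")
conv = relay.nn.conv2d(data, weight, kernel_size=(3, 3), padding=(1, 1))
mod = tvm.IRModule.from_expr(relay.Function([data, weight], conv))

# -device=arm_cpu selects the arm_cpu strategy table patched here.
target = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon"
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target)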
@@ -100,11 +105,14 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
if layout == "NCHW":
assert kernel_layout == "OIHW" or re.match(r"OIHW\d*o", kernel_layout)
- if kernel_layout == "OIHW":
- strategy.add_implementation(
- wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw),
- wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw),
- name="depthwise_conv2d_nchw.arm_cpu")

# Commenting the spatial pack as x86 NCHWc schedules perform better.
# if kernel_layout == "OIHW":
# strategy.add_implementation(
# wrap_compute_conv2d(topi.arm_cpu.depthwise_conv2d_nchw),
# wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw),
# name="depthwise_conv2d_nchw.arm_cpu")

# TODO:
# This schedule has incorrect result on some hardware platforms (like NV Jetson TX2)
# Let us comment it out but not remove.
@@ -115,6 +123,13 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
# wrap_topi_schedule(topi.arm_cpu.schedule_depthwise_conv2d_nchw_spatial_pack),
# name="depthwise_conv2d_nchw_spatial_pack.arm_cpu",
# plevel=15)

channel_multiplier = get_const_tuple(inputs[1].shape)[1]
if channel_multiplier == 1 and dilation_h == 1 and dilation_w == 1:
strategy.add_implementation(
wrap_compute_conv2d(topi.x86.depthwise_conv2d_nchw),
wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_nchw),
name="depthwise_conv2d_nchw.x86")
elif layout == "NHWC":
assert kernel_layout == "HWOI"
logger.warning("depthwise_conv2d with layout NHWC is not optimized for arm cpu.")
@@ -138,6 +153,29 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
format(layout))
return strategy

@conv2d_NCHWc_strategy.register("arm_cpu")
def conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
"""conv2d_NCHWc x86 strategy"""
strategy = _op.OpStrategy()
data, kernel = inputs
logger.warning("Trying x86 Conv NCHWc.")
strategy.add_implementation(
wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True),
wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc),
name="conv2d_NCHWc.x86")
return strategy

@depthwise_conv2d_NCHWc_strategy.register("arm_cpu")
def depthwise_conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
"""depthwise_conv2d x86 strategy"""
strategy = _op.OpStrategy()
logger.warning("Trying x86 DWC NCHWc.")
strategy.add_implementation(
wrap_compute_conv2d(topi.x86.depthwise_conv2d_NCHWc, True, True),
wrap_topi_schedule(topi.x86.schedule_depthwise_conv2d_NCHWc),
name="depthwise_conv2d_NCHWc.x86")
return strategy
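These two registrations back the relay.nn.contrib_conv2d_nchwc and contrib_depthwise_conv2d_nchwc calls emitted by the alter-op pass in the next file. A rough sketch of such a call with block sizes of 8 assumed (the real values come from the tile_ic/tile_oc knobs of the chosen config):

from tvm import relay

# Assumed NCHW8c data and OIHW8i8o kernel for a 32 -> 64 channel 3x3 conv.
data = relay.var("data", shape=(1, 4, 56, 56, 8), dtype="float32")
weight = relay.var("weight", shape=(8, 4, 3, 3, 8, 8), dtype="float32")
out = relay.nn.contrib_conv2d_nchwc(
    data, weight,
    channels=64, kernel_size=(3, 3), padding=(1, 1),
    data_layout="NCHW8c", kernel_layout="OIHW8i8o", out_layout="NCHW8c")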

def wrap_compute_conv2d_winograd_nnpack(topi_compute):
"""wrap topi compute for conv2d_winograd NNPack"""
def _compute_conv2d_nnpack(attrs, inputs, out_type):
104 changes: 104 additions & 0 deletions topi/python/topi/arm_cpu/conv2d_alter_op.py
@@ -59,6 +59,10 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
data, kernel = tinfos
out_dtype = out_type.dtype

data_tensor, kernel_tensor = tinfos
data_dtype = data_tensor.dtype
kernel_dtype = kernel_tensor.dtype

idxd = tvm.tir.indexdiv

if topi_tmpl == "conv2d_nchw_spatial_pack.arm_cpu":
@@ -169,4 +173,104 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):

return relay.nn.conv2d(*inputs, **new_attrs)

if topi_tmpl == "conv2d_NCHWc.x86":
# we only convert conv2d_NCHW to conv2d_NCHWc for x86
assert data_layout == "NCHW" and kernel_layout == "OIHW"
if cfg.is_fallback:
_get_default_config(cfg, data_tensor, kernel_tensor, strides, padding,
out_dtype, False, data_layout)
batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

# update new attrs
new_attrs['channels'] = out_channel
new_attrs['data_layout'] = 'NCHW%dc' % ic_bn
# (oc, ic, h, w) -> (OC, IC, h, w, ic, oc)
new_attrs['kernel_layout'] = 'OIHW%di%do' % (ic_bn, oc_bn)
new_attrs['out_layout'] = 'NCHW%dc' % oc_bn

# Store altered operator's config
new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
dtype=data_dtype)
new_kernel = te.placeholder((out_channel//oc_bn, in_channel//ic_bn,
kh, kw, ic_bn, oc_bn), dtype=kernel_tensor.dtype)
new_workload = autotvm.task.args_to_workload(
[new_data, new_kernel, strides, padding, dilation, new_attrs["data_layout"],
new_attrs["out_layout"], out_dtype], topi_tmpl)
dispatch_ctx.update(target, new_workload, cfg)
return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)
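A NumPy sketch (not from the commit, ic_bn = oc_bn = 8 assumed) of what the NCHW -> NCHW8c data relayout and OIHW -> OIHW8i8o kernel relayout implied by the new attributes amount to:

import numpy as np

ic_bn, oc_bn = 8, 8                                       # assumed block sizes
data = np.random.rand(1, 32, 56, 56).astype("float32")    # NCHW
kernel = np.random.rand(64, 32, 3, 3).astype("float32")   # OIHW

# NCHW -> NCHW8c: split C into blocks of ic_bn and move the block innermost.
n, c, h, w = data.shape
data_nchwc = data.reshape(n, c // ic_bn, ic_bn, h, w).transpose(0, 1, 3, 4, 2)
print(data_nchwc.shape)      # (1, 4, 56, 56, 8)

# OIHW -> OIHW8i8o: block both channel axes of the kernel.
o, i, kh, kw = kernel.shape
kernel_oihwio = (kernel.reshape(o // oc_bn, oc_bn, i // ic_bn, ic_bn, kh, kw)
                 .transpose(0, 2, 4, 5, 3, 1))
print(kernel_oihwio.shape)   # (8, 4, 3, 3, 8, 8)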

if topi_tmpl == "conv2d_NCHWc_int8.x86":
# TODO(@icemelon9, @anijain2305): Need to support data layout NHWC with kernel layout HWIO
assert data_layout == "NCHW" and kernel_layout == "OIHW"
if cfg.is_fallback:
_get_default_config_int8(cfg, data_tensor, kernel_tensor, strides, padding,
out_dtype, False, data_layout)

batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
n_elems = 4

# convert kernel data layout from 4D to 7D
data_expr, kernel_expr = inputs
kernel_IHWO = relay.transpose(kernel_expr, axes=(1, 2, 3, 0))
kernel_IHWOo = relay.reshape(kernel_IHWO, (in_channel, kh, kw, out_channel//oc_bn, oc_bn))
kernel_OHWoI = relay.transpose(kernel_IHWOo, axes=(3, 1, 2, 4, 0))
kernel_OHWoIi = relay.reshape(kernel_OHWoI, (out_channel//oc_bn, kh, kw, oc_bn,
in_channel//ic_bn, ic_bn))
kernel_OHWoIie = relay.reshape(kernel_OHWoIi, (out_channel//oc_bn, kh, kw, oc_bn,
in_channel//ic_bn, ic_bn//n_elems, n_elems))
kernel_OIHWioe = relay.transpose(kernel_OHWoIie, axes=(0, 4, 1, 2, 5, 3, 6))

# update new attrs
new_attrs['channels'] = out_channel
new_attrs['data_layout'] = 'NCHW%dc' % ic_bn
new_attrs['out_layout'] = 'NCHW%dc' % oc_bn

# Store altered operator's config.
new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
dtype=data_dtype)
new_kernel = te.placeholder((out_channel // oc_bn,
in_channel // ic_bn,
kh,
kw,
ic_bn // n_elems,
oc_bn,
n_elems), dtype=kernel_dtype)
new_workload = autotvm.task.args_to_workload(
[new_data, new_kernel, strides, padding, dilation, new_attrs['data_layout'],
new_attrs['out_layout'], out_dtype], topi_tmpl)
dispatch_ctx.update(target, new_workload, cfg)

return relay.nn.contrib_conv2d_nchwc(data_expr, kernel_OIHWioe, **new_attrs)
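The transpose/reshape chain above packs the 4-D OIHW int8 kernel into the 7-D blocked layout used by the int8 NCHWc schedule. A NumPy mirror of the same chain (kernel shape and block sizes assumed) that reproduces the new_kernel placeholder shape:

import numpy as np

O, I, kh, kw = 64, 64, 3, 3                   # assumed kernel shape
oc_bn, ic_bn, n_elems = 16, 16, 4             # assumed block sizes
k = np.random.randint(-128, 128, size=(O, I, kh, kw)).astype("int8")

k = k.transpose(1, 2, 3, 0)                                    # IHWO
k = k.reshape(I, kh, kw, O // oc_bn, oc_bn)                    # IHWOo
k = k.transpose(3, 1, 2, 4, 0)                                 # OHWoI
k = k.reshape(O // oc_bn, kh, kw, oc_bn, I // ic_bn, ic_bn)    # OHWoIi
k = k.reshape(O // oc_bn, kh, kw, oc_bn, I // ic_bn, ic_bn // n_elems, n_elems)
k = k.transpose(0, 4, 1, 2, 5, 3, 6)                           # blocked OIHW
print(k.shape)   # (4, 4, 3, 3, 4, 16, 4), matching the placeholder above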

if topi_tmpl == "depthwise_conv2d_NCHWc.x86":
assert data_layout == "NCHW" and kernel_layout == "OIHW"
if cfg.is_fallback:
_get_default_config(cfg, data_tensor, kernel_tensor, strides, padding,
out_dtype, True, data_layout)

batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
assert channel_multiplier == 1

# update new attrs
new_attrs['channels'] = out_channel
new_attrs['data_layout'] = 'NCHW%dc' % ic_bn
new_attrs['kernel_layout'] = 'OIHW1i%do' % oc_bn
new_attrs['out_layout'] = 'NCHW%dc' % oc_bn

# Store altered operator's config.
new_data = te.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
dtype=data_dtype)
new_kernel = te.placeholder((out_channel//oc_bn, 1, kh, kw, 1, oc_bn), dtype=kernel_dtype)
new_workload = autotvm.task.args_to_workload(
[new_data, new_kernel, strides, padding, dilation, new_attrs['data_layout'],
new_attrs['out_layout'], out_dtype], topi_tmpl)
dispatch_ctx.update(target, new_workload, cfg)
return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs)
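Likewise, a short NumPy sketch (oc_bn = 8 assumed) of the OIHW -> OIHW1i8o kernel packing that the depthwise branch requests through kernel_layout:

import numpy as np

oc_bn = 8
kernel = np.random.rand(64, 1, 3, 3).astype("float32")   # OIHW, multiplier == 1
o, i, kh, kw = kernel.shape
packed = (kernel.reshape(o // oc_bn, oc_bn, i, kh, kw)
          .transpose(0, 2, 3, 4, 1)                      # (O//8, 1, kh, kw, 8)
          .reshape(o // oc_bn, i, kh, kw, 1, oc_bn))
print(packed.shape)   # (8, 1, 3, 3, 1, 8), matching the placeholder above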

return None
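For context, these branches run when Relay's AlterOpLayout pass visits a conv2d whose chosen implementation is one of the x86 templates above; relay.build applies the pass automatically at opt_level 3. A usage sketch (API names as of the TVM of this era; the target string is an assumption):

import tvm
from tvm import relay

def alter_for_arm(mod):
    # `mod` is assumed to be a relay.IRModule containing conv2d ops.
    # The alter hooks consult the current target, so set a target context.
    with tvm.target.create("llvm -device=arm_cpu"):
        with tvm.transform.PassContext(opt_level=3):
            return relay.transform.AlterOpLayout()(mod)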
2 changes: 1 addition & 1 deletion topi/python/topi/x86/conv2d.py
@@ -169,7 +169,7 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layo

cfg.define_split("tile_ic", in_channel, num_outputs=2)
cfg.define_split("tile_oc", num_filter, num_outputs=2)
cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64)
cfg.define_split("tile_ow", ow, num_outputs=2, filter=lambda y: y.size[-1] <= 64, policy="verbose")
if is_kernel_1x1:
cfg.define_knob("tile_oh", [1, 2] if oh > 1 else [1])
else:
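The single change in this file widens the tile_ow tuning space: with policy="verbose", my understanding is that autotvm proposes both exact factors of ow and power-of-two inner sizes (still capped at 64 by the filter), which helps ARM workloads whose output widths have few useful factors. A plain-Python illustration of the candidate inner sizes for ow = 56:

# Illustration only, not autotvm code.
ow = 56
factors = [x for x in range(1, ow + 1) if ow % x == 0]
power2 = [2 ** k for k in range(ow.bit_length()) if 2 ** k <= ow]
verbose = sorted(set(factors) | set(power2))
print(factors)   # [1, 2, 4, 7, 8, 14, 28, 56]
print(verbose)   # [1, 2, 4, 7, 8, 14, 16, 28, 32, 56]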
