giuseros commented on a change in pull request #6095:
URL: https://github.com/apache/incubator-tvm/pull/6095#discussion_r458814745
##########
File path: topi/python/topi/arm_cpu/depthwise_conv2d.py
##########
@@ -181,6 +181,154 @@ def depthwise_conv2d_nchw_spatial_pack(cfg, data, kernel,
strides, padding, dila
return _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation,
out_dtype, num_tile=2)
[email protected]_topi_compute("depthwise_conv2d_nhwc.arm_cpu")
+def compute_depthwise_conv2d_nhwc(_, data, kernel, strides, padding, dilation,
out_dtype):
+ """TOPI compute callback for depthwise_conv2d nhwc
+
+ Parameters
+ ----------
+ cfg: ConfigEntity
+ The config for this template
+
+ data : tvm.te.Tensor
+ 4-D with shape [batch, in_height, in_width, in_channel]
+
+ kernel : tvm.te.Tensor
+ 4-D with shape [filter_height, filter_width, in_channel,
channel_multiplier]
+
+ strides : list of two ints
+ [stride_height, stride_width]
+
+ padding : list of two ints
+ [pad_height, pad_width]
+
+ dilation : list of two ints
+ [dilation_height, dilation_width]
+
+ out_dtype: str
+ The output type. This is used for mixed precision.
+
+ Returns
+ -------
+ output : tvm.te.Tensor
+ 4-D with shape [batch, out_height, out_width, out_channel]
+ """
+
+ out_dtype = out_dtype or data.dtype
+
+ N, IH, IW, IC = get_const_tuple(data.shape)
+
+ if isinstance(dilation, int):
+ dilation_h = dilation_w = dilation
+ else:
+ dilation_h, dilation_w = dilation
+
+ KH, KW, IC, channel_multiplier = get_const_tuple(kernel.shape)
+
+ dilated_kernel_h = (KH - 1) * dilation_h + 1
+ dilated_kernel_w = (KW - 1) * dilation_w + 1
+
+ pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
+ padding, (dilated_kernel_h, dilated_kernel_w))
+ HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides,
strides)
+
+ OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
+ OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
+
+ if pad_top or pad_left:
+ data_pad = nn.pad(data, [0, pad_top, pad_left, 0], [0, pad_down,
pad_right, 0],
+ name="data_pad")
+ else:
+ data_pad = data
+
+ output_shape = (N, OH, OW, IC*channel_multiplier)
+
+ idxdiv = tvm.tir.indexdiv
+ idxmod = tvm.tir.indexmod
+
+ reduce_h = te.reduce_axis((0, KH), name='reduce_h')
+ reduce_w = te.reduce_axis((0, KW), name='reduce_w')
+
+ out = te.compute(output_shape, lambda n, h, w, c:
+ te.sum(data_pad[n,
+ HSTR*h+dilation_h*reduce_h,
+ w*WSTR+reduce_w*dilation_w,
+ idxdiv(c,
channel_multiplier)].astype(out_dtype) *
+ kernel[reduce_h,
+ reduce_w,
+ idxdiv(c, channel_multiplier),
+ idxmod(c,
channel_multiplier)].astype(out_dtype),
+ axis=[reduce_h, reduce_w]),
+ name='depthwise_conv2d_nhwc_output')
+ return out
+
[email protected]_topi_schedule("depthwise_conv2d_nhwc.arm_cpu")
+def schedule_depthwise_conv2d_nhwc(cfg, outs):
+ """Create the schedule for depthwise_conv2d_nchw_spatial_pack"""
+ outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
+ s = te.create_schedule([x.op for x in outs])
+ out = outs[0]
+
+ ##### space definition begin #####
+ n, h, w, c = s[out].op.axis
+ cfg.define_split('tile_c', c, num_outputs=2)
+ _, hi = cfg.define_split('tile_h', h, num_outputs=2)
+ _, wi = cfg.define_split('tile_w', w, num_outputs=2)
+ cfg.define_annotate('locate_output', [hi, wi], 'locate_cache',
num_anchor=1)
+
+ # fallback support
+ if cfg.is_fallback:
+ cfg['tile_c'] = SplitEntity([-1, 8])
+ cfg['tile_h'] = SplitEntity([-1, 2])
+ cfg['tile_w'] = SplitEntity([-1, 2])
+ cfg['locate_output'] = AnnotateEntity([1])
+ ##### space definition end #####
+
+ def schedule_conv(conv):
+ conv_data = conv.op.input_tensors[0]
+ if conv_data.name == "data_pad":
+ s[conv_data].compute_inline()
Review comment:
Since those will be other 4 knobs to add, I extracted and tuned the
depthwise operators in mobilenet_v2 with the 4 different policies (no padding,
inline, `compute_at{ho,wo}`) and reported the results in terms of TFlite/TVM
times (higher is better)
|H/W | C |S | inline | compute_at(ho)
|compute_at(wo) | no-inline |
|----|-----|---|------------------------|----------------------|---------------------|----------------------|
|112 | 96 |2 | 1.452941176470588 | 0.7042857142857142
|0.8355932203389829 | 0.36249999999999993 |
|56 |144 |1 | 1.7249999999999999 | 0.85
|0.9714285714285715 | 1.38 |
|56 |144 |2 | 3.028571428571429 | 1.3187499999999999
|0.45869565217391306 | 1.5214285714285716 |
|28 |192 |1 | 1.711111111111111 | 0.76
|0.5166666666666667 | 1.409090909090909 |
|28 |192 |2 | 1.6833333333333333 | 0.5666666666666667
|0.48095238095238096 | 1.442857142857143 |
|14 |384 |1 | 3.15 | 1.26 |0.63
| 0.5727272727272728 |
|14 |576 |1 | 0.8863636363636364 | 0.97 |0.40625
| 0.527027027027027 |
|14 |576 |2 | 2.4 | 0.6857142857142858
|0.7000000000000001 | 0.6714285714285715 |
|7 |960 |1 | 2.9272727272727272 | 1.211320754716981
|0.9056338028169014 | 1.3416666666666668 |
* Since this is a memory bound operator, not inlining padding is always
going to behave poorly, so I would remove it to reduce the tuning time. There
would be the argument also to avoid the `compute_at` policies, but since I
didn't try other networks (and in small cases it seems to run better) I would
leave those 3 knobs (instead of 4)
* Except one case, we are always faster (sometimes a lot faster) than
TFlite. Once I am done with this and other few improvements I will compare
with ACL as well.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]