This is an automated email from the ASF dual-hosted git repository.

lmzheng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new c1bca8e  [AutoScheduler] Enable winograd for conv2d and layout rewrite 
for conv3d (#7168)
c1bca8e is described below

commit c1bca8e32c53a0a04a7287be82bcc5735edb8abc
Author: Lianmin Zheng <[email protected]>
AuthorDate: Sat Dec 26 03:45:08 2020 -0800

    [AutoScheduler] Enable winograd for conv2d and layout rewrite for conv3d 
(#7168)
    
    * [AutoScheduler] Enable winograd for conv2d & Enable layout rewrite for 
conv3d
    
    * fix test
    
    * fix test
    
    * update tutorials
---
 docs/conf.py                                       |   7 +-
 include/tvm/relay/attrs/nn.h                       |  14 +--
 python/tvm/relay/op/strategy/generic.py            |  18 ++--
 python/tvm/relay/op/strategy/x86.py                | 102 ++++++++++++++++++---
 python/tvm/topi/cuda/conv2d_winograd.py            |   9 +-
 python/tvm/topi/nn/conv2d.py                       |  56 +++++++++--
 python/tvm/topi/nn/conv3d.py                       |  40 +++++++-
 python/tvm/topi/x86/conv2d_alter_op.py             |  54 ++++++++---
 .../search_policy/sketch_policy_rules.cc           |  13 ++-
 src/relay/op/nn/convolution.h                      |  12 ++-
 .../transforms/auto_scheduler_layout_rewrite.cc    |  12 ++-
 .../relay/test_auto_scheduler_layout_rewrite.py    |  90 ++++++++++++++----
 .../relay/test_auto_scheduler_task_extraction.py   |   2 +-
 tests/python/relay/test_auto_scheduler_tuning.py   |  32 ++++++-
 14 files changed, 379 insertions(+), 82 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index a7198bf..ad838f7 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -250,7 +250,12 @@ within_subsection_order = {
         "tune_relay_arm.py",
         "tune_relay_mobile_gpu.py",
     ],
-    "auto_scheduler": ["tune_matmul_x86.py", "tune_conv2d_layer_cuda.py"],
+    "auto_scheduler": [
+        "tune_matmul_x86.py",
+        "tune_conv2d_layer_cuda.py",
+        "tune_network_x86.py",
+        "tune_network_cuda.py",
+    ],
     "dev": ["low_level_custom_pass.py", "use_pass_infra.py", 
"bring_your_own_datatypes.py"],
 }
 
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 7bfd580..4d867be 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -210,9 +210,10 @@ struct Conv2DWinogradAttrs : public 
tvm::AttrsNode<Conv2DWinogradAttrs> {
   int groups;
   IndexExpr channels;
   Array<IndexExpr> kernel_size;
-  std::string data_layout;
-  std::string kernel_layout;
-  std::string out_layout;
+  tvm::String data_layout;
+  tvm::String kernel_layout;
+  tvm::String out_layout;
+  tvm::String auto_scheduler_rewritten_layout;  // The layout after 
auto-scheduler's layout rewrite
   DataType out_dtype;
 
   TVM_DECLARE_ATTRS(Conv2DWinogradAttrs, "relay.attrs.Conv2DWinogradAttrs") {
@@ -300,9 +301,10 @@ struct Conv3DAttrs : public tvm::AttrsNode<Conv3DAttrs> {
   int groups;
   IndexExpr channels;
   Array<IndexExpr> kernel_size;
-  std::string data_layout;
-  std::string kernel_layout;
-  std::string out_layout;
+  tvm::String data_layout;
+  tvm::String kernel_layout;
+  tvm::String out_layout;
+  tvm::String auto_scheduler_rewritten_layout;  // The layout after 
auto-scheduler's layout rewrite
   DataType out_dtype;
 
   TVM_DECLARE_ATTRS(Conv3DAttrs, "relay.attrs.Conv3DAttrs") {
diff --git a/python/tvm/relay/op/strategy/generic.py 
b/python/tvm/relay/op/strategy/generic.py
index 6864266..ea572ba 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -28,7 +28,10 @@ logger = logging.getLogger("strategy")
 
 
 def naive_schedule(_, outs, target):
-    """Return the naive default schedule"""
+    """Return the naive default schedule.
+    This function acts as a placeholder for op implementations that use 
auto-scheduler.
+    Implementations using this function should only be used along with 
auto-scheduler.
+    """
     if "gpu" in target.keys:
         # For GPU, we at least need thread binding to make a valid schedule.
         # So the naive schedule cannot be compiled.
@@ -502,7 +505,7 @@ def conv3d_transpose_strategy(attrs, inputs, out_type, 
target):
 
 
 # conv3d
-def wrap_compute_conv3d(topi_compute, need_layout=False):
+def wrap_compute_conv3d(topi_compute, need_layout=False, 
need_auto_scheduler_layout=False):
     """wrap conv3d topi compute"""
 
     def _compute_conv3d(attrs, inputs, out_type):
@@ -519,11 +522,14 @@ def wrap_compute_conv3d(topi_compute, need_layout=False):
             raise ValueError("Dilation should be positive value")
         if groups != 1:
             raise ValueError("Not support arbitrary group number for conv3d")
+
+        args = [inputs[0], inputs[1], strides, padding, dilation]
         if need_layout:
-            out = topi_compute(inputs[0], inputs[1], strides, padding, 
dilation, layout, out_dtype)
-        else:
-            out = topi_compute(inputs[0], inputs[1], strides, padding, 
dilation, out_dtype)
-        return [out]
+            args.append(layout)
+        args.append(out_dtype)
+        if need_auto_scheduler_layout:
+            args.append(get_auto_scheduler_rewritten_layout(attrs))
+        return [topi_compute(*args)]
 
     return _compute_conv3d
 
diff --git a/python/tvm/relay/op/strategy/x86.py 
b/python/tvm/relay/op/strategy/x86.py
index 841213a..9e3e191 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -91,6 +91,7 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
     """conv2d x86 strategy"""
     strategy = _op.OpStrategy()
     data, kernel = inputs
+    stride_h, stride_w = get_const_tuple(attrs.strides)
     dilation_h, dilation_w = get_const_tuple(attrs.dilation)
     groups = attrs.groups
     layout = attrs.data_layout
@@ -125,6 +126,35 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
                 wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc),
                 name="conv2d_nhwc.x86",
             )
+
+            judge_winograd_auto_scheduler = False
+            if len(kernel.shape) == 4:
+                kernel_h, kernel_w, _, co = get_const_tuple(kernel.shape)
+                judge_winograd_auto_scheduler = (
+                    "float" in data.dtype
+                    and "float" in kernel.dtype
+                    and kernel_h == 3
+                    and kernel_w == 3
+                    and stride_h == 1
+                    and stride_w == 1
+                    and dilation_h == 1
+                    and dilation_w == 1
+                    and 64 < co < 512
+                    # The last condition of co is based on our profiling of 
resnet workloads
+                    # on skylake avx512 machines. We found winograd is faster 
than direct
+                    # only when co is within this range
+                )
+
+            # register auto-scheduler implementations
+            if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler:
+                strategy.add_implementation(
+                    wrap_compute_conv2d(
+                        topi.nn.conv2d_winograd_nhwc, 
need_auto_scheduler_layout=True
+                    ),
+                    naive_schedule,  # this implementation should never be 
picked by autotvm
+                    name="conv2d_nhwc.winograd",
+                    plevel=15,
+                )
         elif layout == "HWCN":
             assert kernel_layout == "HWIO"
             if not is_auto_scheduler_enabled():
@@ -269,20 +299,39 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target):
     """conv3d generic strategy"""
     strategy = _op.OpStrategy()
     layout = attrs.data_layout
-    if layout == "NCDHW":
-        strategy.add_implementation(
-            wrap_compute_conv3d(topi.x86.conv3d_ncdhw),
-            wrap_topi_schedule(topi.x86.schedule_conv3d_ncdhw),
-            name="conv3d_ncdhw.x86",
-        )
-    elif layout == "NDHWC":
-        strategy.add_implementation(
-            wrap_compute_conv3d(topi.x86.conv3d_ndhwc),
-            wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc),
-            name="conv3d_ndhwc.x86",
-        )
+    if is_auto_scheduler_enabled():
+        # Use auto-scheduler. We should provide clear compute definition 
without autotvm templates
+        # or packed layouts.
+        if layout == "NCDHW":
+            strategy.add_implementation(
+                wrap_compute_conv3d(topi.nn.conv3d_ncdhw, 
need_auto_scheduler_layout=True),
+                naive_schedule,
+                name="conv3d_ncdhw.x86",
+            )
+        elif layout == "NDHWC":
+            strategy.add_implementation(
+                wrap_compute_conv3d(topi.nn.conv3d_ndhwc, 
need_auto_scheduler_layout=True),
+                naive_schedule,
+                name="conv3d_ndhwc.x86",
+            )
+        else:
+            raise ValueError("Not support this layout {} yet".format(layout))
     else:
-        raise ValueError("Not support this layout {} yet".format(layout))
+        # Use autotvm templates
+        if layout == "NCDHW":
+            strategy.add_implementation(
+                wrap_compute_conv3d(topi.x86.conv3d_ncdhw),
+                wrap_topi_schedule(topi.x86.schedule_conv3d_ncdhw),
+                name="conv3d_ncdhw.x86",
+            )
+        elif layout == "NDHWC":
+            strategy.add_implementation(
+                wrap_compute_conv3d(topi.x86.conv3d_ndhwc),
+                wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc),
+                name="conv3d_ndhwc.x86",
+            )
+        else:
+            raise ValueError("Not support this layout {} yet".format(layout))
     return strategy
 
 
@@ -476,3 +525,30 @@ def scatter_nd_strategy_cpu(attrs, inputs, out_type, 
target):
         plevel=10,
     )
     return strategy
+
+
+@conv2d_winograd_without_weight_transfrom_strategy.register("cpu")
+def conv2d_winograd_without_weight_transfrom_strategy_cpu(attrs, inputs, 
out_type, target):
+    """conv2d_winograd_without_weight_transfrom cpu strategy"""
+    dilation = attrs.get_int_tuple("dilation")
+    groups = attrs.get_int("groups")
+    layout = attrs.data_layout
+    strides = attrs.get_int_tuple("strides")
+    assert dilation == (1, 1), "Do not support dilate now"
+    assert strides == (1, 1), "Do not support strides now"
+    assert groups == 1, "Do not support arbitrary group number"
+    strategy = _op.OpStrategy()
+    if layout == "NHWC":
+        strategy.add_implementation(
+            wrap_compute_conv2d(
+                topi.nn.conv2d_winograd_nhwc_without_weight_transform,
+                need_auto_scheduler_layout=True,
+            ),
+            naive_schedule,
+            name="ansor.winograd",
+        )
+    else:
+        raise RuntimeError(
+            "Unsupported conv2d_winograd_without_weight_transfrom layout 
{}".format(layout)
+        )
+    return strategy
diff --git a/python/tvm/topi/cuda/conv2d_winograd.py 
b/python/tvm/topi/cuda/conv2d_winograd.py
index 96ab064..8a3f009 100644
--- a/python/tvm/topi/cuda/conv2d_winograd.py
+++ b/python/tvm/topi/cuda/conv2d_winograd.py
@@ -363,7 +363,14 @@ def 
schedule_conv2d_nchw_winograd_without_weight_transform(cfg, outs):
 
 @conv2d_winograd_nhwc.register(["cuda", "gpu"])
 def conv2d_winograd_nhwc_cuda(
-    data, weight, strides, padding, dilation, out_dtype, pre_computed=False
+    data,
+    weight,
+    strides,
+    padding,
+    dilation,
+    out_dtype,
+    pre_computed=False,
+    auto_scheduler_rewritten_layout="",
 ):
     """Conv2D Winograd in NHWC layout.
     This is a clean version to be used by the auto-scheduler for both CPU and 
GPU.
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index e2384c4..886470b 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -982,6 +982,7 @@ def _conv2d_winograd_nhwc_impl(
     out_dtype,
     tile_size,
     pre_computed=False,
+    auto_scheduler_rewritten_layout="",
 ):
     """Conv2D Winograd implementation in NHWC layout.
     This is a clean version to be used by the auto-scheduler for both CPU and 
GPU.
@@ -1002,8 +1003,10 @@ def _conv2d_winograd_nhwc_impl(
         Specifies the output data type.
     tile_size : int
         The size of the tile to use for the Winograd filter
-    pre_computed: bool
+    pre_computed: bool = False
         Whether the kernel is precomputed
+    auto_scheduler_rewritten_layout: str = ""
+        The layout after auto-scheduler's layout rewrite pass.
 
     Returns
     -------
@@ -1020,7 +1023,16 @@ def _conv2d_winograd_nhwc_impl(
     if not pre_computed:
         KH, KW, CI, CO = get_const_tuple(weight.shape)
     else:
-        H_CAT, W_CAT, CO, CI = get_const_tuple(weight.shape)
+        if auto_scheduler_rewritten_layout:
+            H_CAT, W_CAT, CO, CI = get_const_tuple(
+                auto_scheduler.get_shape_from_rewritten_layout(
+                    auto_scheduler_rewritten_layout, ["eps", "nu", "co", "ci"]
+                )
+            )
+            auto_scheduler.remove_index_check(weight)
+        else:
+            H_CAT, W_CAT, CO, CI = get_const_tuple(weight.shape)
+
         KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
 
     pad_t, pad_l, pad_b, pad_r = get_pad_tuple(padding, (KH, KW))
@@ -1052,8 +1064,10 @@ def _conv2d_winograd_nhwc_impl(
             ),
             name="kernel_pack",
         )
+        attrs = {}
     else:
         kernel_pack = weight
+        attrs = {"layout_free_placeholders": [kernel_pack]}
 
     # pack data tile
     input_tile = te.compute(
@@ -1085,9 +1099,12 @@ def _conv2d_winograd_nhwc_impl(
             data_pack[eps][nu][p][ci] * kernel_pack[eps][nu][co][ci], axis=[ci]
         ),
         name="bgemm",
-        attrs={"layout_free_placeholders": [kernel_pack]},
+        attrs=attrs,
     )
 
+    if auto_scheduler_rewritten_layout:
+        bgemm = auto_scheduler.rewrite_compute_body(bgemm, 
auto_scheduler_rewritten_layout)
+
     # inverse transform
     r_a = te.reduce_axis((0, alpha), "r_a")
     r_b = te.reduce_axis((0, alpha), "r_b")
@@ -1112,7 +1129,16 @@ def _conv2d_winograd_nhwc_impl(
 
 
 @tvm.target.generic_func
-def conv2d_winograd_nhwc(data, weight, strides, padding, dilation, out_dtype, 
pre_computed=False):
+def conv2d_winograd_nhwc(
+    data,
+    weight,
+    strides,
+    padding,
+    dilation,
+    out_dtype,
+    pre_computed=False,
+    auto_scheduler_rewritten_layout="",
+):
     """Conv2D Winograd in NHWC layout.
     This is a clean version to be used by the auto-scheduler for both CPU and 
GPU.
 
@@ -1132,6 +1158,8 @@ def conv2d_winograd_nhwc(data, weight, strides, padding, 
dilation, out_dtype, pr
         Specifies the output data type.
     pre_computed: bool
         Whether the kernel is precomputed
+    auto_scheduler_rewritten_layout: str = ""
+        The layout after auto-scheduler's layout rewrite pass.
 
     Returns
     -------
@@ -1149,11 +1177,18 @@ def conv2d_winograd_nhwc(data, weight, strides, 
padding, dilation, out_dtype, pr
         out_dtype,
         tile_size,
         pre_computed,
+        auto_scheduler_rewritten_layout,
     )
 
 
 def conv2d_winograd_nhwc_without_weight_transform(
-    data, weight, strides, padding, dilation, out_dtype
+    data,
+    weight,
+    strides,
+    padding,
+    dilation,
+    out_dtype,
+    auto_scheduler_rewritten_layout="",
 ):
     """Conv2D Winograd without layout transform in NHWC layout.
     This is a clean version to be used by the auto-scheduler for both CPU and 
GPU.
@@ -1172,6 +1207,8 @@ def conv2d_winograd_nhwc_without_weight_transform(
         dilation size, or [dilation_height, dilation_width]
     out_dtype : str, optional
         Specifies the output data type.
+    auto_scheduler_rewritten_layout: str = ""
+        The layout after auto-scheduler's layout rewrite pass.
 
     Returns
     -------
@@ -1180,5 +1217,12 @@ def conv2d_winograd_nhwc_without_weight_transform(
     """
 
     return conv2d_winograd_nhwc(
-        data, weight, strides, padding, dilation, out_dtype, pre_computed=True
+        data,
+        weight,
+        strides,
+        padding,
+        dilation,
+        out_dtype,
+        pre_computed=True,
+        auto_scheduler_rewritten_layout=auto_scheduler_rewritten_layout,
     )
diff --git a/python/tvm/topi/nn/conv3d.py b/python/tvm/topi/nn/conv3d.py
index f3cda28..2679588 100644
--- a/python/tvm/topi/nn/conv3d.py
+++ b/python/tvm/topi/nn/conv3d.py
@@ -18,7 +18,7 @@
 # pylint: disable=unused-argument, redefined-builtin, no-else-return
 """Conv3D operators"""
 import tvm
-from tvm import te
+from tvm import te, auto_scheduler
 
 from .pad import pad
 from .utils import get_pad_tuple3d
@@ -104,7 +104,15 @@ def conv3d_ncdhw(Input, Filter, stride, padding, dilation, 
out_dtype=None):
     )
 
 
-def conv3d_ndhwc(Input, Filter, stride, padding, dilation, 
out_dtype="float32"):
+def conv3d_ndhwc(
+    Input,
+    Filter,
+    stride,
+    padding,
+    dilation,
+    out_dtype="float32",
+    auto_scheduler_rewritten_layout="",
+):
     """Convolution operator in NDHWC layout.
 
     Parameters
@@ -124,6 +132,12 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, 
out_dtype="float32"):
     dilation: int or a list/tuple of three ints
         dilation size, or [dilation_depth, dilation_height, dilation_width]
 
+    out_dtype: str = "float32",
+        The type of output tensor
+
+    auto_scheduler_rewritten_layout: str = ""
+        The layout after auto-scheduler's layout rewrite pass.
+
     Returns
     -------
     Output : tvm.te.Tensor
@@ -143,7 +157,22 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, 
out_dtype="float32"):
         dilation_d, dilation_h, dilation_w = dilation
 
     batch, in_depth, in_height, in_width, in_channel = Input.shape
-    kernel_d, kernel_h, kernel_w, channel, num_filter = Filter.shape
+
+    if auto_scheduler_rewritten_layout:
+        # Infer shape for the rewritten layout
+        (
+            kernel_d,
+            kernel_h,
+            kernel_w,
+            channel,
+            num_filter,
+        ) = auto_scheduler.get_shape_from_rewritten_layout(
+            auto_scheduler_rewritten_layout, ["rd", "rh", "rw", "rc", "cc"]
+        )
+        auto_scheduler.remove_index_check(Filter)
+    else:
+        kernel_d, kernel_h, kernel_w, channel, num_filter = Filter.shape
+
     # compute the output shape
     dilated_kernel_d = (kernel_d - 1) * dilation_d + 1
     dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
@@ -178,7 +207,12 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation, 
out_dtype="float32"):
         ),
         name="Conv3dOutput",
         tag="conv3d_ndhwc",
+        attrs={"layout_free_placeholders": [Filter]},
     )
+
+    if auto_scheduler_rewritten_layout:
+        Output = auto_scheduler.rewrite_compute_body(Output, 
auto_scheduler_rewritten_layout)
+
     return Output
 
 
diff --git a/python/tvm/topi/x86/conv2d_alter_op.py 
b/python/tvm/topi/x86/conv2d_alter_op.py
index db3c232..979dc5a 100644
--- a/python/tvm/topi/x86/conv2d_alter_op.py
+++ b/python/tvm/topi/x86/conv2d_alter_op.py
@@ -40,21 +40,6 @@ _OIHWio_matcher = re.compile("^OIHW[0-9]+i[0-9]+o$")
 def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
     target = tvm.target.Target.current(allow_none=False)
     dispatch_ctx = autotvm.task.DispatchContext.current
-    if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest):
-        cfg = dispatch_ctx.query(target, None)
-        workload = cfg.workload
-    else:
-        _, outs = relay.backend.compile_engine.select_implementation(
-            relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
-        )
-        workload = autotvm.task.get_workload(outs)
-        if workload is None:
-            # The best implementation is not an AutoTVM template,
-            # we then assume it's not necessary to alter this op.
-            return None
-        cfg = dispatch_ctx.query(target, workload)
-
-    topi_tmpl = workload[0]
     new_attrs = {k: attrs[k] for k in attrs.keys()}
 
     # Parse the attributes.
@@ -68,6 +53,45 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
     kernel_dtype = kernel_tensor.dtype
     out_dtype = out_type.dtype
 
+    if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest):
+        cfg = dispatch_ctx.query(target, None)
+        workload = cfg.workload
+    else:
+        impl, outs = relay.backend.compile_engine.select_implementation(
+            relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
+        )
+        workload = autotvm.task.get_workload(outs)
+        if workload is None:
+            # The best implementation is not an AutoTVM template.
+            # It may be from the auto-scheduler
+            if impl.name.find("winograd") != -1:
+                if dilation != (1, 1):
+                    logger.warning("Does not support weight pre-transform for 
dilated convolution.")
+                    return None
+
+                assert data_layout == "NHWC" and kernel_layout == "HWIO"
+                N, H, W, CI = get_const_tuple(data_tensor.shape)
+                KH, KW, _, CO = get_const_tuple(kernel_tensor.shape)
+
+                # Pre-compute weight transformation in winograd
+                tile_size = 4
+                # HWIO -> OIHW
+                kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0, 
1])
+                # alpha, alpha, CO, CI
+                weight = relay.nn.contrib_conv2d_winograd_weight_transform(
+                    kernel_transform, tile_size=tile_size
+                )
+                new_attrs["tile_size"] = tile_size
+                new_attrs["channels"] = CO
+                return 
relay.nn.contrib_conv2d_winograd_without_weight_transform(
+                    inputs[0], weight, **new_attrs
+                )
+            return None
+
+        cfg = dispatch_ctx.query(target, workload)
+
+    topi_tmpl = workload[0]
+
     if topi_tmpl == "conv2d_NCHWc.x86":
         # we only convert conv2d_NCHW to conv2d_NCHWc for x86
         if data_layout == "NCHW" and kernel_layout == "OIHW":
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc 
b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
index 814e72a..f704fe9 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
@@ -806,12 +806,10 @@ PopulationGenerationRule::ResultKind 
InitThreadBind::Apply(SketchPolicyNode* pol
         total_space_extent *= pint->value;
       }
 
-      // Check if the total space extent is too small for multi-level thread 
binding
-      if (total_space_extent <= 
policy->search_task->hardware_params->warp_size) {
-        Iterator fused_it;
-        *state = FuseAllOuterSpaceIterators(*state, stage_id, &fused_it);
-        state->bind(stage_id, fused_it, IteratorAnnotation::kThreadX);
-        continue;
+      bool check_min_thread_extent = true;
+      // If the total space extent is too small, disable the check of minimal 
thread extent
+      if (total_space_extent <= 
policy->search_task->hardware_params->warp_size * 2) {
+        check_min_thread_extent = false;
       }
 
       // Fuse the outermost space tile as blockIdx
@@ -856,7 +854,8 @@ PopulationGenerationRule::ResultKind 
InitThreadBind::Apply(SketchPolicyNode* pol
         to_fuse.push_back((*state)->stages[stage_id]->iters[i]);
       }
       const auto& threadidx_it = state->fuse(stage_id, to_fuse);
-      if (GetExtent(threadidx_it) < 
policy->search_task->hardware_params->warp_size) {
+      if (check_min_thread_extent &&
+          GetExtent(threadidx_it) < 
policy->search_task->hardware_params->warp_size) {
         return ResultKind::kInvalid;
       }
       state->bind(stage_id, threadidx_it, IteratorAnnotation::kThreadX);
diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h
index 2b3a978..c08d355 100644
--- a/src/relay/op/nn/convolution.h
+++ b/src/relay/op/nn/convolution.h
@@ -356,8 +356,16 @@ bool Conv3DRel(const Array<Type>& types, int num_inputs, 
const Attrs& attrs,
       weight_dtype = weight->dtype;
     }
 
-    // assign result to reporter
-    reporter->Assign(types[1], TensorType(wshape, weight_dtype));
+    if (param->auto_scheduler_rewritten_layout.size() == 0) {
+      // Normal case: assign result to reporter
+      reporter->Assign(types[1], TensorType(wshape, weight_dtype));
+    } else {
+      // If the layout is rewritten by auto-scheduler,
+      // we just forcibly apply the layout provided by auto-scheduler and
+      // skip the normal inference logic.
+      {}  // do nothing
+    }
+
   } else {
     // use weight to infer the conv shape.
     if (weight == nullptr) return false;
diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc 
b/src/relay/transforms/auto_scheduler_layout_rewrite.cc
index 53e7a02..edc4119 100644
--- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc
+++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc
@@ -83,6 +83,10 @@ class FuncMutator : public ExprMutator {
       Attrs updated_attrs;
       if (auto pattr = call->attrs.as<Conv2DAttrs>()) {
         updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout);
+      } else if (auto pattr = call->attrs.as<Conv2DWinogradAttrs>()) {
+        updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout);
+      } else if (auto pattr = call->attrs.as<Conv3DAttrs>()) {
+        updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout);
       } else if (auto pattr = call->attrs.as<DenseAttrs>()) {
         updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout);
       } else if (auto pattr = call->attrs.as<BatchMatmulAttrs>()) {
@@ -99,7 +103,9 @@ class FuncMutator : public ExprMutator {
   std::deque<std::string> ori_layouts_queue_;
   std::deque<std::string> new_layouts_queue_;
 
-  std::vector<std::string> target_ops_{"nn.conv2d", "nn.dense", 
"nn.batch_matmul"};
+  std::vector<std::string> target_ops_{"nn.conv2d", "nn.conv3d",
+                                       
"nn.contrib_conv2d_winograd_without_weight_transform",
+                                       "nn.dense", "nn.batch_matmul"};
 };
 
 Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) {
@@ -156,6 +162,10 @@ 
TVM_REGISTER_GLOBAL("relay.attrs.get_auto_scheduler_rewritten_layout")
     .set_body_typed([](const Attrs& attrs) {
       if (attrs->IsInstance<Conv2DAttrs>()) {
         return attrs.as<Conv2DAttrs>()->auto_scheduler_rewritten_layout;
+      } else if (attrs->IsInstance<Conv2DWinogradAttrs>()) {
+        return 
attrs.as<Conv2DWinogradAttrs>()->auto_scheduler_rewritten_layout;
+      } else if (attrs->IsInstance<Conv3DAttrs>()) {
+        return attrs.as<Conv3DAttrs>()->auto_scheduler_rewritten_layout;
       } else if (attrs->IsInstance<DenseAttrs>()) {
         return attrs.as<DenseAttrs>()->auto_scheduler_rewritten_layout;
       } else if (attrs->IsInstance<BatchMatmulAttrs>()) {
diff --git a/tests/python/relay/test_auto_scheduler_layout_rewrite.py 
b/tests/python/relay/test_auto_scheduler_layout_rewrite.py
index 577f6d6..56b57e1 100644
--- a/tests/python/relay/test_auto_scheduler_layout_rewrite.py
+++ b/tests/python/relay/test_auto_scheduler_layout_rewrite.py
@@ -30,8 +30,8 @@ def get_np_array(var, dtype):
 
 
 def get_relay_conv2d(
-    outc=128,
-    inc=64,
+    outc=32,
+    inc=32,
     height=14,
     width=14,
     kh=3,
@@ -70,6 +70,49 @@ def get_relay_conv2d(
     return mod, data, weight
 
 
+def get_relay_conv3d(
+    outc=8,
+    inc=8,
+    depth=8,
+    height=7,
+    width=7,
+    kd=1,
+    kh=1,
+    kw=1,
+    batch=1,
+    pad=0,
+    stride=1,
+    dilation=1,
+    layout="NDHWC",
+):
+    dtype = "float32"
+    if layout == "NDHWC":
+        kernel_layout = "DHWIO"
+        d = relay.var("data", shape=(batch, depth, height, width, inc), 
dtype=dtype)
+        w = relay.var("weight", shape=(kd, kh, kw, inc, outc), dtype=dtype)
+    elif layout == "NCDHW":
+        kernel_layout = "OIDHW"
+        d = relay.var("data", shape=(batch, inc, depth, height, width), 
dtype=dtype)
+        w = relay.var("weight", shape=(outc, inc, kd, kh, kw), dtype=dtype)
+
+    y = relay.nn.conv3d(
+        d,
+        w,
+        padding=pad,
+        kernel_size=(kd, kh, kw),
+        strides=(stride, stride, stride),
+        dilation=(dilation, dilation, dilation),
+        channels=outc,
+        groups=1,
+        data_layout=layout,
+        kernel_layout=kernel_layout,
+    )
+    mod = tvm.IRModule()
+    mod["main"] = relay.Function([d, w], y)
+    data, weight = get_np_array(d, dtype), get_np_array(w, dtype)
+    return mod, data, weight
+
+
 def get_relay_dense(m=128, n=128, k=128):
     dtype = "float32"
     d = relay.var("data", shape=(m, k), dtype=dtype)
@@ -95,7 +138,9 @@ def get_relay_batchmm(batch=4, m=128, n=128, k=128):
 def tune_and_check(mod, data, weight):
     # Extract tasks from a relay program
     target = tvm.target.Target("llvm")
-    tasks, task_weights = auto_scheduler.extract_tasks(mod, target=target, 
params={})
+    tasks, task_weights = auto_scheduler.extract_tasks(
+        mod, target=target, params={"weight": weight}
+    )
 
     with tempfile.NamedTemporaryFile() as fp:
         log_file = fp.name
@@ -110,16 +155,19 @@ def tune_and_check(mod, data, weight):
         )
         tuner.tune(tune_option, search_policy="sketch.random")
 
-        # Compile and run
-        def compile_and_run(disabled_pass={}):
-            with auto_scheduler.ApplyHistoryBest(log_file):
-                with tvm.transform.PassContext(
-                    opt_level=3,
-                    config={"relay.backend.use_auto_scheduler": True},
-                    disabled_pass=disabled_pass,
-                ):
-                    lib = relay.build(mod, target=target, params={"weight": 
weight})
+        # Compile
+        with auto_scheduler.ApplyHistoryBest(log_file):
+            with tvm.transform.PassContext(
+                opt_level=3,
+                config={"relay.backend.use_auto_scheduler": True},
+            ):
+                lib = relay.build(mod, target=target, params={"weight": 
weight})
 
+        # Compile without auto-scheduler for correctness check
+        with tvm.transform.PassContext(opt_level=0):
+            lib2 = relay.build(mod, target=target, params={"weight": weight})
+
+        def get_output(data, lib):
             ctx = tvm.cpu()
             module = graph_runtime.GraphModule(lib["default"](ctx))
             module.set_input("data", data)
@@ -128,19 +176,27 @@ def tune_and_check(mod, data, weight):
             return module.get_output(0).asnumpy()
 
         # Check correctness
-        actual_output = compile_and_run()
-        expected_output = 
compile_and_run(disabled_pass={"AutoSchedulerLayoutRewrite"})
+        actual_output = get_output(data, lib)
+        expected_output = get_output(data, lib2)
 
         tvm.testing.assert_allclose(actual_output, expected_output, rtol=1e-4, 
atol=1e-4)
 
 
 def test_conv2d():
-    # wrap the search in a new thread to avoid the conflict
-    # between python's multiprocessing and tvm's thread pool
     mod, data, weight = get_relay_conv2d(kh=1, kw=1)
     tune_and_check(mod, data, weight)
 
 
+def test_conv2d_winograd():
+    mod, data, weight = get_relay_conv2d(kh=3, kw=3)
+    tune_and_check(mod, data, weight)
+
+
+def test_conv3d():
+    mod, data, weight = get_relay_conv3d()
+    tune_and_check(mod, data, weight)
+
+
 def test_dense():
     mod, data, weight = get_relay_dense()
     tune_and_check(mod, data, weight)
@@ -153,5 +209,7 @@ def test_batch_matmul():
 
 if __name__ == "__main__":
     test_conv2d()
+    test_conv2d_winograd()
+    test_conv3d()
     test_dense()
     test_batch_matmul()
diff --git a/tests/python/relay/test_auto_scheduler_task_extraction.py 
b/tests/python/relay/test_auto_scheduler_task_extraction.py
index a58e04a..531d041 100644
--- a/tests/python/relay/test_auto_scheduler_task_extraction.py
+++ b/tests/python/relay/test_auto_scheduler_task_extraction.py
@@ -46,7 +46,7 @@ def get_network(name, batch_size=1, layout="NHWC"):
             num_layers=50, batch_size=batch_size, layout=layout, 
image_shape=image_shape
         )
     elif name == "winograd-test":
-        input_shape = [1, 23, 40, 128]
+        input_shape = [1, 23, 40, 32]
 
         data = relay.var("data", shape=input_shape, dtype="float32")
         net = relay.testing.layers.conv2d(
diff --git a/tests/python/relay/test_auto_scheduler_tuning.py 
b/tests/python/relay/test_auto_scheduler_tuning.py
index d42373c..4ae434d 100644
--- a/tests/python/relay/test_auto_scheduler_tuning.py
+++ b/tests/python/relay/test_auto_scheduler_tuning.py
@@ -17,8 +17,11 @@
 """Test end-to-end network tuning with auto-scheduler"""
 import tempfile
 
-import tvm.testing
+import numpy as np
+
 from tvm import auto_scheduler, relay
+from tvm.contrib import graph_runtime
+import tvm.testing
 
 from test_auto_scheduler_task_extraction import get_network
 
@@ -53,9 +56,30 @@ def tune_network(network, target):
             ):
                 lib = relay.build(mod, target=target, params=params)
 
-    # Todo(merrymercy): when the cpu backend is upstreamed, do the following 
things:
-    # 1. compile without history to test the fallback mechanism
-    # 2. check the correctness of layout rewrite / winograd pre-transform
+        # Compile without auto-scheduler and any other optimization for 
correctness check
+        with tvm.transform.PassContext(opt_level=0):
+            lib2 = relay.build(mod, target=target, params=params)
+
+        # Check the correctness
+        def get_output(data, lib):
+            ctx = tvm.gpu()
+            module = graph_runtime.GraphModule(lib["default"](ctx))
+            module.set_input("data", data)
+            module.run()
+            return module.get_output(0).asnumpy()
+
+        np.random.seed(0)
+        if network == "mlp":
+            data = np.random.uniform(size=(1, 32))
+        elif network == "winograd-test":
+            data = np.random.uniform(size=(1, 23, 40, 32))
+        else:
+            raise ValueError("Unknown network: " + network)
+
+        actual_output = get_output(data, lib)
+        expected_output = get_output(data, lib2)
+
+        tvm.testing.assert_allclose(actual_output, expected_output, rtol=1e-4, 
atol=1e-4)
 
 
 @tvm.testing.requires_cuda

Reply via email to