This is an automated email from the ASF dual-hosted git repository.
lmzheng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new c1bca8e [AutoScheduler] Enable winograd for conv2d and layout rewrite
for conv3d (#7168)
c1bca8e is described below
commit c1bca8e32c53a0a04a7287be82bcc5735edb8abc
Author: Lianmin Zheng <[email protected]>
AuthorDate: Sat Dec 26 03:45:08 2020 -0800
[AutoScheduler] Enable winograd for conv2d and layout rewrite for conv3d
(#7168)
* [AutoScheduler] Enable winograd for conv2d & Enable layout rewrite for
conv3d
* fix test
* fix test
* update tutorials
---
docs/conf.py | 7 +-
include/tvm/relay/attrs/nn.h | 14 +--
python/tvm/relay/op/strategy/generic.py | 18 ++--
python/tvm/relay/op/strategy/x86.py | 102 ++++++++++++++++++---
python/tvm/topi/cuda/conv2d_winograd.py | 9 +-
python/tvm/topi/nn/conv2d.py | 56 +++++++++--
python/tvm/topi/nn/conv3d.py | 40 +++++++-
python/tvm/topi/x86/conv2d_alter_op.py | 54 ++++++++---
.../search_policy/sketch_policy_rules.cc | 13 ++-
src/relay/op/nn/convolution.h | 12 ++-
.../transforms/auto_scheduler_layout_rewrite.cc | 12 ++-
.../relay/test_auto_scheduler_layout_rewrite.py | 90 ++++++++++++++----
.../relay/test_auto_scheduler_task_extraction.py | 2 +-
tests/python/relay/test_auto_scheduler_tuning.py | 32 ++++++-
14 files changed, 379 insertions(+), 82 deletions(-)
diff --git a/docs/conf.py b/docs/conf.py
index a7198bf..ad838f7 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -250,7 +250,12 @@ within_subsection_order = {
"tune_relay_arm.py",
"tune_relay_mobile_gpu.py",
],
- "auto_scheduler": ["tune_matmul_x86.py", "tune_conv2d_layer_cuda.py"],
+ "auto_scheduler": [
+ "tune_matmul_x86.py",
+ "tune_conv2d_layer_cuda.py",
+ "tune_network_x86.py",
+ "tune_network_cuda.py",
+ ],
"dev": ["low_level_custom_pass.py", "use_pass_infra.py",
"bring_your_own_datatypes.py"],
}
diff --git a/include/tvm/relay/attrs/nn.h b/include/tvm/relay/attrs/nn.h
index 7bfd580..4d867be 100644
--- a/include/tvm/relay/attrs/nn.h
+++ b/include/tvm/relay/attrs/nn.h
@@ -210,9 +210,10 @@ struct Conv2DWinogradAttrs : public
tvm::AttrsNode<Conv2DWinogradAttrs> {
int groups;
IndexExpr channels;
Array<IndexExpr> kernel_size;
- std::string data_layout;
- std::string kernel_layout;
- std::string out_layout;
+ tvm::String data_layout;
+ tvm::String kernel_layout;
+ tvm::String out_layout;
+ tvm::String auto_scheduler_rewritten_layout; // The layout after
auto-scheduler's layout rewrite
DataType out_dtype;
TVM_DECLARE_ATTRS(Conv2DWinogradAttrs, "relay.attrs.Conv2DWinogradAttrs") {
@@ -300,9 +301,10 @@ struct Conv3DAttrs : public tvm::AttrsNode<Conv3DAttrs> {
int groups;
IndexExpr channels;
Array<IndexExpr> kernel_size;
- std::string data_layout;
- std::string kernel_layout;
- std::string out_layout;
+ tvm::String data_layout;
+ tvm::String kernel_layout;
+ tvm::String out_layout;
+ tvm::String auto_scheduler_rewritten_layout; // The layout after
auto-scheduler's layout rewrite
DataType out_dtype;
TVM_DECLARE_ATTRS(Conv3DAttrs, "relay.attrs.Conv3DAttrs") {
diff --git a/python/tvm/relay/op/strategy/generic.py
b/python/tvm/relay/op/strategy/generic.py
index 6864266..ea572ba 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -28,7 +28,10 @@ logger = logging.getLogger("strategy")
def naive_schedule(_, outs, target):
- """Return the naive default schedule"""
+ """Return the naive default schedule.
+ This function acts as a placeholder for op implementations that uses
auto-scheduler.
+    Implementations using this function should only be used along with
auto-scheduler.
+ """
if "gpu" in target.keys:
# For GPU, we at least need thread binding to make a valid schedule.
# So the naive schedule cannot be compiled.
@@ -502,7 +505,7 @@ def conv3d_transpose_strategy(attrs, inputs, out_type,
target):
# conv3d
-def wrap_compute_conv3d(topi_compute, need_layout=False):
+def wrap_compute_conv3d(topi_compute, need_layout=False,
need_auto_scheduler_layout=False):
"""wrap conv3d topi compute"""
def _compute_conv3d(attrs, inputs, out_type):
@@ -519,11 +522,14 @@ def wrap_compute_conv3d(topi_compute, need_layout=False):
raise ValueError("Dilation should be positive value")
if groups != 1:
raise ValueError("Not support arbitrary group number for conv3d")
+
+ args = [inputs[0], inputs[1], strides, padding, dilation]
if need_layout:
- out = topi_compute(inputs[0], inputs[1], strides, padding,
dilation, layout, out_dtype)
- else:
- out = topi_compute(inputs[0], inputs[1], strides, padding,
dilation, out_dtype)
- return [out]
+ args.append(layout)
+ args.append(out_dtype)
+ if need_auto_scheduler_layout:
+ args.append(get_auto_scheduler_rewritten_layout(attrs))
+ return [topi_compute(*args)]
return _compute_conv3d
diff --git a/python/tvm/relay/op/strategy/x86.py
b/python/tvm/relay/op/strategy/x86.py
index 841213a..9e3e191 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -91,6 +91,7 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
"""conv2d x86 strategy"""
strategy = _op.OpStrategy()
data, kernel = inputs
+ stride_h, stride_w = get_const_tuple(attrs.strides)
dilation_h, dilation_w = get_const_tuple(attrs.dilation)
groups = attrs.groups
layout = attrs.data_layout
@@ -125,6 +126,35 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc),
name="conv2d_nhwc.x86",
)
+
+ judge_winograd_auto_scheduler = False
+ if len(kernel.shape) == 4:
+ kernel_h, kernel_w, _, co = get_const_tuple(kernel.shape)
+ judge_winograd_auto_scheduler = (
+ "float" in data.dtype
+ and "float" in kernel.dtype
+ and kernel_h == 3
+ and kernel_w == 3
+ and stride_h == 1
+ and stride_w == 1
+ and dilation_h == 1
+ and dilation_w == 1
+ and 64 < co < 512
+ # The last condition of co is based on our profiling of
resnet workloads
+ # on skylake avx512 machines. We found winograd is faster
than direct
+ # only when co is within this range
+ )
+
+ # register auto-scheduler implementations
+ if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler:
+ strategy.add_implementation(
+ wrap_compute_conv2d(
+ topi.nn.conv2d_winograd_nhwc,
need_auto_scheduler_layout=True
+ ),
+ naive_schedule, # this implementation should never be
picked by autotvm
+ name="conv2d_nhwc.winograd",
+ plevel=15,
+ )
elif layout == "HWCN":
assert kernel_layout == "HWIO"
if not is_auto_scheduler_enabled():
@@ -269,20 +299,39 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target):
"""conv3d generic strategy"""
strategy = _op.OpStrategy()
layout = attrs.data_layout
- if layout == "NCDHW":
- strategy.add_implementation(
- wrap_compute_conv3d(topi.x86.conv3d_ncdhw),
- wrap_topi_schedule(topi.x86.schedule_conv3d_ncdhw),
- name="conv3d_ncdhw.x86",
- )
- elif layout == "NDHWC":
- strategy.add_implementation(
- wrap_compute_conv3d(topi.x86.conv3d_ndhwc),
- wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc),
- name="conv3d_ndhwc.x86",
- )
+ if is_auto_scheduler_enabled():
+ # Use auto-scheduler. We should provide clear compute definition
without autotvm templates
+ # or packed layouts.
+ if layout == "NCDHW":
+ strategy.add_implementation(
+ wrap_compute_conv3d(topi.nn.conv3d_ncdhw,
need_auto_scheduler_layout=True),
+ naive_schedule,
+ name="conv3d_ncdhw.x86",
+ )
+ elif layout == "NDHWC":
+ strategy.add_implementation(
+ wrap_compute_conv3d(topi.nn.conv3d_ndhwc,
need_auto_scheduler_layout=True),
+ naive_schedule,
+ name="conv3d_ndhwc.x86",
+ )
+ else:
+ raise ValueError("Not support this layout {} yet".format(layout))
else:
- raise ValueError("Not support this layout {} yet".format(layout))
+ # Use autotvm templates
+ if layout == "NCDHW":
+ strategy.add_implementation(
+ wrap_compute_conv3d(topi.x86.conv3d_ncdhw),
+ wrap_topi_schedule(topi.x86.schedule_conv3d_ncdhw),
+ name="conv3d_ncdhw.x86",
+ )
+ elif layout == "NDHWC":
+ strategy.add_implementation(
+ wrap_compute_conv3d(topi.x86.conv3d_ndhwc),
+ wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc),
+ name="conv3d_ndhwc.x86",
+ )
+ else:
+ raise ValueError("Not support this layout {} yet".format(layout))
return strategy
@@ -476,3 +525,30 @@ def scatter_nd_strategy_cpu(attrs, inputs, out_type,
target):
plevel=10,
)
return strategy
+
+
+@conv2d_winograd_without_weight_transfrom_strategy.register("cpu")
+def conv2d_winograd_without_weight_transfrom_strategy_cpu(attrs, inputs,
out_type, target):
+ """conv2d_winograd_without_weight_transfrom cpu strategy"""
+ dilation = attrs.get_int_tuple("dilation")
+ groups = attrs.get_int("groups")
+ layout = attrs.data_layout
+ strides = attrs.get_int_tuple("strides")
+ assert dilation == (1, 1), "Do not support dilate now"
+ assert strides == (1, 1), "Do not support strides now"
+ assert groups == 1, "Do not supoort arbitrary group number"
+ strategy = _op.OpStrategy()
+ if layout == "NHWC":
+ strategy.add_implementation(
+ wrap_compute_conv2d(
+ topi.nn.conv2d_winograd_nhwc_without_weight_transform,
+ need_auto_scheduler_layout=True,
+ ),
+ naive_schedule,
+ name="ansor.winograd",
+ )
+ else:
+ raise RuntimeError(
+ "Unsupported conv2d_winograd_without_weight_transfrom layout
{}".format(layout)
+ )
+ return strategy
diff --git a/python/tvm/topi/cuda/conv2d_winograd.py
b/python/tvm/topi/cuda/conv2d_winograd.py
index 96ab064..8a3f009 100644
--- a/python/tvm/topi/cuda/conv2d_winograd.py
+++ b/python/tvm/topi/cuda/conv2d_winograd.py
@@ -363,7 +363,14 @@ def
schedule_conv2d_nchw_winograd_without_weight_transform(cfg, outs):
@conv2d_winograd_nhwc.register(["cuda", "gpu"])
def conv2d_winograd_nhwc_cuda(
- data, weight, strides, padding, dilation, out_dtype, pre_computed=False
+ data,
+ weight,
+ strides,
+ padding,
+ dilation,
+ out_dtype,
+ pre_computed=False,
+ auto_scheduler_rewritten_layout="",
):
"""Conv2D Winograd in NHWC layout.
This is a clean version to be used by the auto-scheduler for both CPU and
GPU.
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index e2384c4..886470b 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -982,6 +982,7 @@ def _conv2d_winograd_nhwc_impl(
out_dtype,
tile_size,
pre_computed=False,
+ auto_scheduler_rewritten_layout="",
):
"""Conv2D Winograd implementation in NHWC layout.
This is a clean version to be used by the auto-scheduler for both CPU and
GPU.
@@ -1002,8 +1003,10 @@ def _conv2d_winograd_nhwc_impl(
Specifies the output data type.
tile_size : int
The size of the tile to use for the Winograd filter
- pre_computed: bool
+ pre_computed: bool = False
Whether the kernel is precomputed
+ auto_scheduler_rewritten_layout: str = ""
+ The layout after auto-scheduler's layout rewrite pass.
Returns
-------
@@ -1020,7 +1023,16 @@ def _conv2d_winograd_nhwc_impl(
if not pre_computed:
KH, KW, CI, CO = get_const_tuple(weight.shape)
else:
- H_CAT, W_CAT, CO, CI = get_const_tuple(weight.shape)
+ if auto_scheduler_rewritten_layout:
+ H_CAT, W_CAT, CO, CI = get_const_tuple(
+ auto_scheduler.get_shape_from_rewritten_layout(
+ auto_scheduler_rewritten_layout, ["eps", "nu", "co", "ci"]
+ )
+ )
+ auto_scheduler.remove_index_check(weight)
+ else:
+ H_CAT, W_CAT, CO, CI = get_const_tuple(weight.shape)
+
KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
pad_t, pad_l, pad_b, pad_r = get_pad_tuple(padding, (KH, KW))
@@ -1052,8 +1064,10 @@ def _conv2d_winograd_nhwc_impl(
),
name="kernel_pack",
)
+ attrs = {}
else:
kernel_pack = weight
+ attrs = {"layout_free_placeholders": [kernel_pack]}
# pack data tile
input_tile = te.compute(
@@ -1085,9 +1099,12 @@ def _conv2d_winograd_nhwc_impl(
data_pack[eps][nu][p][ci] * kernel_pack[eps][nu][co][ci], axis=[ci]
),
name="bgemm",
- attrs={"layout_free_placeholders": [kernel_pack]},
+ attrs=attrs,
)
+ if auto_scheduler_rewritten_layout:
+ bgemm = auto_scheduler.rewrite_compute_body(bgemm,
auto_scheduler_rewritten_layout)
+
# inverse transform
r_a = te.reduce_axis((0, alpha), "r_a")
r_b = te.reduce_axis((0, alpha), "r_b")
@@ -1112,7 +1129,16 @@ def _conv2d_winograd_nhwc_impl(
@tvm.target.generic_func
-def conv2d_winograd_nhwc(data, weight, strides, padding, dilation, out_dtype,
pre_computed=False):
+def conv2d_winograd_nhwc(
+ data,
+ weight,
+ strides,
+ padding,
+ dilation,
+ out_dtype,
+ pre_computed=False,
+ auto_scheduler_rewritten_layout="",
+):
"""Conv2D Winograd in NHWC layout.
This is a clean version to be used by the auto-scheduler for both CPU and
GPU.
@@ -1132,6 +1158,8 @@ def conv2d_winograd_nhwc(data, weight, strides, padding,
dilation, out_dtype, pr
Specifies the output data type.
pre_computed: bool
Whether the kernel is precomputed
+ auto_scheduler_rewritten_layout: str = ""
+ The layout after auto-scheduler's layout rewrite pass.
Returns
-------
@@ -1149,11 +1177,18 @@ def conv2d_winograd_nhwc(data, weight, strides,
padding, dilation, out_dtype, pr
out_dtype,
tile_size,
pre_computed,
+ auto_scheduler_rewritten_layout,
)
def conv2d_winograd_nhwc_without_weight_transform(
- data, weight, strides, padding, dilation, out_dtype
+ data,
+ weight,
+ strides,
+ padding,
+ dilation,
+ out_dtype,
+ auto_scheduler_rewritten_layout="",
):
"""Conv2D Winograd without layout transform in NHWC layout.
This is a clean version to be used by the auto-scheduler for both CPU and
GPU.
@@ -1172,6 +1207,8 @@ def conv2d_winograd_nhwc_without_weight_transform(
dilation size, or [dilation_height, dilation_width]
out_dtype : str, optional
Specifies the output data type.
+ auto_scheduler_rewritten_layout: str = ""
+ The layout after auto-scheduler's layout rewrite pass.
Returns
-------
@@ -1180,5 +1217,12 @@ def conv2d_winograd_nhwc_without_weight_transform(
"""
return conv2d_winograd_nhwc(
- data, weight, strides, padding, dilation, out_dtype, pre_computed=True
+ data,
+ weight,
+ strides,
+ padding,
+ dilation,
+ out_dtype,
+ pre_computed=True,
+ auto_scheduler_rewritten_layout=auto_scheduler_rewritten_layout,
)
diff --git a/python/tvm/topi/nn/conv3d.py b/python/tvm/topi/nn/conv3d.py
index f3cda28..2679588 100644
--- a/python/tvm/topi/nn/conv3d.py
+++ b/python/tvm/topi/nn/conv3d.py
@@ -18,7 +18,7 @@
# pylint: disable=unused-argument, redefined-builtin, no-else-return
"""Conv3D operators"""
import tvm
-from tvm import te
+from tvm import te, auto_scheduler
from .pad import pad
from .utils import get_pad_tuple3d
@@ -104,7 +104,15 @@ def conv3d_ncdhw(Input, Filter, stride, padding, dilation,
out_dtype=None):
)
-def conv3d_ndhwc(Input, Filter, stride, padding, dilation,
out_dtype="float32"):
+def conv3d_ndhwc(
+ Input,
+ Filter,
+ stride,
+ padding,
+ dilation,
+ out_dtype="float32",
+ auto_scheduler_rewritten_layout="",
+):
"""Convolution operator in NDHWC layout.
Parameters
@@ -124,6 +132,12 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation,
out_dtype="float32"):
dilation: int or a list/tuple of three ints
dilation size, or [dilation_depth, dilation_height, dilation_width]
+ out_dtype: str = "float32",
+ The type of output tensor
+
+ auto_scheduler_rewritten_layout: str = ""
+ The layout after auto-scheduler's layout rewrite pass.
+
Returns
-------
Output : tvm.te.Tensor
@@ -143,7 +157,22 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation,
out_dtype="float32"):
dilation_d, dilation_h, dilation_w = dilation
batch, in_depth, in_height, in_width, in_channel = Input.shape
- kernel_d, kernel_h, kernel_w, channel, num_filter = Filter.shape
+
+ if auto_scheduler_rewritten_layout:
+ # Infer shape for the rewritten layout
+ (
+ kernel_d,
+ kernel_h,
+ kernel_w,
+ channel,
+ num_filter,
+ ) = auto_scheduler.get_shape_from_rewritten_layout(
+ auto_scheduler_rewritten_layout, ["rd", "rh", "rw", "rc", "cc"]
+ )
+ auto_scheduler.remove_index_check(Filter)
+ else:
+ kernel_d, kernel_h, kernel_w, channel, num_filter = Filter.shape
+
# compute the output shape
dilated_kernel_d = (kernel_d - 1) * dilation_d + 1
dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
@@ -178,7 +207,12 @@ def conv3d_ndhwc(Input, Filter, stride, padding, dilation,
out_dtype="float32"):
),
name="Conv3dOutput",
tag="conv3d_ndhwc",
+ attrs={"layout_free_placeholders": [Filter]},
)
+
+ if auto_scheduler_rewritten_layout:
+ Output = auto_scheduler.rewrite_compute_body(Output,
auto_scheduler_rewritten_layout)
+
return Output
diff --git a/python/tvm/topi/x86/conv2d_alter_op.py
b/python/tvm/topi/x86/conv2d_alter_op.py
index db3c232..979dc5a 100644
--- a/python/tvm/topi/x86/conv2d_alter_op.py
+++ b/python/tvm/topi/x86/conv2d_alter_op.py
@@ -40,21 +40,6 @@ _OIHWio_matcher = re.compile("^OIHW[0-9]+i[0-9]+o$")
def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
target = tvm.target.Target.current(allow_none=False)
dispatch_ctx = autotvm.task.DispatchContext.current
- if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest):
- cfg = dispatch_ctx.query(target, None)
- workload = cfg.workload
- else:
- _, outs = relay.backend.compile_engine.select_implementation(
- relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
- )
- workload = autotvm.task.get_workload(outs)
- if workload is None:
- # The best implementation is not an AutoTVM template,
- # we then assume it's not necessary to alter this op.
- return None
- cfg = dispatch_ctx.query(target, workload)
-
- topi_tmpl = workload[0]
new_attrs = {k: attrs[k] for k in attrs.keys()}
# Parse the attributes.
@@ -68,6 +53,45 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
kernel_dtype = kernel_tensor.dtype
out_dtype = out_type.dtype
+ if isinstance(dispatch_ctx, autotvm.task.ApplyGraphBest):
+ cfg = dispatch_ctx.query(target, None)
+ workload = cfg.workload
+ else:
+ impl, outs = relay.backend.compile_engine.select_implementation(
+ relay.op.get("nn.conv2d"), attrs, tinfos, out_type, target
+ )
+ workload = autotvm.task.get_workload(outs)
+ if workload is None:
+ # The best implementation is not an AutoTVM template.
+ # It may be from the auto-scheduler
+ if impl.name.find("winograd") != -1:
+ if dilation != (1, 1):
+ logger.warning("Does not support weight pre-transform for
dilated convolution.")
+ return None
+
+ assert data_layout == "NHWC" and kernel_layout == "HWIO"
+ N, H, W, CI = get_const_tuple(data_tensor.shape)
+ KH, KW, _, CO = get_const_tuple(kernel_tensor.shape)
+
+ # Pre-compute weight transformation in winograd
+ tile_size = 4
+ # HWIO -> OIHW
+ kernel_transform = relay.transpose(inputs[1], axes=[3, 2, 0,
1])
+ # alpha, alpha, CO, CI
+ weight = relay.nn.contrib_conv2d_winograd_weight_transform(
+ kernel_transform, tile_size=tile_size
+ )
+ new_attrs["tile_size"] = tile_size
+ new_attrs["channels"] = CO
+ return
relay.nn.contrib_conv2d_winograd_without_weight_transform(
+ inputs[0], weight, **new_attrs
+ )
+ return None
+
+ cfg = dispatch_ctx.query(target, workload)
+
+ topi_tmpl = workload[0]
+
if topi_tmpl == "conv2d_NCHWc.x86":
# we only convert conv2d_NCHW to conv2d_NCHWc for x86
if data_layout == "NCHW" and kernel_layout == "OIHW":
diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc
b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
index 814e72a..f704fe9 100644
--- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc
+++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc
@@ -806,12 +806,10 @@ PopulationGenerationRule::ResultKind
InitThreadBind::Apply(SketchPolicyNode* pol
total_space_extent *= pint->value;
}
- // Check if the total space extent is too small for multi-level thread
binding
- if (total_space_extent <=
policy->search_task->hardware_params->warp_size) {
- Iterator fused_it;
- *state = FuseAllOuterSpaceIterators(*state, stage_id, &fused_it);
- state->bind(stage_id, fused_it, IteratorAnnotation::kThreadX);
- continue;
+ bool check_min_thread_extent = true;
+ // If the total space extent is too small, disable the check of minimal
thread extent
+ if (total_space_extent <=
policy->search_task->hardware_params->warp_size * 2) {
+ check_min_thread_extent = false;
}
// Fuse the outermost space tile as blockIdx
@@ -856,7 +854,8 @@ PopulationGenerationRule::ResultKind
InitThreadBind::Apply(SketchPolicyNode* pol
to_fuse.push_back((*state)->stages[stage_id]->iters[i]);
}
const auto& threadidx_it = state->fuse(stage_id, to_fuse);
- if (GetExtent(threadidx_it) <
policy->search_task->hardware_params->warp_size) {
+ if (check_min_thread_extent &&
+ GetExtent(threadidx_it) <
policy->search_task->hardware_params->warp_size) {
return ResultKind::kInvalid;
}
state->bind(stage_id, threadidx_it, IteratorAnnotation::kThreadX);
diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h
index 2b3a978..c08d355 100644
--- a/src/relay/op/nn/convolution.h
+++ b/src/relay/op/nn/convolution.h
@@ -356,8 +356,16 @@ bool Conv3DRel(const Array<Type>& types, int num_inputs,
const Attrs& attrs,
weight_dtype = weight->dtype;
}
- // assign result to reporter
- reporter->Assign(types[1], TensorType(wshape, weight_dtype));
+ if (param->auto_scheduler_rewritten_layout.size() == 0) {
+ // Normal case: assign result to reporter
+ reporter->Assign(types[1], TensorType(wshape, weight_dtype));
+ } else {
+ // If the layout is rewritten by auto-scheduler,
+    // we just forcibly apply the layout provided by auto-scheduler and
+ // skip the normal inference logic.
+ {} // do nothing
+ }
+
} else {
// use weight to infer the conv shape.
if (weight == nullptr) return false;
diff --git a/src/relay/transforms/auto_scheduler_layout_rewrite.cc
b/src/relay/transforms/auto_scheduler_layout_rewrite.cc
index 53e7a02..edc4119 100644
--- a/src/relay/transforms/auto_scheduler_layout_rewrite.cc
+++ b/src/relay/transforms/auto_scheduler_layout_rewrite.cc
@@ -83,6 +83,10 @@ class FuncMutator : public ExprMutator {
Attrs updated_attrs;
if (auto pattr = call->attrs.as<Conv2DAttrs>()) {
updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout);
+ } else if (auto pattr = call->attrs.as<Conv2DWinogradAttrs>()) {
+ updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout);
+ } else if (auto pattr = call->attrs.as<Conv3DAttrs>()) {
+ updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout);
} else if (auto pattr = call->attrs.as<DenseAttrs>()) {
updated_attrs = CopyAttrsWithNewLayout(pattr, new_layout);
} else if (auto pattr = call->attrs.as<BatchMatmulAttrs>()) {
@@ -99,7 +103,9 @@ class FuncMutator : public ExprMutator {
std::deque<std::string> ori_layouts_queue_;
std::deque<std::string> new_layouts_queue_;
- std::vector<std::string> target_ops_{"nn.conv2d", "nn.dense",
"nn.batch_matmul"};
+ std::vector<std::string> target_ops_{"nn.conv2d", "nn.conv3d",
+
"nn.contrib_conv2d_winograd_without_weight_transform",
+ "nn.dense", "nn.batch_matmul"};
};
Expr AutoSchedulerLayoutRewriter::VisitExpr_(const CallNode* n) {
@@ -156,6 +162,10 @@
TVM_REGISTER_GLOBAL("relay.attrs.get_auto_scheduler_rewritten_layout")
.set_body_typed([](const Attrs& attrs) {
if (attrs->IsInstance<Conv2DAttrs>()) {
return attrs.as<Conv2DAttrs>()->auto_scheduler_rewritten_layout;
+ } else if (attrs->IsInstance<Conv2DWinogradAttrs>()) {
+ return
attrs.as<Conv2DWinogradAttrs>()->auto_scheduler_rewritten_layout;
+ } else if (attrs->IsInstance<Conv3DAttrs>()) {
+ return attrs.as<Conv3DAttrs>()->auto_scheduler_rewritten_layout;
} else if (attrs->IsInstance<DenseAttrs>()) {
return attrs.as<DenseAttrs>()->auto_scheduler_rewritten_layout;
} else if (attrs->IsInstance<BatchMatmulAttrs>()) {
diff --git a/tests/python/relay/test_auto_scheduler_layout_rewrite.py
b/tests/python/relay/test_auto_scheduler_layout_rewrite.py
index 577f6d6..56b57e1 100644
--- a/tests/python/relay/test_auto_scheduler_layout_rewrite.py
+++ b/tests/python/relay/test_auto_scheduler_layout_rewrite.py
@@ -30,8 +30,8 @@ def get_np_array(var, dtype):
def get_relay_conv2d(
- outc=128,
- inc=64,
+ outc=32,
+ inc=32,
height=14,
width=14,
kh=3,
@@ -70,6 +70,49 @@ def get_relay_conv2d(
return mod, data, weight
+def get_relay_conv3d(
+ outc=8,
+ inc=8,
+ depth=8,
+ height=7,
+ width=7,
+ kd=1,
+ kh=1,
+ kw=1,
+ batch=1,
+ pad=0,
+ stride=1,
+ dilation=1,
+ layout="NDHWC",
+):
+ dtype = "float32"
+ if layout == "NDHWC":
+ kernel_layout = "DHWIO"
+ d = relay.var("data", shape=(batch, depth, height, width, inc),
dtype=dtype)
+ w = relay.var("weight", shape=(kd, kh, kw, inc, outc), dtype=dtype)
+ elif layout == "NCDHW":
+ kernel_layout = "OIDHW"
+ d = relay.var("data", shape=(batch, inc, depth, height, width),
dtype=dtype)
+ w = relay.var("weight", shape=(outc, inc, kd, kh, kw), dtype=dtype)
+
+ y = relay.nn.conv3d(
+ d,
+ w,
+ padding=pad,
+ kernel_size=(kd, kh, kw),
+ strides=(stride, stride, stride),
+ dilation=(dilation, dilation, dilation),
+ channels=outc,
+ groups=1,
+ data_layout=layout,
+ kernel_layout=kernel_layout,
+ )
+ mod = tvm.IRModule()
+ mod["main"] = relay.Function([d, w], y)
+ data, weight = get_np_array(d, dtype), get_np_array(w, dtype)
+ return mod, data, weight
+
+
def get_relay_dense(m=128, n=128, k=128):
dtype = "float32"
d = relay.var("data", shape=(m, k), dtype=dtype)
@@ -95,7 +138,9 @@ def get_relay_batchmm(batch=4, m=128, n=128, k=128):
def tune_and_check(mod, data, weight):
# Extract tasks from a relay program
target = tvm.target.Target("llvm")
- tasks, task_weights = auto_scheduler.extract_tasks(mod, target=target,
params={})
+ tasks, task_weights = auto_scheduler.extract_tasks(
+ mod, target=target, params={"weight": weight}
+ )
with tempfile.NamedTemporaryFile() as fp:
log_file = fp.name
@@ -110,16 +155,19 @@ def tune_and_check(mod, data, weight):
)
tuner.tune(tune_option, search_policy="sketch.random")
- # Compile and run
- def compile_and_run(disabled_pass={}):
- with auto_scheduler.ApplyHistoryBest(log_file):
- with tvm.transform.PassContext(
- opt_level=3,
- config={"relay.backend.use_auto_scheduler": True},
- disabled_pass=disabled_pass,
- ):
- lib = relay.build(mod, target=target, params={"weight":
weight})
+ # Compile
+ with auto_scheduler.ApplyHistoryBest(log_file):
+ with tvm.transform.PassContext(
+ opt_level=3,
+ config={"relay.backend.use_auto_scheduler": True},
+ ):
+ lib = relay.build(mod, target=target, params={"weight":
weight})
+ # Compile without auto-scheduler for correctness check
+ with tvm.transform.PassContext(opt_level=0):
+ lib2 = relay.build(mod, target=target, params={"weight": weight})
+
+ def get_output(data, lib):
ctx = tvm.cpu()
module = graph_runtime.GraphModule(lib["default"](ctx))
module.set_input("data", data)
@@ -128,19 +176,27 @@ def tune_and_check(mod, data, weight):
return module.get_output(0).asnumpy()
# Check correctness
- actual_output = compile_and_run()
- expected_output =
compile_and_run(disabled_pass={"AutoSchedulerLayoutRewrite"})
+ actual_output = get_output(data, lib)
+ expected_output = get_output(data, lib2)
tvm.testing.assert_allclose(actual_output, expected_output, rtol=1e-4,
atol=1e-4)
def test_conv2d():
- # wrap the search in a new thread to avoid the conflict
- # between python's multiprocessing and tvm's thread pool
mod, data, weight = get_relay_conv2d(kh=1, kw=1)
tune_and_check(mod, data, weight)
+def test_conv2d_winograd():
+ mod, data, weight = get_relay_conv2d(kh=3, kw=3)
+ tune_and_check(mod, data, weight)
+
+
+def test_conv3d():
+ mod, data, weight = get_relay_conv3d()
+ tune_and_check(mod, data, weight)
+
+
def test_dense():
mod, data, weight = get_relay_dense()
tune_and_check(mod, data, weight)
@@ -153,5 +209,7 @@ def test_batch_matmul():
if __name__ == "__main__":
test_conv2d()
+ test_conv2d_winograd()
+ test_conv3d()
test_dense()
test_batch_matmul()
diff --git a/tests/python/relay/test_auto_scheduler_task_extraction.py
b/tests/python/relay/test_auto_scheduler_task_extraction.py
index a58e04a..531d041 100644
--- a/tests/python/relay/test_auto_scheduler_task_extraction.py
+++ b/tests/python/relay/test_auto_scheduler_task_extraction.py
@@ -46,7 +46,7 @@ def get_network(name, batch_size=1, layout="NHWC"):
num_layers=50, batch_size=batch_size, layout=layout,
image_shape=image_shape
)
elif name == "winograd-test":
- input_shape = [1, 23, 40, 128]
+ input_shape = [1, 23, 40, 32]
data = relay.var("data", shape=input_shape, dtype="float32")
net = relay.testing.layers.conv2d(
diff --git a/tests/python/relay/test_auto_scheduler_tuning.py
b/tests/python/relay/test_auto_scheduler_tuning.py
index d42373c..4ae434d 100644
--- a/tests/python/relay/test_auto_scheduler_tuning.py
+++ b/tests/python/relay/test_auto_scheduler_tuning.py
@@ -17,8 +17,11 @@
"""Test end-to-end network tuning with auto-scheduler"""
import tempfile
-import tvm.testing
+import numpy as np
+
from tvm import auto_scheduler, relay
+from tvm.contrib import graph_runtime
+import tvm.testing
from test_auto_scheduler_task_extraction import get_network
@@ -53,9 +56,30 @@ def tune_network(network, target):
):
lib = relay.build(mod, target=target, params=params)
- # Todo(merrymercy): when the cpu backend is upstreamed, do the following
things:
- # 1. compile without history to test the fallback mechanism
- # 2. check the correctness of layout rewrite / winograd pre-transform
+ # Compile without auto-scheduler and any other optimization for
correctness check
+ with tvm.transform.PassContext(opt_level=0):
+ lib2 = relay.build(mod, target=target, params=params)
+
+ # Check the correctness
+ def get_output(data, lib):
+ ctx = tvm.gpu()
+ module = graph_runtime.GraphModule(lib["default"](ctx))
+ module.set_input("data", data)
+ module.run()
+ return module.get_output(0).asnumpy()
+
+ np.random.seed(0)
+ if network == "mlp":
+ data = np.random.uniform(size=(1, 32))
+ elif network == "winograd-test":
+ data = np.random.uniform(size=(1, 23, 40, 32))
+ else:
+ raise ValueError("Unknown network: " + network)
+
+ actual_output = get_output(data, lib)
+ expected_output = get_output(data, lib2)
+
+ tvm.testing.assert_allclose(actual_output, expected_output, rtol=1e-4,
atol=1e-4)
@tvm.testing.requires_cuda