This is an automated email from the ASF dual-hosted git repository.
masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 3e02ac5d2e [Adreno] Fix mem_scope annotations for prim funcs having
several heads (#13153)
3e02ac5d2e is described below
commit 3e02ac5d2e46a6d17f56e9b3a452107b0d218060
Author: Andrey Malyshev <[email protected]>
AuthorDate: Mon Oct 24 22:23:55 2022 +0300
[Adreno] Fix mem_scope annotations for prim funcs having several heads
(#13153)
* [Adreno] Fix mem_scope annotations for prim funcs having several heads
2) fix of injective schedules for the non-blocking case
* Address PR comments
* fix lint
* Modify comment
---
python/tvm/topi/adreno/utils.py | 31 ++--
src/relay/transforms/annotate_texture_storage.cc | 10 +-
.../opencl_texture/test_conv2d_nchw_texture.py | 193 +++++++++++++++++++++
3 files changed, 217 insertions(+), 17 deletions(-)
diff --git a/python/tvm/topi/adreno/utils.py b/python/tvm/topi/adreno/utils.py
index de0505af03..1a1cc747fa 100644
--- a/python/tvm/topi/adreno/utils.py
+++ b/python/tvm/topi/adreno/utils.py
@@ -525,28 +525,27 @@ def bind_data_copy(stage, axis_to_vectorize=None):
stage.bind(block, te.thread_axis("blockIdx.z"))
stage.bind(thread, te.thread_axis("threadIdx.z"))
else:
- axes = stage.op.axis
- fused = stage.fuse(*axes[:-1])
- if shape[-1] <= 32:
+ if shape[-1] == 4:
+ axes = stage.op.axis
+ fused = stage.fuse(*axes[:-1])
ftc = numpy.prod(shape[:-1])
div = get_div(ftc, 64)
block, thread = stage.split(fused, factor=div)
stage.bind(block, te.thread_axis("blockIdx.x"))
stage.bind(thread, te.thread_axis("threadIdx.x"))
- if shape[-1] == 4:
- stage.vectorize(axes[-1])
- # 1024 is the maximum work group size for Adreno devices.
- # See: CL_DEVICE_MAX_WORK_GROUP_SIZE
- elif shape[-1] > 1024:
- ftc = numpy.prod(shape[:-1])
- div = get_div(ftc, 1024)
- by, ty = stage.split(axes[-1], factor=div)
- stage.bind(fused, te.thread_axis("blockIdx.x"))
- stage.bind(by, te.thread_axis("blockIdx.y"))
- stage.bind(ty, te.thread_axis("threadIdx.y"))
+ stage.vectorize(axes[-1])
else:
- stage.bind(fused, te.thread_axis("blockIdx.x"))
- stage.bind(*axes[-1:], te.thread_axis("threadIdx.x"))
+ ftc = numpy.prod(shape)
+ vthread = get_div(ftc, 8)
+ fused = stage.fuse(*stage.op.axis)
+ ftc = ftc / vthread
+ # 1024 is the maximum work group size on most Adreno GPUs
+ num_thread = get_div(ftc, 1024 // vthread)
+ a, b = stage.split(fused, factor=num_thread)
+ a, c = stage.split(a, factor=vthread)
+ stage.bind(c, te.thread_axis("vthread"))
+ stage.bind(a, te.thread_axis("blockIdx.x"))
+ stage.bind(b, te.thread_axis("threadIdx.x"))
def get_texture_storage(shape):
diff --git a/src/relay/transforms/annotate_texture_storage.cc
b/src/relay/transforms/annotate_texture_storage.cc
index 6904c6b5d7..277c5e1da4 100644
--- a/src/relay/transforms/annotate_texture_storage.cc
+++ b/src/relay/transforms/annotate_texture_storage.cc
@@ -206,7 +206,9 @@ class StorageInfo : private
transform::DeviceAwareExprVisitor {
}
}
- primitive_supports_texture_ = SupportsTextureStorage(call);
+ if (!primitive_supports_texture_) {
+ primitive_supports_texture_ = SupportsTextureStorage(call);
+ }
for (auto& arg : call->args) {
Visit(arg);
@@ -362,6 +364,12 @@ class StorageInfo : private
transform::DeviceAwareExprVisitor {
bool SupportsTextureStorage(const CallNode* call) const {
bool supports_texture_storage = false;
+ // we need to verify only entry functions since one of the entry ops defines
the main schedule
+ for (const auto& arg : call->args) {
+ if (!arg.as<VarNode>()) {
+ return false;
+ }
+ }
if (auto attrs = call->attrs.as<Conv2DAttrs>()) {
if (attrs->data_layout == "NCHW4c" && attrs->kernel_layout == "OIHW4o") {
supports_texture_storage = true;
diff --git a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
index c73e411a70..5198cbdf6b 100644
--- a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
@@ -1074,3 +1074,196 @@ def test_conv2d_winograd_non_rect(target, dtype):
)
matches = re.findall("winograd", graph)
assert len(matches) > 0
+
+
+# function repeat, params scope are different in reused functions
[email protected]_opencl
[email protected]_targets("opencl -device=adreno")
+def test_injective_nwo_inputs1(target, dtype):
+ """
+ Use case for verification of stability of annotation primary functions
+ having several ops accepting data outside of Primary function
+ The visiting of ops during traversing of graph inside primary function
+ can depend on order of relay graph creation. Thus the annotation mechanism
+ should be reliable for graph traversal order
+ The current decision if Prim Function support textures or not depend on
+ *any* op accepting input of the function and if op support textures
+ Input
+ / \
+ layout_transform (NCHW->NCHW4c) |
+ | /
+ conv2d (1) /
+ | /
+ conv2d (2) mean /
+ / \ / <- Primary function several
head ops
+ (1)add (2)layout_transform |
+ | (NCHW4c->NCHW) |
+ | | \ /
+ | | (3) add
+ | | |
+ layout_transform \ /
+ (NCHW4c->NCHW) \ /
+ \ mul
+ \ /
+ add
+
+ This test verifies the case when the last visited op is (3) and does not
+ support textures, but (1) does support textures, so the whole function
+ will support textures
+ """
+ input_shape = (1, 4, 40, 40)
+ filter_shape1 = (4, 4, 3, 3)
+ filter_shape2 = (4, 4, 3, 3)
+ filter_shape3 = (4, 4, 3, 3)
+ A = relay.var("data", shape=input_shape, dtype=dtype)
+ W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+ W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+ mean = relay.mean(A, axis=1, keepdims=True)
+ conv1 = relay.nn.conv2d(
+ A,
+ W1,
+ data_layout="NCHW",
+ kernel_layout="OIHW",
+ padding=[1, 1, 1, 1],
+ strides=[1, 1],
+ out_dtype=dtype,
+ channels=4,
+ kernel_size=(3, 3),
+ )
+
+ conv2 = relay.nn.conv2d(
+ conv1,
+ W2,
+ data_layout="NCHW",
+ kernel_layout="OIHW",
+ padding=[1, 1, 1, 1],
+ strides=[1, 1],
+ out_dtype=dtype,
+ channels=4,
+ kernel_size=(3, 3),
+ )
+
+ ad3 = relay.op.add(conv1, conv2)
+ ad1 = relay.op.add(mean, conv1)
+ ad2 = relay.op.multiply(ad1, conv2)
+ ad4 = relay.op.add(ad3, ad2)
+
+ mod = relay.Function([A, W1, W2], ad4)
+ np.random.seed(0)
+ initializer = relay.testing.init.Xavier()
+ filter_data1 = np.zeros(filter_shape1).astype(dtype)
+ filter_data2 = np.zeros(filter_shape2).astype(dtype)
+ initializer("weight", filter_data1)
+ initializer("weight", filter_data2)
+ params1 = {
+ "weight1": tvm.nd.array(filter_data1),
+ "weight2": tvm.nd.array(filter_data2),
+ }
+
+ static_memory_scope = [
+ "global",
+ "global.texture",
+ "global.texture-nhwc",
+ "global.texture",
+ "global.texture-nhwc",
+ "global.texture",
+ "global",
+ "global",
+ ]
+ build_run_compare(mod, params1, {"data": input_shape}, dtype, target,
static_memory_scope)
+
+
+# function repeat, params scope are different in reused functions
[email protected]_opencl
[email protected]_targets("opencl -device=adreno")
+def test_injective_nwo_inputs2(target, dtype):
+ """
+ Use case for verification of stability of annotation primary functions
+ having several ops accepting data outside of Primary function
+ The visiting of ops during traversing of graph inside primary function
+ can depend on order of relay graph creation. Thus the annotation mechanism
+ should be reliable for graph traversal order
+ The current decision if Prim Function support textures or not depend on
+ *any* op accepting input of the function and if op support textures
+ Input
+ / \
+ layout_transform (NCHW->NCHW4c) |
+ | /
+ conv2d (1) /
+ | /
+ conv2d (2) mean /
+ / \ / <- Primary function several
head ops
+ (1)add (2)layout_transform |
+ | (NCHW4c->NCHW) |
+ | | \ /
+ | | (3) add
+ | | |
+ layout_transform \ /
+ (NCHW4c->NCHW) \ /
+ \ mul
+ \ /
+ add
+
+ This test verifies the case when the last visited op is (1), which supports
textures,
+ and the whole prim function is therefore considered a func working with
textures
+ """
+ input_shape = (1, 4, 40, 40)
+ filter_shape1 = (4, 4, 3, 3)
+ filter_shape2 = (4, 4, 3, 3)
+ filter_shape3 = (4, 4, 3, 3)
+ A = relay.var("data", shape=input_shape, dtype=dtype)
+ W1 = relay.var("weight1", shape=filter_shape1, dtype=dtype)
+ W2 = relay.var("weight2", shape=filter_shape2, dtype=dtype)
+ mean = relay.mean(A, axis=1, keepdims=True)
+ conv1 = relay.nn.conv2d(
+ A,
+ W1,
+ data_layout="NCHW",
+ kernel_layout="OIHW",
+ padding=[1, 1, 1, 1],
+ strides=[1, 1],
+ out_dtype=dtype,
+ channels=4,
+ kernel_size=(3, 3),
+ )
+
+ conv2 = relay.nn.conv2d(
+ conv1,
+ W2,
+ data_layout="NCHW",
+ kernel_layout="OIHW",
+ padding=[1, 1, 1, 1],
+ strides=[1, 1],
+ out_dtype=dtype,
+ channels=4,
+ kernel_size=(3, 3),
+ )
+
+ ad3 = relay.op.add(conv1, conv2)
+ ad1 = relay.op.add(mean, conv1)
+ ad2 = relay.op.multiply(ad1, conv2)
+ ad4 = relay.op.add(ad2, ad3)
+
+ mod = relay.Function([A, W1, W2], ad4)
+ np.random.seed(0)
+ initializer = relay.testing.init.Xavier()
+ filter_data1 = np.zeros(filter_shape1).astype(dtype)
+ filter_data2 = np.zeros(filter_shape2).astype(dtype)
+ initializer("weight", filter_data1)
+ initializer("weight", filter_data2)
+ params1 = {
+ "weight1": tvm.nd.array(filter_data1),
+ "weight2": tvm.nd.array(filter_data2),
+ }
+
+ static_memory_scope = [
+ "global",
+ "global.texture",
+ "global.texture-nhwc",
+ "global.texture",
+ "global",
+ "global.texture-nhwc",
+ "global.texture",
+ "global",
+ ]
+ build_run_compare(mod, params1, {"data": input_shape}, dtype, target,
static_memory_scope)