ekalda commented on code in PR #13669:
URL: https://github.com/apache/tvm/pull/13669#discussion_r1058905849
##########
tests/python/topi/python/test_topi_conv2d_int8.py:
##########
@@ -298,378 +175,462 @@ def get_ref_data():
a_np, w_np, b_np, c_np = get_ref_data()
- with tvm.target.Target(target):
- C = compute(
- A,
- W,
- (stride, stride),
- padding,
- (dilation, dilation),
- "NCHW",
- "NCHW",
- out_dtype,
- )
+ dev = tvm.device(target, 0)
+ with tvm.target.Target(target) as tvm_target:
+ C = compute(A, W, (stride, stride), padding, (dilation, dilation),
dtype)
if add_bias:
C = topi.add(C, bias)
if add_relu:
C = topi.nn.relu(C)
s = schedule([C])
- a = tvm.nd.array(a_np.astype(dtype), dev)
- w = tvm.nd.array(w_np.astype(dtype), dev)
- b = tvm.nd.array(b_np.astype(out_dtype), dev)
- c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
dev)
-
- if add_bias:
- compile_args = [A, W, bias, C]
- run_args = [a, w, b, c]
- else:
- compile_args = [A, W, C]
- run_args = [a, w, c]
-
- func = tvm.build(
- s,
- compile_args,
- target,
- name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
- % (batch, in_channel, in_size, num_filter, kernel, stride,
padding_sum, dilation),
- )
+ a = tvm.nd.array(a_np, dev)
+ w = tvm.nd.array(w_np, dev)
+ b = tvm.nd.array(b_np, dev)
+ c = tvm.nd.array(np.zeros(get_const_tuple(C.shape),
dtype=C.dtype), dev)
- if build_only:
- return
+ build_inputs = [A, W, bias, C] if add_bias else [A, W, C]
+ inference_inputs = (a, w, b, c) if add_bias else (a, w, c)
+
+ func = tvm.build(
+ s,
+ build_inputs,
+ target,
+ name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
+ % (
+ batch,
+ in_channel,
+ in_size,
+ num_filter,
+ kernel,
+ stride,
+ padding_sum,
+ dilation,
+ ),
+ )
- print("Running on target: %s" % target)
+ build_only = tvm_target.features.is_aarch64 and
(platform.machine() != "aarch64")
- func(*run_args)
+ if not build_only:
+ print("Running on target: %s" % target)
+ func(*inference_inputs)
+ tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
- tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
- targets = [
- (
- "cuda",
- lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w,
s, p, d, l, o),
- topi.cuda.schedule_conv2d_NCHWc_int8,
- 4,
- False,
- ),
- # Disable on CI since it does not support spirv int8 dot product
- # (
- # "vulkan -from_device=0",
- # lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a,
w, s, p, d, l, o),
- # topi.cuda.schedule_conv2d_NCHWc_int8,
- # 4,
- # False,
- # ),
- ]
-
- build_only_aarch64 = platform.machine() != "aarch64"
-
- targets.append(
+@pytest.mark.parametrize("in_dtype", ["int8", "uint8"])
+@pytest.mark.parametrize(
+ "params",
+ [
+ # ResNet18 workloads where channels in / out are multiple of
oc_block_factor
+ (1, 64, 56, 64, 3, 1, 1, 1, False, False),
+ (1, 64, 56, 64, 1, 1, 0, 1, False, False),
+ (1, 64, 56, 128, 3, 2, 1, 1, False, False),
+ (1, 64, 56, 128, 1, 2, 0, 1, False, False),
+ (1, 128, 28, 128, 3, 1, 1, 1, False, False),
+ (1, 128, 28, 256, 3, 2, 1, 1, False, False),
+ (1, 128, 28, 256, 1, 2, 0, 1, False, False),
+ (1, 256, 14, 256, 3, 1, 1, 1, False, False),
+ (1, 256, 14, 512, 3, 2, 1, 1, False, False),
+ (1, 256, 14, 512, 1, 2, 0, 1, False, False),
+ (1, 512, 7, 512, 3, 1, 1, 1, False, False),
+ # bias, relu
+ (1, 64, 56, 64, 3, 1, 1, 1, False, True),
+ (1, 64, 56, 64, 3, 1, 1, 1, True, False),
+ (1, 64, 56, 64, 3, 1, 1, 1, True, True),
+ # dilation = 2
+ (1, 64, 56, 64, 3, 1, 1, 2, False, False),
+ # batch size
+ (4, 64, 56, 64, 3, 1, 1, 1, False, False),
+ (9, 64, 56, 64, 3, 1, 1, 1, False, False),
+ # weird workloads
+ (4, 4, 4, 8, 4, 4, 4, 1, False, False),
+ # inception v3 workloads where channels in / out are multiple of
oc_block_factor
+ (1, 32, 149, 32, 3, 1, 0, 1, False, False),
+ (1, 32, 147, 64, 3, 1, 1, 1, False, False),
+ (1, 64, 73, 80, 1, 1, 0, 1, False, False),
+ (1, 80, 73, 192, 3, 1, 0, 1, False, False),
+ (1, 192, 35, 64, 1, 1, 0, 1, False, False),
+ (1, 192, 35, 48, 1, 1, 0, 1, False, False),
+ (1, 48, 35, 64, 5, 1, 2, 1, False, False),
+ (1, 64, 35, 96, 3, 1, 1, 1, False, False),
+ (1, 96, 35, 96, 3, 1, 1, 1, False, False),
+ (1, 192, 35, 32, 1, 1, 0, 1, False, False),
+ (1, 256, 35, 64, 1, 1, 0, 1, False, False),
+ (1, 256, 35, 48, 1, 1, 0, 1, False, False),
+ (1, 288, 35, 64, 1, 1, 0, 1, False, False),
+ (1, 288, 35, 48, 1, 1, 0, 1, False, False),
+ (1, 288, 35, 384, 3, 2, 0, 1, False, False),
+ (1, 96, 35, 96, 3, 2, 0, 1, False, False),
+ (1, 768, 17, 192, 1, 1, 0, 1, False, False),
+ (1, 768, 17, 128, 1, 1, 0, 1, False, False),
+ (1, 128, 17, 128, 1, 1, 0, 1, False, False),
+ (1, 128, 17, 192, 7, 1, 3, 1, False, False),
+ (1, 128, 17, 128, 7, 1, 3, 1, False, False),
+ (1, 128, 17, 192, 1, 1, 0, 1, False, False),
+ (1, 768, 17, 160, 1, 1, 0, 1, False, False),
+ (1, 160, 17, 160, 1, 1, 0, 1, False, False),
+ (1, 160, 17, 192, 7, 1, 3, 1, False, False),
+ (1, 160, 17, 160, 7, 1, 3, 1, False, False),
+ (1, 160, 17, 192, 1, 1, 0, 1, False, False),
+ (1, 192, 17, 192, 1, 1, 0, 1, False, False),
+ (1, 192, 17, 192, 7, 1, 3, 1, False, False),
+ (1, 192, 17, 320, 3, 2, 0, 1, False, False),
+ (1, 192, 17, 192, 3, 2, 0, 1, False, False),
+ (1, 1280, 8, 320, 1, 1, 0, 1, False, False),
+ (1, 1280, 8, 384, 1, 1, 0, 1, False, False),
+ (1, 384, 8, 384, 1, 1, 0, 1, False, False),
+ (1, 384, 8, 384, 3, 1, 1, 1, False, False),
+ (1, 1280, 8, 448, 1, 1, 0, 1, False, False),
+ (1, 448, 8, 384, 3, 1, 1, 1, False, False),
+ (1, 1280, 8, 192, 1, 1, 0, 1, False, False),
+ (1, 2048, 8, 320, 1, 1, 0, 1, False, False),
+ (1, 2048, 8, 384, 1, 1, 0, 1, False, False),
+ (1, 2048, 8, 448, 1, 1, 0, 1, False, False),
+ (1, 2048, 8, 192, 1, 1, 0, 1, False, False),
+ (1, 1024, 19, 88, 3, 1, 1, 1, False, False),
+ # batch > 1
+ (7, 32, 149, 32, 3, 1, 0, 1, False, False),
+ (8, 32, 149, 32, 3, 1, 0, 1, False, False),
+ (32, 32, 149, 32, 3, 1, 0, 1, False, False),
+ # Asymmetric padding
+ (1, 32, 35, 64, 7, 2, (0, 0, 1, 1), 1, False, False),
+ (1, 64, 8, 128, 3, 1, (3, 3, 2, 2), 1, False, False),
+ (1, 64, 8, 64, 1, 1, (1, 2, 2, 1), 1, False, False),
+ (1, 64, 17, 192, 1, 1, (1, 2), 1, False, False),
+ (1, 64, 8, 64, 3, 1, (3, 1), 1, False, False),
+ (1, 128, 8, 384, 3, 1, (0, 2), 1, False, False),
+ (1, 64, 8, 64, 1, 1, "VALID", 1, False, False),
+ (1, 392, 8, 64, 3, 1, "VALID", 1, False, False),
+ (1, 512, 19, 64, 1, 1, "SAME", 1, False, False),
+ (1, 64, 16, 32, 2, 1, "SAME", 1, False, False),
+ (1, 64, 8, 64, 3, 1, (1, 2, 2, 1), 1, False, True),
+ (1, 64, 8, 64, 5, 2, (1, 3), 1, True, False),
+ (1, 64, 56, 64, 3, 1, "VALID", 1, True, True),
+ (1, 64, 56, 64, 24, 1, "SAME", 1, True, True),
+ ],
+)
+def test_conv2d_NCHWc_int8(in_dtype, params):
+ with Int8Fallback():
(
- "llvm -device arm_cpu -mtriple aarch64-linux-gnu
-mattr=+neon,+v8.2a,+dotprod",
- topi.arm_cpu.conv2d_NCHWc_int8,
- topi.arm_cpu.schedule_conv2d_NCHWc_int8,
- 8,
- build_only_aarch64,
+ batch,
+ in_channel,
+ in_size,
+ num_filter,
+ kernel,
+ stride,
+ padding,
+ dilation,
+ add_bias,
+ add_relu,
+ ) = params
+ pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding,
(kernel, kernel))
+ padding_sum = pad_top + pad_left + pad_bottom + pad_right
+ print(
+ "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
+ % (batch, in_channel, in_size, num_filter, kernel, stride,
padding_sum, dilation)
)
- )
-
- if in_dtype == "int8":
- targets += [
- (
- "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon",
- topi.arm_cpu.conv2d_NCHWc_int8,
- topi.arm_cpu.schedule_conv2d_NCHWc_int8,
- 8,
- build_only_aarch64,
- ),
- (
- "rocm -mattr=+dotprod",
- lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a,
w, s, p, d, l, o),
- topi.cuda.schedule_conv2d_NCHWc_int8,
- 4,
- False,
- ),
- ]
-
- for target, compute, schedule, oc_block_factor, build_only in targets:
- check_target(target, compute, schedule, oc_block_factor, build_only)
-
-
-def verify_conv2d_nchw_int8(
- in_dtype,
- batch,
- in_channel,
- in_size,
- num_filter,
- kernel,
- stride,
- padding,
- dilation=1,
- add_bias=False,
- add_relu=False,
-):
- pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel,
kernel))
- padding_sum = pad_top + pad_left + pad_bottom + pad_right
- print(
- "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
- % (batch, in_channel, in_size, num_filter, kernel, stride,
padding_sum, dilation)
- )
-
- in_height = in_width = in_size
-
- A = te.placeholder((batch, in_channel, in_height, in_width), name="A",
dtype=in_dtype)
- W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W",
dtype=in_dtype)
- bias = te.placeholder((num_filter, 1, 1), name="bias", dtype=in_dtype)
-
- a_shape = get_const_tuple(A.shape)
- w_shape = get_const_tuple(W.shape)
- bias_shape = get_const_tuple(bias.shape)
- dtype = A.dtype
-
- @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
- def get_ref_data():
- a_np = np.random.randint(low=-128, high=127,
size=a_shape).astype(dtype)
- w_np = np.random.randint(low=-128, high=128,
size=w_shape).astype(dtype)
- b_np = np.random.uniform(size=bias_shape).astype(dtype)
- dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation,
dilation))
- c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride,
padding).astype(dtype)
-
- if add_bias:
- b_np = np.random.uniform(size=bias_shape).astype(dtype)
- c_np += b_np
- if add_relu:
- c_np = np.maximum(c_np, 0)
-
- return a_np, w_np, b_np, c_np
-
- a_np, w_np, b_np, c_np = get_ref_data()
-
- def verify_workload_padding():
- _, _, out_height, out_width = get_const_tuple(c_np.shape)
- wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype)
-
- # for testing functionality,
- # we choose arbitrary int32_lanes and num_int8_elements can divide the
channel,
- # regardless of the performance.
- int32_lanes, num_int8_elements = num_filter, in_channel
- # check if tile_ow candidates are the factors of the right output
weight.
- cfg = autotvm.get_config()
- fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes,
num_int8_elements)
- ow_tile = np.prod(cfg["tile_ow"].size)
-
- tvm.testing.assert_allclose(ow_tile, out_width)
+ in_height = in_width = in_size
+
+ A = te.placeholder((batch, in_channel, in_height, in_width), name="A",
dtype=in_dtype)
+ W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W",
dtype=in_dtype)
+
+ a_shape = get_const_tuple(A.shape)
+ w_shape = get_const_tuple(W.shape)
+ dtype = A.dtype
+ out_dtype = "int32" if in_dtype == "int8" else "uint32"
+ lo = -128 if in_dtype == "int8" else 0
+ hi = 127 if in_dtype == "int8" else 255
+
+ def check_target(target, compute, schedule, oc_block_factor,
build_only):
+ dev = tvm.device(target, 0)
+ if not tvm.testing.device_enabled(target):
+ print("Skip because %s is not enabled" % target)
+ return
+ if target == "cuda" and not
tvm.contrib.nvcc.have_int8(dev.compute_version):
+ print("Skip because int8 intrinsics are not available")
+ return
+
+ bias = te.placeholder(
+ (num_filter // oc_block_factor, 1, 1, oc_block_factor),
name="bias", dtype=out_dtype
+ )
+ bias_shape = get_const_tuple(bias.shape)
- def check_target(target):
- dev = tvm.device(target, 0)
- if not tvm.testing.device_enabled(target):
- print("Skip because %s is not enabled" % target)
- return
- if target == "cuda" and not
tvm.contrib.nvcc.have_int8(dev.compute_version):
- print("Skip because int8 intrinsics are not available")
- return
+ @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
+ def get_ref_data():
+ a_np = np.random.randint(low=lo, high=hi,
size=a_shape).astype(out_dtype)
+ w_np = np.random.randint(low=lo, high=hi,
size=w_shape).astype(out_dtype)
+ b_np = np.random.uniform(size=bias_shape).astype(out_dtype)
+ dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation,
dilation))
+ c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np,
stride, padding).astype(
+ out_dtype
+ )
+
+ # convert to NCHWc
+ _, _, out_height, out_width = c_np.shape
+ c_np = c_np.reshape(
+ (batch, num_filter // oc_block_factor, oc_block_factor,
out_height, out_width)
+ ).transpose(0, 1, 3, 4, 2)
+
+ if add_bias:
+ b_np = np.random.uniform(size=bias_shape).astype(out_dtype)
+ c_np += b_np
+ if add_relu:
+ c_np = np.maximum(c_np, 0)
+
+ return a_np, w_np, b_np, c_np
+
+ a_np, w_np, b_np, c_np = get_ref_data()
+
+ with tvm.target.Target(target):
+ C = compute(
+ A,
+ W,
+ (stride, stride),
+ padding,
+ (dilation, dilation),
+ "NCHW",
+ "NCHW",
+ out_dtype,
+ )
+ if add_bias:
+ C = topi.add(C, bias)
+ if add_relu:
+ C = topi.nn.relu(C)
+ s = schedule([C])
+
+ a = tvm.nd.array(a_np.astype(dtype), dev)
+ w = tvm.nd.array(w_np.astype(dtype), dev)
+ b = tvm.nd.array(b_np.astype(out_dtype), dev)
+ c = tvm.nd.array(np.zeros(get_const_tuple(C.shape),
dtype=C.dtype), dev)
- print("Running on target: %s" % target)
- with tvm.target.Target(target):
- C = topi.cuda.conv2d_nchw_int8(
- A, W, (stride, stride), padding, (dilation, dilation), dtype
- )
if add_bias:
- C = topi.add(C, bias)
- if add_relu:
- C = topi.nn.relu(C)
- s = topi.cuda.schedule_conv2d_nchw_int8([C])
-
- a = tvm.nd.array(a_np, dev)
- w = tvm.nd.array(w_np, dev)
- b = tvm.nd.array(b_np, dev)
- c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
dev)
- if add_bias:
- tvm.build(
- s,
- [A, W, bias, C],
- target,
- name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
- % (batch, in_channel, in_size, num_filter, kernel, stride,
padding_sum, dilation),
- )
- func = tvm.build(
- s,
- [A, W, bias, C],
- target,
- name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
- % (batch, in_channel, in_size, num_filter, kernel, stride,
padding_sum, dilation),
- )
- func(a, w, b, c)
- else:
+ compile_args = [A, W, bias, C]
+ run_args = [a, w, b, c]
+ else:
+ compile_args = [A, W, C]
+ run_args = [a, w, c]
+
func = tvm.build(
s,
- [A, W, C],
+ compile_args,
target,
name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
% (batch, in_channel, in_size, num_filter, kernel, stride,
padding_sum, dilation),
)
- func(a, w, c)
- tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
- verify_workload_padding()
+ if build_only:
+ return
- for target in ["cuda"]:
- check_target(target)
+ print("Running on target: %s" % target)
+ func(*run_args)
-@pytest.mark.parametrize("in_dtype", ["int8", "uint8"])
-def test_conv2d_nchw(in_dtype):
- with Int8Fallback():
- # ResNet18 workloads where channels in / out are multiple of
oc_block_factor
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 128, 3, 2, 1)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 128, 1, 2, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 28, 128, 3, 1, 1)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 28, 256, 3, 2, 1)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 28, 256, 1, 2, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 14, 256, 3, 1, 1)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 14, 512, 3, 2, 1)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 14, 512, 1, 2, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 512, 7, 512, 3, 1, 1)
+ tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
- # bias, relu
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1,
add_relu=True)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1,
add_bias=True)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1,
add_bias=True, add_relu=True)
+ targets = [
+ (
+ "cuda",
+ lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a,
w, s, p, d, l, o),
+ topi.cuda.schedule_conv2d_NCHWc_int8,
+ 4,
+ False,
+ ),
+ # Disable on CI since it does not support spirv int8 dot product
+ # (
+ # "vulkan -from_device=0",
+ # lambda a, w, s, p, d, l, ol, o:
topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o),
+ # topi.cuda.schedule_conv2d_NCHWc_int8,
+ # 4,
+ # False,
+ # ),
+ ]
- # dilation = 2
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, dilation=2)
+ build_only_aarch64 = platform.machine() != "aarch64"
- # batch size
- verify_conv2d_NCHWc_int8(in_dtype, 4, 64, 56, 64, 3, 1, 1)
- verify_conv2d_NCHWc_int8(in_dtype, 9, 64, 56, 64, 3, 1, 1)
+ targets.append(
+ (
+ "llvm -device arm_cpu -mtriple aarch64-linux-gnu
-mattr=+neon,+v8.2a,+dotprod",
+ topi.arm_cpu.conv2d_NCHWc_int8,
+ topi.arm_cpu.schedule_conv2d_NCHWc_int8,
+ 8,
+ build_only_aarch64,
+ )
+ )
- # weird workloads
- verify_conv2d_NCHWc_int8(in_dtype, 4, 4, 4, 8, 4, 4, 4)
+ if in_dtype == "int8":
+ targets += [
+ (
+ "llvm -device arm_cpu -mtriple aarch64-linux-gnu
-mattr=+neon",
+ topi.arm_cpu.conv2d_NCHWc_int8,
+ topi.arm_cpu.schedule_conv2d_NCHWc_int8,
+ 8,
+ build_only_aarch64,
+ ),
+ (
+ "rocm -mattr=+dotprod",
+ lambda a, w, s, p, d, l, ol, o:
topi.cuda.conv2d_NCHWc_int8(
+ a, w, s, p, d, l, o
+ ),
+ topi.cuda.schedule_conv2d_NCHWc_int8,
+ 4,
+ False,
+ ),
+ ]
+
+ for target, compute, schedule, oc_block_factor, build_only in targets:
+ check_target(target, compute, schedule, oc_block_factor,
build_only)
+
+
+# Conv2d NCHW int8 schedule testing. Internally, it uses NCHWc schedule. So,
just
+# performing basic testing - one test for all different scenarios - batch,
dilation etc..
+@pytest.mark.parametrize("in_dtype", ["int8", "uint8"])
+@pytest.mark.parametrize(
+ "params",
+ [
+ (1, 64, 56, 64, 3, 1, 1, 1, False, False),
+ (1, 64, 56, 64, 3, 1, 1, 1, False, True),
+ (1, 64, 56, 64, 3, 1, 1, 2, False, False),
+ (9, 64, 56, 64, 3, 1, 1, 1, False, False),
+ (4, 4, 4, 4, 4, 4, 4, 1, False, False),
+ (1, 32, 149, 32, 3, 1, 0, 1, False, False),
+ (7, 32, 149, 32, 3, 1, 0, 1, False, False),
+ (1, 32, 35, 64, 7, 2, (0, 0, 1, 1), 1, False, False),
+ (1, 32, 35, 64, 7, 2, (0, 0, 2, 2), 1, False, False),
+ ],
+)
+def test_conv2d_nchw_int8(in_dtype, params):
+ with Int8Fallback():
+ (
+ batch,
+ in_channel,
+ in_size,
+ num_filter,
+ kernel,
+ stride,
+ padding,
+ dilation,
+ add_bias,
+ add_relu,
+ ) = params
+ pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding,
(kernel, kernel))
+ padding_sum = pad_top + pad_left + pad_bottom + pad_right
+ print(
+ "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)"
+ % (batch, in_channel, in_size, num_filter, kernel, stride,
padding_sum, dilation)
+ )
- # inception v3 workloads where channels in / out are multiple of
oc_block_factor
- verify_conv2d_NCHWc_int8(in_dtype, 1, 32, 149, 32, 3, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 32, 147, 64, 3, 1, 1)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 73, 80, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 80, 73, 192, 3, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 35, 64, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 35, 48, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 48, 35, 64, 5, 1, 2)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 35, 96, 3, 1, 1)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 96, 35, 96, 3, 1, 1)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 35, 32, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 35, 64, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 256, 35, 48, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 288, 35, 64, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 288, 35, 48, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 288, 35, 384, 3, 2, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 96, 35, 96, 3, 2, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 768, 17, 192, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 768, 17, 128, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 128, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 192, 7, 1, 3)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 128, 7, 1, 3)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 17, 192, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 768, 17, 160, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 160, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 192, 7, 1, 3)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 160, 7, 1, 3)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 160, 17, 192, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 192, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 192, 7, 1, 3)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 320, 3, 2, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 192, 17, 192, 3, 2, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 320, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 384, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 384, 8, 384, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 384, 8, 384, 3, 1, 1)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 448, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 448, 8, 384, 3, 1, 1)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 1280, 8, 192, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 320, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 384, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 448, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 2048, 8, 192, 1, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 1024, 19, 88, 3, 1, 1)
+ in_height = in_width = in_size
- # batch > 1
- verify_conv2d_NCHWc_int8(in_dtype, 7, 32, 149, 32, 3, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 8, 32, 149, 32, 3, 1, 0)
- verify_conv2d_NCHWc_int8(in_dtype, 32, 32, 149, 32, 3, 1, 0)
+ A = te.placeholder((batch, in_channel, in_height, in_width), name="A",
dtype=in_dtype)
+ W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W",
dtype=in_dtype)
+ bias = te.placeholder((num_filter, 1, 1), name="bias", dtype=in_dtype)
- # Asymmetric padding
- verify_conv2d_NCHWc_int8(in_dtype, 1, 32, 35, 64, 7, 2, (0, 0, 1, 1))
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 128, 3, 1, (3, 3, 2, 2))
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 1, 1, (1, 2, 2, 1))
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 17, 192, 1, 1, (1, 2))
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 3, 1, (3, 1))
- verify_conv2d_NCHWc_int8(in_dtype, 1, 128, 8, 384, 3, 1, (0, 2))
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 1, 1, "VALID")
- verify_conv2d_NCHWc_int8(in_dtype, 1, 392, 8, 64, 3, 1, "VALID")
- verify_conv2d_NCHWc_int8(in_dtype, 1, 512, 19, 64, 1, 1, "SAME")
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 16, 32, 2, 1, "SAME")
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 3, 1, (1, 2, 2, 1),
add_relu=True)
- verify_conv2d_NCHWc_int8(in_dtype, 1, 64, 8, 64, 5, 2, (1, 3),
add_bias=True)
- verify_conv2d_NCHWc_int8(
- in_dtype, 1, 64, 56, 64, 3, 1, "VALID", add_bias=True,
add_relu=True
- )
- verify_conv2d_NCHWc_int8(
- in_dtype, 1, 64, 56, 64, 24, 1, "SAME", add_bias=True,
add_relu=True
- )
+ a_shape = get_const_tuple(A.shape)
+ w_shape = get_const_tuple(W.shape)
+ bias_shape = get_const_tuple(bias.shape)
+ dtype = A.dtype
+
+ @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
+ def get_ref_data():
+ a_np = np.random.randint(low=-128, high=127,
size=a_shape).astype(dtype)
+ w_np = np.random.randint(low=-128, high=128,
size=w_shape).astype(dtype)
+ b_np = np.random.uniform(size=bias_shape).astype(dtype)
+ dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation,
dilation))
+ c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride,
padding).astype(dtype)
- # Conv2d NCHW int8 schedule testing. Internally, it uses NCHWc
schedule. So, just
- # performing basic testing - one test for all different scenarios -
batch, dilation etc..
- verify_conv2d_nchw_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1)
- verify_conv2d_nchw_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1,
add_relu=True)
- verify_conv2d_nchw_int8(in_dtype, 1, 64, 56, 64, 3, 1, 1, dilation=2)
- verify_conv2d_nchw_int8(in_dtype, 9, 64, 56, 64, 3, 1, 1)
- verify_conv2d_nchw_int8(in_dtype, 4, 4, 4, 4, 4, 4, 4)
- verify_conv2d_nchw_int8(in_dtype, 1, 32, 149, 32, 3, 1, 0)
- verify_conv2d_nchw_int8(in_dtype, 7, 32, 149, 32, 3, 1, 0)
- verify_conv2d_nchw_int8(in_dtype, 1, 32, 35, 64, 7, 2, (0, 0, 1, 1))
- verify_conv2d_nchw_int8(in_dtype, 1, 32, 35, 64, 7, 2, (0, 0, 2, 2))
+ if add_bias:
+ b_np = np.random.uniform(size=bias_shape).astype(dtype)
+ c_np += b_np
+ if add_relu:
+ c_np = np.maximum(c_np, 0)
+ return a_np, w_np, b_np, c_np
-def test_conv2d_nhwc():
- with Int8Fallback():
- # Subset of inception v3 expanded (dilation > 1, batch > 1, 'VALID'
padding)
- verify_conv2d_NHWC_gemm_int8(1, 3, 299, 32, 3, 2, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 32, 149, 32, 3, 1, "SAME", dilation=2)
- verify_conv2d_NHWC_gemm_int8(4, 32, 147, 64, 3, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 64, 73, 80, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 80, 73, 192, 3, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 192, 35, 48, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 192, 35, 64, 1, 1, "VALID")
- verify_conv2d_NHWC_gemm_int8(1, 192, 35, 32, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 48, 35, 64, 5, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 96, 35, 96, 3, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 256, 35, 48, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 256, 35, 64, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 288, 35, 64, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 288, 35, 48, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 96, 35, 96, 3, 2, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 128, 17, 192, 7, 1, "SAME", dilation=2)
- verify_conv2d_NHWC_gemm_int8(1, 160, 17, 160, 7, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 160, 17, 192, 1, 1, "VALID")
- verify_conv2d_NHWC_gemm_int8(1, 192, 17, 192, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 768, 5, 128, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 192, 17, 320, 3, 2, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 192, 17, 192, 3, 2, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 192, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 384, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 320, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 1280, 8, 448, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 384, 8, 384, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 384, 8, 384, 3, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 448, 8, 384, 3, 1, "VALID")
- verify_conv2d_NHWC_gemm_int8(1, 2048, 8, 320, 1, 1, "SAME")
- verify_conv2d_NHWC_gemm_int8(1, 2048, 8, 448, 1, 1, "SAME",
add_bias=True, add_relu=True)
- verify_conv2d_NHWC_gemm_int8(1, 2048, 8, 192, 1, 1, "SAME",
add_bias=True)
-
- # Let's also verify that it compiles fine on AArch64 targets
- compile_conv2d_NHWC_gemm_int8_arm(1, 3, 299, 32, 3, 2, "SAME")
+ a_np, w_np, b_np, c_np = get_ref_data()
+
+ def verify_workload_padding():
+ _, _, out_height, out_width = get_const_tuple(c_np.shape)
+ wkl = _get_workload(A, W, (stride, stride), padding, dilation,
dtype)
+
+ # for testing functionality,
+ # we choose arbitrary int32_lanes and num_int8_elements can divide
the channel,
+ # regardless of the performance.
+ int32_lanes, num_int8_elements = num_filter, in_channel
+
+ # check if tile_ow candidates are the factors of the right output
weight.
+ cfg = autotvm.get_config()
+ fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes,
num_int8_elements)
+ ow_tile = np.prod(cfg["tile_ow"].size)
+
+ tvm.testing.assert_allclose(ow_tile, out_width)
+
+ def check_target(target):
+ dev = tvm.device(target, 0)
+ if not tvm.testing.device_enabled(target):
+ print("Skip because %s is not enabled" % target)
+ return
+ if target == "cuda" and not
tvm.contrib.nvcc.have_int8(dev.compute_version):
+ print("Skip because int8 intrinsics are not available")
+ return
+
+ print("Running on target: %s" % target)
+ with tvm.target.Target(target):
+ C = topi.cuda.conv2d_nchw_int8(
+ A, W, (stride, stride), padding, (dilation, dilation),
dtype
+ )
+ if add_bias:
+ C = topi.add(C, bias)
+ if add_relu:
+ C = topi.nn.relu(C)
+ s = topi.cuda.schedule_conv2d_nchw_int8([C])
+
+ a = tvm.nd.array(a_np, dev)
+ w = tvm.nd.array(w_np, dev)
+ b = tvm.nd.array(b_np, dev)
+ c = tvm.nd.array(np.zeros(get_const_tuple(C.shape),
dtype=C.dtype), dev)
+ if add_bias:
+ func = tvm.build(
+ s,
+ [A, W, bias, C],
+ target,
+ name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
+ % (
+ batch,
+ in_channel,
+ in_size,
+ num_filter,
+ kernel,
+ stride,
+ padding_sum,
+ dilation,
+ ),
+ )
+ func(a, w, b, c)
+ else:
+ func = tvm.build(
+ s,
+ [A, W, C],
+ target,
+ name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
+ % (
+ batch,
+ in_channel,
+ in_size,
+ num_filter,
+ kernel,
+ stride,
+ padding_sum,
+ dilation,
+ ),
+ )
+ func(a, w, c)
+ tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
+
+ verify_workload_padding()
+
+ for target in ["cuda"]:
+ check_target(target)
Review Comment:
Done
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]