This is an automated email from the ASF dual-hosted git repository.
ekalda pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
     new 93356cdcbe [Bugfix][TOPI] Fix a bug in arm_cpu int8 conv2d i8mm schedule (#15484)
93356cdcbe is described below
commit 93356cdcbe2160b90827fdcb4c4a88f765d6dd03
Author: Andrei Hutu <[email protected]>
AuthorDate: Mon Aug 7 09:39:52 2023 +0100
[Bugfix][TOPI] Fix a bug in arm_cpu int8 conv2d i8mm schedule (#15484)
`topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved` was failing
compilation with the `+i8mm` extension enabled whenever the output height and
output width were both equal to 1, such that OH x OW = 1.
Padding was being removed during the `tir.BufferShapeLegalize` pass,
causing an error in the `tir.BufferBindUnwrapper` pass. Some of the removed
padding was necessary for tensorize (using the `gemm_acc_2x2_int8_int8_int32`
intrinsic), which expects 2x2 output tiles. However, because of the
optimisations mentioned above, the output tensor `C_interleaved` was reduced to
having 1x2 tiles instead.
e.g. for A = [1x1x1x8], W = [1x1x8x24], C = [1x1x1x24]:
- Before fix:
`C_interleaved = T.Buffer((1, 1, 2, 1, 6, 1, 2), "int32")`
- After fix:
`C_interleaved = T.Buffer((1, 1, 2, 1, 6, 2, 2), "int32")`
To make sure the required padding is left untouched, while the rest of it
is still removed, a dummy reference to the needed axis is declared.
Finally, the leftover padding is still disregarded when computing the final
output tensor `C`.
---
python/tvm/topi/arm_cpu/conv2d_gemm.py | 45 ++++++++++++++++++-----
tests/python/topi/python/test_topi_conv2d_int8.py | 11 +++---
2 files changed, 41 insertions(+), 15 deletions(-)
diff --git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py
index 04748a4d81..ea9026688e 100644
--- a/python/tvm/topi/arm_cpu/conv2d_gemm.py
+++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py
@@ -211,18 +211,45 @@ def compute_conv2d_gemm_without_weight_transform(
),
name="C_interleaved",
)
+        # Ensure the padding needed for tensorize does not get removed during tir passes
+        # by adding a dummy reference to the specific padded area of the result
+ zero = (
+ tvm.tir.const(1, C_interleaved.dtype)
+ * C_interleaved[
+ batches - 1,
+ M // tile_rows_A,
+ N_transformed - 1,
+ idxm(M, tile_rows_A) // 2,
+ tile_rows_B // 2 - 1,
+ 1,
+ 1,
+ ]
+ - tvm.tir.const(1, C_interleaved.dtype)
+ * C_interleaved[
+ batches - 1,
+ M // tile_rows_A,
+ N_transformed - 1,
+ idxm(M, tile_rows_A) // 2,
+ tile_rows_B // 2 - 1,
+ 1,
+ 1,
+ ]
+ )
# Unpack the result
C = te.compute(
(batches, M, N),
- lambda b, x, y: C_interleaved[
- b,
- x // tile_rows_A,
- y // tile_rows_B,
- idxm(x, tile_rows_A) // 2,
- idxm(y, tile_rows_B) // 2,
- idxm(idxm(x, tile_rows_A), 2),
- idxm(idxm(y, tile_rows_B), 2),
- ].astype(out_dtype),
+ lambda b, x, y: (
+ C_interleaved[
+ b,
+ x // tile_rows_A,
+ y // tile_rows_B,
+ idxm(x, tile_rows_A) // 2,
+ idxm(y, tile_rows_B) // 2,
+ idxm(idxm(x, tile_rows_A), 2),
+ idxm(idxm(y, tile_rows_B), 2),
+ ]
+ + zero
+ ).astype(out_dtype),
name="C",
)
else:
diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py
index e05dba3dfe..fd101fb797 100644
--- a/tests/python/topi/python/test_topi_conv2d_int8.py
+++ b/tests/python/topi/python/test_topi_conv2d_int8.py
@@ -57,12 +57,11 @@ devices = [
topi.arm_cpu.compute_conv2d_NHWC_quantized_native,
topi.arm_cpu.schedule_conv2d_NHWC_quantized_native,
),
- # TODO(giuseros) We need LLVM-11 in order to compile with +i8mm extension
- # (
- # "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm",
- # topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
- # topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
- # ),
+ (
+ "llvm --device arm_cpu --mtriple aarch64-linux-gnu
-mattr=+v8.2a,+i8mm",
+ topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
+ topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
+ ),
]