lhutton1 commented on code in PR #16899:
URL: https://github.com/apache/tvm/pull/16899#discussion_r1572098369
##########
python/tvm/relay/op/strategy/arm_cpu.py:
##########
@@ -252,6 +252,17 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type,
target):
)
# Non-quantized cases
if is_aarch64 and data.dtype in ["float32", "float16"]:
+ if target.features.has_sve:
+ # This strategy is currently suboptimal because of
LLVM's limited support
+ # for scalable vector alias analysis, which causes
redundant loads / stores
Review Comment:
Might be a good idea to add the version of LLVM it was tested with, just to
give an indication in the future for when it might be worth trying again
##########
tests/python/topi/test_topi_conv2d_nhwc.py:
##########
@@ -57,11 +57,17 @@
topi.arm_cpu.compute_conv2d_NHWC_hybrid,
topi.arm_cpu.schedule_conv2d_NHWC_hybrid,
),
+ (
+ "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.6a,+sve",
+ topi.arm_cpu.compute_conv2d_NHWC_hybrid_SVE,
+ topi.arm_cpu.schedule_conv2d_NHWC_hybrid_SVE,
+ ),
)
dtype = tvm.testing.parameter("float32")
batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation =
tvm.testing.parameters(
+ (1, 1, 3, 15, 1, 1, "SAME", 1),
Review Comment:
I think it would be good to add at least one more test for a case that needs
padding
##########
python/tvm/topi/arm_cpu/conv2d_gemm.py:
##########
@@ -322,10 +326,24 @@ def compute_conv2d_gemm_without_weight_transform(
tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1]
- tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1]
)
+ elif use_scalable_vectors:
+ assert len(B_interleaved_t.shape) == 2
+ C = te.compute(
+ (batches, M_padded, N_padded),
+ lambda b, x, y: te.sum(
+ A[b, x, k].astype(in_dtype) * B_interleaved_t[k,
y].astype(in_dtype),
+ axis=k,
+ ),
+ name="C",
+ )
+ # Ensure padding on the N axis does not get removed during tir passes
+ # by adding a dummy reference to the specific padded area of the result
+ zero = (
+ tvm.tir.const(1, C.dtype) * C[0, 0, N_padded - 1]
+ - tvm.tir.const(1, C.dtype) * C[0, 0, N_padded - 1]
+ )
else:
- # Configuration space
- configure_knobs(cfg, M_padded, K_padded, target)
Review Comment:
Was removing this intentional?
##########
python/tvm/topi/arm_cpu/conv2d_gemm.py:
##########
@@ -478,23 +498,21 @@ def schedule_conv2d_gemm_native(cfg, s, out, final_out):
s[C].tensorize(y_inner, gemm_acc)
s[C].parallel(x_outer)
else:
- k_outer, k_inner = s[C].split(k, 4)
- x_outer, y_outer, x_inner, y_inner = s[C].tile(x, y, x_factor=4,
y_factor=y_tile_size)
- y_inner_outer, y_inner_inner = s[C].split(y_inner, nparts=4)
Review Comment:
Curious, does removing this change the schedule when we have `+neon`? How
does it impact performance?
##########
python/tvm/topi/arm_cpu/conv2d_gemm.py:
##########
@@ -478,23 +498,21 @@ def schedule_conv2d_gemm_native(cfg, s, out, final_out):
s[C].tensorize(y_inner, gemm_acc)
s[C].parallel(x_outer)
else:
- k_outer, k_inner = s[C].split(k, 4)
- x_outer, y_outer, x_inner, y_inner = s[C].tile(x, y, x_factor=4,
y_factor=y_tile_size)
- y_inner_outer, y_inner_inner = s[C].split(y_inner, nparts=4)
+ k_outer, k_inner = s[C].split(k, factor=4)
+ x_outer, x_inner = s[C].split(x, factor=4)
+ y_outer, y_inner = s[C].split(y, factor=y_tile_size,
disable_predication=True)
Review Comment:
Just wanted to check, "C" is guaranteed to be padded?
##########
python/tvm/topi/arm_cpu/conv2d_gemm.py:
##########
@@ -164,6 +159,7 @@ def compute_conv2d_gemm_without_weight_transform(
tile_K_A = 4
Review Comment:
Is there a function for these padding cases similar to
`get_tiling_B_transformed`? If not, is it worth creating one?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]