https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122069
Bug ID: 122069
Summary: Missed use of UDOT for char->int reduction
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: ktkachov at gcc dot gnu.org
CC: tnfchris at gcc dot gnu.org
Target Milestone: ---
Target: aarch64
Testcase:
/* Return i when it is non-negative, otherwise its negation.
   The negation is evaluated in int (integer promotion) and converted back to
   char by the return type.
   NOTE(review): if plain char is unsigned for the target (presumably the case
   for the aarch64 target named in this report -- confirm), `i < 0` is never
   true and the function returns i unchanged. */
inline char char_abs(char i) {
return (i < 0 ? -i : i);
}
/* Reduction testcase: accumulate char_abs(x[i] - y[i]) into an int over a
   fixed trip count of 8000 elements.
   x, y: byte arrays; the loop reads elements 0..7999 of each.
   Returns the accumulated sum. */
int foo_int(unsigned char *x, unsigned char *y) {
int sum = 0;
for (int i = 0; i < 8000; i++)
/* x[i] - y[i] is computed in int, then narrowed to char by the call. */
sum += char_abs(x[i] - y[i]);
return sum;
}
With the options -O3 -mcpu=grace -mllvm -force-vector-interleave=1,
Clang for AArch64 can produce:
// Clang output for foo_int: one 16-byte vector of each input per iteration,
// reduced into a single 4 x 32-bit accumulator with UDOT.
// In:  x0 = x, x1 = y.  Out: w0 = sum.
foo_int(unsigned char*, unsigned char*):
movi v1.16b, #1                 // v1 = sixteen bytes of 1 (UDOT multiplier)
mov w8, #8000                   // elements remaining
movi v0.2d, #0000000000000000   // zero the accumulator
.LBB0_1:
ldr q2, [x0], #16               // 16 bytes of x, post-increment pointer
ldr q3, [x1], #16               // 16 bytes of y, post-increment pointer
subs x8, x8, #16                // 16 elements consumed; sets flags for b.ne
sub v2.16b, v2.16b, v3.16b      // bytewise x[i] - y[i]
// Dot product of the difference bytes with the all-ones vector: each group of
// four bytes is summed into the corresponding 32-bit accumulator lane.
// NOTE(review): the operands here use the SVE z-register spelling while every
// other instruction names the same registers as v0/v1/v2 -- confirm against
// the Compiler Explorer link in this report.
udot z0.s, z2.b, z1.b
b.ne .LBB0_1
addv s0, v0.4s                  // horizontal add of the four 32-bit lanes
fmov w0, s0                     // move the scalar result to the return register
ret
GCC seems to miss this UDOT opportunity and instead generates:
// GCC output for foo_int: 64 bytes of each input per iteration, widened
// byte -> halfword -> word and accumulated into sixteen 4 x 32-bit registers.
// In:  x0 = x, x1 = y.  Out: w0 = sum.
// Accumulators: v7, v16-v30.  v31 holds zero for the whole loop.
foo_int(unsigned char*, unsigned char*):
movi v30.4s, 0                  // zero; every other accumulator is copied from this
mov x2, 8000
str d15, [sp, -16]!             // v15 is used below; save d15 (low 64 bits are callee-saved)
add x2, x1, x2                  // x2 = y + 8000, the loop end pointer
mov v31.16b, v30.16b            // v31 = 0, used as the zero operand of zip1/zip2
mov v29.16b, v30.16b
mov v28.16b, v30.16b
mov v27.16b, v30.16b
mov v26.16b, v30.16b
mov v25.16b, v30.16b
mov v24.16b, v30.16b
mov v23.16b, v30.16b
mov v22.16b, v30.16b
mov v21.16b, v30.16b
mov v20.16b, v30.16b
mov v19.16b, v30.16b
mov v18.16b, v30.16b
mov v17.16b, v30.16b
mov v16.16b, v30.16b
mov v7.16b, v30.16b
.L2:
// Load four 16-byte vectors from x and four from y, then advance both pointers.
ldp q6, q4, [x0]
ldp q2, q0, [x0, 32]
add x0, x0, 64
ldp q5, q3, [x1]
ldp q1, q15, [x1, 32]
add x1, x1, 64
// Bytewise x[i] - y[i] for the four vectors.
sub v5.16b, v6.16b, v5.16b
sub v3.16b, v4.16b, v3.16b
sub v1.16b, v2.16b, v1.16b
sub v15.16b, v0.16b, v15.16b
// Interleave the low eight difference bytes with zero bytes from v31, giving
// eight halfword lanes that each hold one difference byte paired with a zero.
zip1 v6.16b, v5.16b, v31.16b
zip1 v4.16b, v3.16b, v31.16b
zip1 v2.16b, v1.16b, v31.16b
zip1 v0.16b, v15.16b, v31.16b
// Same for the high eight difference bytes.
zip2 v5.16b, v5.16b, v31.16b
zip2 v3.16b, v3.16b, v31.16b
zip2 v1.16b, v1.16b, v31.16b
zip2 v15.16b, v15.16b, v31.16b
// Widen each group of four halfwords to 32 bits and add it to its own
// accumulator (uaddw = low four halfwords, uaddw2 = high four).
uaddw v7.4s, v7.4s, v6.4h
uaddw2 v16.4s, v16.4s, v6.8h
uaddw v17.4s, v17.4s, v5.4h
uaddw2 v18.4s, v18.4s, v5.8h
uaddw v19.4s, v19.4s, v4.4h
uaddw2 v20.4s, v20.4s, v4.8h
uaddw v21.4s, v21.4s, v3.4h
uaddw2 v22.4s, v22.4s, v3.8h
uaddw v23.4s, v23.4s, v2.4h
uaddw2 v24.4s, v24.4s, v2.8h
uaddw v25.4s, v25.4s, v1.4h
uaddw2 v26.4s, v26.4s, v1.8h
uaddw v27.4s, v27.4s, v0.4h
uaddw2 v28.4s, v28.4s, v0.8h
uaddw v29.4s, v29.4s, v15.4h
uaddw2 v30.4s, v30.4s, v15.8h
cmp x1, x2                      // reached y + 8000?
bne .L2
// Epilogue: fold the sixteen accumulators into v7, restoring d15 along the way.
add v7.4s, v7.4s, v16.4s
ldr d15, [sp], 16               // restore the saved d15 and pop the stack slot
add v7.4s, v7.4s, v17.4s
add v7.4s, v7.4s, v18.4s
add v7.4s, v7.4s, v19.4s
add v7.4s, v7.4s, v20.4s
add v7.4s, v7.4s, v21.4s
add v7.4s, v7.4s, v22.4s
add v7.4s, v7.4s, v23.4s
add v7.4s, v7.4s, v24.4s
add v7.4s, v7.4s, v25.4s
add v7.4s, v7.4s, v26.4s
add v7.4s, v7.4s, v27.4s
add v7.4s, v7.4s, v28.4s
add v7.4s, v7.4s, v29.4s
add v7.4s, v7.4s, v30.4s
addv s31, v7.4s                 // horizontal add of the four 32-bit lanes
fmov w0, s31                    // move the scalar result to the return register
ret
https://godbolt.org/z/Eq66dvoeK