| Issue |
71512
|
| Summary |
[AArch64] suboptimal vectorisation (tsvc, s128)
|
| Labels |
backend:AArch64,
new issue,
vectorization
|
| Assignees |
|
| Reporter |
sjoerdmeijer
|
Clang (tip of tree) is 120% behind GCC on kernel s128 from TSVC. To reproduce, compile the following input with `-O3 -ffast-math -mcpu=neoverse-v2`:
```
/* Global data for the reproducer, declared with __attribute__((aligned(64))).
 * x and tt are not referenced anywhere else in this snippet; e and the 2-D
 * arrays aa, bb, cc are only passed through to dummy(). */
__attribute__((aligned(64))) float x[32000];
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
aa[256][256],bb[256][256],cc[256][256],tt[256][256];
/* Declared here but not defined in this snippet; s128 calls it once per
 * outer-loop iteration. */
int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);
/* Reproducer kernel s128 (from TSVC, per the report text).
 *
 * The inner loop walks a[] and d[] with unit stride (index i) while b[] and
 * c[] are accessed through k, which advances by 2 per iteration (k = 2*i),
 * i.e. only the even-indexed elements of b[] and c[] are touched.
 *
 * NOTE(review): func_args is unused, struct args_t is not declared in this
 * snippet, and the function has no return statement despite returning float.
 * Left as-is so the reproducer stays identical to the reported input. */
float s128(struct args_t * func_args)
{
int j, k;
/* Outer repetition loop: 200000 passes over the same inner kernel. */
for (int nl = 0; nl < 2*100000; nl++) {
/* Reset so that the first k computed below is 0. */
j = -1;
for (int i = 0; i < 32000/2; i++) {
/* k takes the values 0, 2, 4, ... because j is bumped again below. */
k = j + 1;
/* Stride-2 load from b[], unit-stride load from d[] and store to a[]. */
a[i] = b[k] - d[i];
j = k + 1;
/* Stride-2 store back to b[k], using the a[i] just computed and a stride-2 load from c[]. */
b[k] = a[i] + c[k];
}
/* Presumably an opaque call that keeps the compiler from discarding or
 * collapsing the repeated passes -- dummy() is not defined here; confirm. */
dummy(a, b, c, d, e, aa, bb, cc, 1.);
}
}
```
GCC's codegen:
```
.L3:
ld2 {v28.4s - v29.4s}, [x0]
mov x6, x0
add x5, x0, 16
add x4, x0, 24
add x0, x0, 32
ldr q27, [x8], 16
ld2 {v30.4s - v31.4s}, [x7], 32
fsub v28.4s, v28.4s, v27.4s
fadd v30.4s, v28.4s, v30.4s
str q28, [x9], 16
str s30, [x6], 8
st1 {v30.s}[1], [x6]
st1 {v30.s}[2], [x5]
st1 {v30.s}[3], [x4]
cmp x7, x23
bne .L3
```
Clang's codegen:
```
.LBB0_2: // Parent Loop BB0_1 Depth=1
mov z2.d, z0.d
add z2.d, z2.d, #1 // =0x1
adr z3.d, [z7.d, z2.d, lsl #2]
fmov x9, d3
ld2w { z3.s, z4.s }, p0/z, [x9]
ld1w { z5.s }, p0/z, [x21, x8, lsl #2]
fmov x9, d2
fsub z3.s, z3.s, z5.s
st1w { z3.s }, p0, [x22, x8, lsl #2]
add x8, x8, x28
ld2w { z4.s, z5.s }, p0/z, [x20, x9, lsl #2]
add x9, x19, #4
cmp x23, x8
fadd z2.s, z4.s, z3.s
uunpklo z3.d, z2.s
uunpkhi z2.d, z2.s
st1w { z3.d }, p1, [x9, z0.d, lsl #2]
add z0.d, z0.d, z6.d
st1w { z2.d }, p1, [x9, z1.d, lsl #2]
add z1.d, z1.d, z6.d
b.ne .LBB0_2
```
See also:
https://godbolt.org/z/154McGMve
TODO: root-cause analysis is still outstanding.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs