Issue 71511
Summary [AArch64] Missed vectorisation opportunity (tsvc, s112)
Labels backend:AArch64, vectorization
Assignees
Reporter sjoerdmeijer
    With Clang top of tree, we are about 35% behind on our AArch64 platform compared to GCC12 for the s112 kernel from TSVC: GCC vectorises the kernel, but Clang doesn't. Clang seems to think it's not worthwhile vectorising this input with -O3 -ffast-math -mcpu=neoverse-v2:

```
__attribute__((aligned(64))) float x[32000];

__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
 aa[256][256],bb[256][256],cc[256][256],tt[256][256];

int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

float s112()
{
    for (int nl = 0; nl < 3*100000; nl++) {
        for (int i = 32000 - 2; i >= 0; i--) {
            a[i+1] = a[i] + b[i];
 }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
 }
}
```

Clang's codegen:

```
.LBB0_3: //   Parent Loop BB0_2 Depth=1
        add     x10, x19, x8
        ldr     s0, [x9, #-8]!
        sub     x8, x8, #8
 ldur    s1, [x10, #-4]
        fadd    s2, s1, s0
        ldur    s0, [x9, #-4]
        ldr     s1, [x19, x8]
        fadd    s0, s1, s0
 stp     s0, s2, [x9]
        cbnz    x8, .LBB0_3
```

whereas GCC generates:

```
.L4:
 ldr     q30, [x27, x0]
        ldr     q28, [x20, x0]
        mov v31.16b, v30.16b
        mov     v29.16b, v28.16b
        tbl v30.16b, {v30.16b - v31.16b}, v27.16b
        tbl     v28.16b, {v28.16b - v29.16b}, v27.16b
        fadd    v30.4s, v30.4s, v28.4s
        mov v31.16b, v30.16b
        tbl     v30.16b, {v30.16b - v31.16b}, v27.16b
        str     q30, [x19, x0]
        sub     x0, x0, #16
 cmp     x0, x28
        bne     .L4
```

See https://godbolt.org/z/6cfK9sbj5 for the reproducer (and this codegen).


_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to