Issue 81112
Summary [AArch64][SVE] Cannot be vectorized, but GCC can vectorize.(TSVC s235)
Labels new issue
Assignees
Reporter m-saito-fj
    Clang cannot vectorize TSVC s235 using SVE, but GCC 13.2.0 can.

Option:
`-Ofast -march=armv8.2-a+sve`

```c
/* Extent of the one-dimensional arrays a..e. */
#define LEN 32000
/* Extent of each dimension of the square two-dimensional arrays aa..dd;
 * also the trip-count bound of both the i and j loops in s235(). */
#define LEN2 256
/* Scales the repetition count of the outermost loop in s235(). */
static int ntimes = 200000;

/* Global working arrays. s235() reads b, c and bb, and updates a and aa;
 * d, e and cc are only passed through to dummy(); dd is not referenced
 * by the code shown here. */
float a[LEN], b[LEN], c[LEN], d[LEN], e[LEN];
float aa[LEN2][LEN2], bb[LEN2][LEN2], cc[LEN2][LEN2], dd[LEN2][LEN2];

/* External function, declared but not defined in this snippet. s235() calls
 * it once per repetition with the global arrays.
 * NOTE(review): presumably an opaque call that keeps the compiler from
 * collapsing the repeated kernel executions -- confirm against the TSVC
 * sources. */
int dummy(float[LEN], float[LEN], float[LEN], float[LEN], float[LEN],
          float[LEN2][LEN2], float[LEN2][LEN2], float[LEN2][LEN2], float);

/*
 * Reproducer kernel (TSVC s235).
 *
 * For every repetition and every column index i:
 *   1. a[i] is updated with b[i] * c[i];
 *   2. column i of aa is recomputed top to bottom, where each element
 *      aa[j][i] is derived from the element directly above it, aa[j-1][i].
 *
 * The j loop therefore carries a dependence from one iteration to the next
 * (aa[j-1][i] was written by the previous j iteration), while different
 * values of i touch disjoint elements of a and aa.
 *
 * Always returns 0.
 */
int s235()
{
        /* Repetition loop; ntimes/LEN2 is an integer division. */
        for (int nl = 0; nl < 200*(ntimes/LEN2); nl++) {
                /* Outer kernel loop over columns. */
                for (int i = 0; i < LEN2; i++) {
                        a[i] += b[i] * c[i];
                        /* Starts at j = 1 because row j-1 is read; consecutive
                         * j iterations access aa and bb with a stride of one
                         * full row (LEN2 floats). */
 for (int j = 1; j < LEN2; j++) {
 aa[j][i] = aa[j-1][i] + bb[j][i] * a[i];
                        }
 }
                /* Call into the externally defined dummy() after each full
                 * sweep over i. The literal 0. is a double converted to the
                 * float parameter. */
                dummy(a, b, c, d, e, aa, bb, cc, 0.);
 }
        return 0;
}
```

See also (Clang vs GCC):
https://godbolt.org/z/KeeK58oz7

GCC result:
```asm
.L4:
        add     x8, x9, 1024
        mov x0, 0
        lsl     x11, x12, 2
        add     x13, x20, x11
 add     x15, x1, x11
        ld1w    z1.s, p0/z, [x13]
        ld1w z0.s, p0/z, [x15]
        add     x11, x2, x11
        ld1w    z2.s, p0/z, [x11]
        fmad    z2.s, p1/m, z0.s, z1.s
        st1w z2.s, p0, [x13]
.L3:
        ld1w    z1.s, p0/z, [x9, x0, lsl 2]
 ld1w    z0.s, p0/z, [x10, x0, lsl 2]
        fmad    z0.s, p1/m, z2.s, z1.s
        st1w    z0.s, p0, [x8, x0, lsl 2]
        add     x0, x0, 256
        cmp     x0, x19
        bne     .L3
        add     x12, x12, x16
        add     x9, x9, x21
        add     x10, x10, x21
 whilelo p0.s, w12, w14
        b.any   .L4
```
Looking at this result, it appears to me that GCC vectorized the outer loop over `i`.

`-mllvm -debug-only=loop-vectorize` messages:
```
LV: Checking a loop in 's235' from s235.c:19:4
LV: Loop hints: force=? width=vscale x 0 interleave=0
LV: Found a loop: for.body13
LV: Not vectorizing: Found an unidentified PHI   %3 = phi float [ %.pre, %for.body4 ], [ %add25, %for.body13 ], !dbg !30
LV: Interleaving disabled by the pass manager
LV: Can't vectorize the instructions or CFG
```
LLVM does not appear to be able to vectorize loops of the form found in s235.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to