Issue 55438
Summary [AArch64][SVE] Bad code generation of llvm.fmuladd.* for SVE
Labels new issue
Assignees
Reporter jsetoain
    I'm trying to compile code that does this:
c += a * broadcast(b[0])

For fixed length vectors, if I use this:
```
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #3

define void @fp_convert_combine_crash(float *%arg0, float *%arg1, float* %arg2) #0 {
        %ap = bitcast float* %arg0 to <4 x float>*
        %bp = bitcast float* %arg1 to <4 x float>*
        %cp = bitcast float* %arg2 to <4 x float>*

        %a = load <4 x float>, <4 x float>* %ap
        %b = load <4 x float>, <4 x float>* %bp
        %c = load <4 x float>, <4 x float>* %cp

        %b0splat = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer

        %mad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b0splat, <4 x float> %c) #3

        store <4 x float> %mad, <4 x float>* %cp
        ret void
}

```
And, when I compile it, I get this assembly:
```
    ldr q0, [x1]                   // Load b
    ldr q1, [x0]                   // Load a
    ldr q2, [x2]                   // Load c
    fmla    v2.4s, v1.4s, v0.s[0]  // mad = c + a * splat(b[0])
    str q2, [x2]                   // store mad in c
    ret

```
This looks good to me, although one might argue that it should not need to load the whole vector `b` when it only splats the first element.
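
For what it's worth, a hand-written sketch of what that could look like, loading only the scalar `b[0]` and keeping the same indexed `fmla` (not verified compiler output):
```
    ldr s0, [x1]                   // Load only b[0] (lane 0 of v0)
    ldr q1, [x0]                   // Load a
    ldr q2, [x2]                   // Load c
    fmla    v2.4s, v1.4s, v0.s[0]  // mad = c + a * splat(b[0])
    str q2, [x2]                   // store mad in c
    ret
```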

But, if I write a scalable version of the same code:
```
declare <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>) #3

define void @fp_convert_combine_crash(float *%arg0, float *%arg1, float* %arg2) #0 {
        %ap = bitcast float* %arg0 to <vscale x 4 x float>*
        %bp = bitcast float* %arg1 to <vscale x 4 x float>*
        %cp = bitcast float* %arg2 to <vscale x 4 x float>*

        %a = load <vscale x 4 x float>, <vscale x 4 x float>* %ap
        %b = load <vscale x 4 x float>, <vscale x 4 x float>* %bp
        %c = load <vscale x 4 x float>, <vscale x 4 x float>* %cp

        %b0splat = shufflevector <vscale x 4 x float> %b, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer

        %mad = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b0splat, <vscale x 4 x float> %c) #3

        store <vscale x 4 x float> %mad, <vscale x 4 x float>* %cp
        ret void
}
```

I obtain this:

```
    ptrue   p0.s
    ld1w    { z0.s }, p0/z, [x0]    // Load a
    ld1w    { z1.s }, p0/z, [x1]    // Load b
    ld1w    { z2.s }, p0/z, [x2]    // Load c
    mov z1.s, s1                    // Splat b[0]
    fmad    z0.s, p0/m, z1.s, z2.s  // mad = a + (splat b[0]) * c
    st1w    { z0.s }, p0, [x2]      // store mad in c
    ret
```

This is semantically incorrect (the arguments of `fmuladd` are read in the wrong order), and instead of using an indexed multiply-accumulate, as in the fixed-length case, it goes through an explicit splat, which might have performance implications.
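
What I would expect is something closer to the fixed-length output, using the indexed form of the SVE multiply-accumulate. A hand-written sketch of that (not verified compiler output):
```
    ptrue   p0.s
    ld1w    { z0.s }, p0/z, [x0]    // Load a
    ld1w    { z1.s }, p0/z, [x1]    // Load b
    ld1w    { z2.s }, p0/z, [x2]    // Load c
    fmla    z2.s, z0.s, z1.s[0]     // mad = c + a * splat(b[0]), indexed, no explicit splat
    st1w    { z2.s }, p0, [x2]      // store mad in c
    ret
```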

Additionally, if I try to use vscale_range to generate SVE from fixed-length vector code:

```
declare <8 x float> @llvm.fmuladd.v8f32(<8 x float>, <8 x float>, <8 x float>) #3

define void @fp_convert_combine_crash(float *%arg0, float *%arg1, float* %arg2) #0 {
        %ap = bitcast float* %arg0 to <8 x float>*
        %bp = bitcast float* %arg1 to <8 x float>*
        %cp = bitcast float* %arg2 to <8 x float>*

        %a = load <8 x float>, <8 x float>* %ap
        %b = load <8 x float>, <8 x float>* %bp
        %c = load <8 x float>, <8 x float>* %cp

        %b0splat = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer

        %mad = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> %a, <8 x float> %b0splat, <8 x float> %c) #3

        store <8 x float> %mad, <8 x float>* %cp
        ret void
}

attributes #0 = { vscale_range(2,2) "target-features"="+sve" }
attributes #3 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
```

I get this:
```
    stp x29, x30, [sp, #-16]!           // 16-byte Folded Spill
    mov x29, sp
    sub x9, sp, #48
    and sp, x9, #0xffffffffffffffe0
    ptrue   p0.s
    ld1w    { z0.s }, p0/z, [x0]    // Load a
    ld1w    { z1.s }, p0/z, [x1]    // Load b
    ld1w    { z2.s }, p0/z, [x2]    // Load c
    stp s1, s1, [sp, #24]           // Splat b[0] to the stack
    stp s1, s1, [sp, #16]           // Splat b[0] to the stack
    stp s1, s1, [sp, #8]            // Splat b[0] to the stack
    stp s1, s1, [sp]                // Splat b[0] to the stack
    ld1w    { z1.s }, p0/z, [sp]    // Load splatted b[0] from the stack
    fmad    z0.s, p0/m, z1.s, z2.s  // mad = splat(b[0]) * c + a
    st1w    { z0.s }, p0, [x2]      // Store mad in c
    mov sp, x29
    ldp x29, x30, [sp], #16             // 16-byte Folded Reload
    ret
```

This introduces additional performance issues by doing the splat through the stack instead of simply using `mov z1.s, s1`, or something to that effect.
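
Something along the lines of the scalable-vector output above, with the splat kept in a register, is what I would expect here instead (hand-written sketch, not compiler output):
```
    ptrue   p0.s
    ld1w    { z0.s }, p0/z, [x0]    // Load a
    ld1w    { z1.s }, p0/z, [x1]    // Load b
    ld1w    { z2.s }, p0/z, [x2]    // Load c
    mov z1.s, s1                    // Splat b[0] in a register
    fmad    z0.s, p0/m, z1.s, z2.s  // Same fmad as above
    st1w    { z0.s }, p0, [x2]      // Store mad in c
    ret
```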