https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91723

            Bug ID: 91723
           Summary: builtin fma is not optimized or vectorized as *+
           Product: gcc
           Version: 10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: nsz at gcc dot gnu.org
  Target Milestone: ---

I'd expect a*b+c to generate the same code as __builtin_fmaf(a,b,c)
when a hw instruction is available for fmaf, but the latter generates
significantly worse code in some cases, e.g. when vectorization is
involved.
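
For a single scalar operation the two forms already match; both of the
following should compile to a single vfmadd...ss instruction with the
flags used below (a minimal sketch, the function names are mine):

float
mul_add (float a, float b, float c)
{
        return a * b + c;                /* contracted to fma under -ffp-contract=fast */
}

float
fma_call (float a, float b, float c)
{
        return __builtin_fmaf (a, b, c); /* expands inline to the hw fma with -mfma */
}

The difference shows up once the loop vectorizer runs.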

Consider:

void
foo (float *restrict r, const float *restrict a,
     const float *restrict b, const float *restrict c)
{
        for (int i=0; i < 4; i++) {
                float x;
#ifdef BUILTIN
                x = __builtin_fmaf(a[i],b[i],c[i]);
                x = __builtin_fmaf(a[i],b[i],x);
#else
                x = a[i]*b[i]+c[i];
                x = a[i]*b[i]+x;
#endif
                r[i] = x;
        }
}

With gcc -O3 -mfma -mavx -ffp-contract=fast -fno-math-errno I get good, vectorized code:

foo:
        vmovups (%rdx), %xmm0
        vmovups (%rsi), %xmm1
        vmovaps %xmm0, %xmm2
        vfmadd213ps     (%rcx), %xmm1, %xmm2
        vfmadd132ps     %xmm1, %xmm2, %xmm0
        vmovups %xmm0, (%rdi)
        ret

but if I add -DBUILTIN I get fully scalar code:

foo:
        vmovss  (%rsi), %xmm0
        vmovss  (%rdx), %xmm1
        vmovaps %xmm0, %xmm2
        vfmadd213ss     (%rcx), %xmm1, %xmm2
        vfmadd132ss     %xmm1, %xmm2, %xmm0
        vmovss  4(%rdx), %xmm1
        vmovss  %xmm0, (%rdi)
        vmovss  4(%rsi), %xmm0
        vmovaps %xmm0, %xmm2
        vfmadd213ss     4(%rcx), %xmm1, %xmm2
        vfmadd132ss     %xmm1, %xmm2, %xmm0
        vmovss  8(%rdx), %xmm1
        vmovss  %xmm0, 4(%rdi)
        vmovss  8(%rsi), %xmm0
        vmovaps %xmm0, %xmm2
        vfmadd213ss     8(%rcx), %xmm1, %xmm2
        vfmadd132ss     %xmm1, %xmm2, %xmm0
        vmovss  12(%rdx), %xmm1
        vmovss  %xmm0, 8(%rdi)
        vmovss  12(%rsi), %xmm0
        vmovaps %xmm0, %xmm2
        vfmadd213ss     12(%rcx), %xmm1, %xmm2
        vfmadd132ss     %xmm1, %xmm2, %xmm0
        vmovss  %xmm0, 12(%rdi)
        ret
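
For comparison, here is a minimal hand-vectorized sketch (using GCC
vector extensions) of what I would expect the builtin variant to be
turned into; foo_vec is a made-up name, and 16-byte alignment is
assumed for brevity (the compiler's own output uses unaligned vmovups):

typedef float v4sf __attribute__ ((vector_size (16)));

void
foo_vec (float *restrict r, const float *restrict a,
         const float *restrict b, const float *restrict c)
{
        v4sf va = *(const v4sf *) a;    /* one 16-byte vector load; alignment assumed */
        v4sf vb = *(const v4sf *) b;
        v4sf vc = *(const v4sf *) c;
        v4sf x = va * vb + vc;          /* contracts to vfmadd...ps under -ffp-contract=fast */
        x = va * vb + x;
        *(v4sf *) r = x;
}

With the same flags this should compile to essentially the two-vfmadd
sequence shown for the non-builtin case.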


I expected identical results; the same happens on other targets.
