https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85115

            Bug ID: 85115
           Summary: Failure to (auto)vectorize sqrtf
           Product: gcc
           Version: 7.3.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: robertw89 at googlemail dot com
  Target Milestone: ---

GCC fails to (auto)vectorize the code below when compiled with the flags

-O3 -mavx

#include <cmath>

// Reproducer: stores sqrtf(a[i]) into res[i] for every i in [0, size).
//   a    - input floats; asserted to be 32-byte aligned below
//   res  - output floats; asserted to be 32-byte aligned below
//   size - number of elements; the loop body never runs when size <= 0
// Both pointers are __restrict__-qualified, i.e. the caller promises that the
// two arrays do not alias.
void simdSqrt(
    float * __restrict__ a,
    float * __restrict__ res,
    int size)
{
        int i;

        // Tell the compiler that both arrays start on a 32-byte boundary
        // (32 bytes is the width of one AVX ymm register; the report builds with -mavx).
        float *aAligned = (float*)__builtin_assume_aligned(a, 32);
        float *resAligned = (float*)__builtin_assume_aligned(res, 32);

        // Element-wise loop with no cross-iteration dependency; this is the
        // loop the report expects to be auto-vectorized.
        for (i = 0; i < size; i++) {
                resAligned[i] = sqrtf(aAligned[i]);
        }
}

This produces the following assembly (as displayed by https://godbolt.org/):

# Compiler output for simdSqrt (Intel operand order: op dst, src).
# Register usage matches the System V AMD64 convention:
#   rdi = a (input cursor), rsi = res (output cursor), edx = size
#   r12 = end-of-input pointer, xmm2 = 0.0f, xmm0 = current input, xmm1 = its sqrt
# The loop is scalar: one float is processed per iteration (vmovss/vsqrtss),
# no packed (ymm) instructions are emitted.
simdSqrt(float*, float*, int):
        # Early out when size <= 0 (signed test).
        test    edx, edx
        jle     .L8
        # eax = size - 1; the 32-bit write zero-extends into rax.
        lea     eax, [rdx-1]
        # r12 is callee-saved, so preserve it before using it as the end pointer.
        push    r12
        # xmm2 = 0.0f, the constant each input is compared against.
        vxorps  xmm2, xmm2, xmm2
        # r12 = a + 4*(size-1) + 4, i.e. the address one past the last input float.
        lea     r12, [rdi+4+rax*4]
        # Reserve 32 bytes of spill space used by the slow path at .L12.
        # Together with the push this keeps rsp 16-byte aligned at the call.
        sub     rsp, 32
.L3:
        # Fast path: load one input, compare 0.0f against it, take its square root.
        vmovss  xmm0, DWORD PTR [rdi]
        vucomiss        xmm2, xmm0
        vsqrtss xmm1, xmm1, xmm0
        # Branch uses the flags from vucomiss (vsqrtss does not write flags):
        # taken when 0.0f > input, i.e. the input is negative.
        ja      .L12
        # Store the inline result and advance both cursors by one float.
        add     rdi, 4
        vmovss  DWORD PTR [rsi], xmm1
        add     rsi, 4
        # Loop until the input cursor reaches the end pointer.
        cmp     rdi, r12
        jne     .L3
.L6:
        # Common epilogue: release the spill area and restore r12.
        add     rsp, 32
        pop     r12
        ret
.L8:
        # size <= 0: nothing was pushed or allocated, return immediately.
        rep ret
.L12:
        # Slow path for a negative input: call the library sqrtf with the input
        # still in xmm0. The call's return value is not used; the value stored
        # below is the vsqrtss result reloaded into xmm1.
        # NOTE(review): presumably the call exists only so the library can report
        # the domain error (errno); confirm against GCC's -fmath-errno behaviour.
        # Spill everything live across the call (all caller-saved): zero constant,
        # output cursor, input cursor and the inline sqrt result.
        vmovss  DWORD PTR [rsp+28], xmm2
        mov     QWORD PTR [rsp+16], rsi
        mov     QWORD PTR [rsp+8], rdi
        vmovss  DWORD PTR [rsp+24], xmm1
        call    sqrtf
        # Reload the spilled state.
        mov     rdi, QWORD PTR [rsp+8]
        mov     rsi, QWORD PTR [rsp+16]
        vmovss  xmm1, DWORD PTR [rsp+24]
        vmovss  xmm2, DWORD PTR [rsp+28]
        # Same store/advance/loop-test sequence as the fast path.
        add     rdi, 4
        vmovss  DWORD PTR [rsi], xmm1
        add     rsi, 4
        cmp     rdi, r12
        jne     .L3
        jmp     .L6

Reply via email to