https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122687

            Bug ID: 122687
           Summary: Vectorization requires -fassociative-math when it
                    shouldn't
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: manu at gcc dot gnu.org
  Target Milestone: ---

The testcase below requires -fassociative-math for vectorization but the
generated code does not actually reassociate the expressions, that is, it
always calculates the differences before the products. I cannot see that any
intermediate pass reassociates the expression. I can imagine that some of the
transformations the vectorizer attempts may need -fassociative-math, but this
one does not.

https://godbolt.org/z/Gaodcacz5

double one_point_hv(const double * restrict x, const double * restrict ref,
unsigned char d)
{
    /* Presumably a hypervolume contribution: the product of the per-axis
       differences prod_{i=0}^{d-1} (ref[i] - x[i]).
       NOTE(review): kept deliberately in this exact form — the report is
       about this shape: every difference (ref[i] - x[i]) is computed before
       it is multiplied in, so vectorizing it should not require reassociating
       the floating-point expression.
       NOTE: element 0 is read unconditionally, so ref[0]/x[0] must be valid
       even when d == 0 (the loop body is then never entered). */
    double hv = ref[0] - x[0];
    for (unsigned char i = 1; i < d; i++)
        hv *= (ref[i] - x[i]);     /* running product of differences */
    return hv;
}


gcc -O3 -march=x86-64-v3 -fopt-info-vec-optimized-missed -fno-signed-zeros
-ffinite-math-only -fno-trapping-math
<source>:5:33: missed: couldn't vectorize loop
<source>:6:19: missed: not vectorized: no vectype for stmt: _6 = *_5;
 scalar_type: const double

"one_point_hv":
        vmovsd  xmm1, QWORD PTR [rsi]
        vsubsd  xmm1, xmm1, QWORD PTR [rdi]
        cmp     dl, 1
        jbe     .L1
        mov     eax, 1
.L3:
        vmovsd  xmm0, QWORD PTR [rsi+rax*8]
        vsubsd  xmm0, xmm0, QWORD PTR [rdi+rax*8]
        add     rax, 1
        vmulsd  xmm1, xmm1, xmm0
        cmp     al, dl
        jb      .L3
.L1:
        vmovapd xmm0, xmm1
        ret

gcc -O3 -march=x86-64-v3 -fopt-info-vec-optimized-missed -fno-signed-zeros
-ffinite-math-only -fno-trapping-math -fassociative-math
<source>:5:33: optimized: loop vectorized using 32 byte vectors and unroll
factor 4


"one_point_hv":
        vmovsd  xmm2, QWORD PTR [rsi]
        mov     r8d, edx
        vsubsd  xmm2, xmm2, QWORD PTR [rdi]
        cmp     dl, 1
        jbe     .L1
        lea     eax, [r8-2]
        cmp     al, 2
        jbe     .L5
        vbroadcastsd    ymm1, QWORD PTR .LC1[rip]
        lea     r9d, [r8-1]
        xor     eax, eax
        mov     ecx, r9d
        shr     cl, 2
        movzx   edx, cl
        sal     rdx, 5
.L4:
        vmovupd ymm0, YMMWORD PTR [rsi+8+rax]
        vsubpd  ymm0, ymm0, YMMWORD PTR [rdi+8+rax]
        add     rax, 32
        vmulpd  ymm1, ymm1, ymm0
        cmp     rax, rdx
        jne     .L4
        vmovapd xmm0, xmm1
        vextractf128    xmm1, ymm1, 0x1
        sal     ecx, 2
        vmulpd  xmm0, xmm0, xmm1
        vpsrldq xmm1, xmm0, 8
        vmulpd  xmm0, xmm0, xmm1
        vmulsd  xmm2, xmm2, xmm0
        cmp     r9b, cl
        je      .L7
        lea     eax, [rcx+1]
        vzeroupper
.L3:
        movzx   edx, al
        vmovsd  xmm0, QWORD PTR [rsi+rdx*8]
        vsubsd  xmm0, xmm0, QWORD PTR [rdi+rdx*8]
        lea     edx, [rax+1]
        vmulsd  xmm2, xmm2, xmm0
        cmp     dl, r8b
        jnb     .L1
        movzx   edx, dl
        add     eax, 2
        vmovsd  xmm0, QWORD PTR [rsi+rdx*8]
        vsubsd  xmm0, xmm0, QWORD PTR [rdi+rdx*8]
        vmulsd  xmm2, xmm2, xmm0
        cmp     al, r8b
        jnb     .L1
        movzx   eax, al
        vmovsd  xmm0, QWORD PTR [rsi+rax*8]
        vsubsd  xmm0, xmm0, QWORD PTR [rdi+rax*8]
        vmulsd  xmm2, xmm2, xmm0
.L1:
        vmovapd xmm0, xmm2
        ret
.L7:
        vzeroupper
        vmovapd xmm0, xmm2
        ret
.L5:
        mov     eax, 1
        jmp     .L3
.LC1:
        .long   0
        .long   1072693248

Reply via email to