https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122687
Bug ID: 122687
Summary: Vectorization requires -fassociative-math when it
shouldn't
Product: gcc
Version: 16.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: manu at gcc dot gnu.org
Target Milestone: ---
The testcase below requires -fassociative-math to be vectorized, but the
generated code does not actually reassociate the expressions: it still computes
each difference before multiplying it into the product. I cannot see any
intermediate pass that reassociates the expression either. I can imagine that some
transformations the vectorizer might attempt would need -fassociative-math, but
not this one (see the sketch after the testcase below).
https://godbolt.org/z/Gaodcacz5
double one_point_hv(const double * restrict x, const double * restrict ref,
                    unsigned char d)
{
  double hv = ref[0] - x[0];
  for (unsigned char i = 1; i < d; i++)
    hv *= (ref[i] - x[i]);
  return hv;
}
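For reference, here is a scalar sketch (illustration only, not something GCC
generates) of the blocked form that a 4-lane vectorization of this product
reduction corresponds to: each lane keeps its own partial product, and the
differences are still formed before any multiplication.

/* Hypothetical scalar rendering of a 4-lane vectorization of the product
   reduction (illustration only).  Each lane keeps its own partial product
   and the differences are still computed before the products. */
double one_point_hv_blocked(const double * restrict x, const double * restrict ref,
                            unsigned char d)
{
  double hv = ref[0] - x[0];
  double p[4] = { 1.0, 1.0, 1.0, 1.0 };      /* per-lane accumulators */
  unsigned i = 1;
  for (; i + 4 <= d; i += 4)                 /* main loop, 4 elements per step */
    for (unsigned l = 0; l < 4; l++)
      p[l] *= (ref[i + l] - x[i + l]);
  hv *= p[0] * p[1] * p[2] * p[3];           /* combine partial products */
  for (; i < d; i++)                         /* scalar epilogue */
    hv *= (ref[i] - x[i]);
  return hv;
}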
gcc -O3 -march=x86-64-v3 -fopt-info-vec-optimized-missed -fno-signed-zeros
-ffinite-math-only -fno-trapping-math
<source>:5:33: missed: couldn't vectorize loop
<source>:6:19: missed: not vectorized: no vectype for stmt: _6 = *_5;
scalar_type: const double
"one_point_hv":
        vmovsd  xmm1, QWORD PTR [rsi]
        vsubsd  xmm1, xmm1, QWORD PTR [rdi]
        cmp     dl, 1
        jbe     .L1
        mov     eax, 1
.L3:
        vmovsd  xmm0, QWORD PTR [rsi+rax*8]
        vsubsd  xmm0, xmm0, QWORD PTR [rdi+rax*8]
        add     rax, 1
        vmulsd  xmm1, xmm1, xmm0
        cmp     al, dl
        jb      .L3
.L1:
        vmovapd xmm0, xmm1
        ret
gcc -O3 -march=x86-64-v3 -fopt-info-vec-optimized-missed -fno-signed-zeros
-ffinite-math-only -fno-trapping-math -fassociative-math
<source>:5:33: optimized: loop vectorized using 32 byte vectors and unroll
factor 4
"one_point_hv":
        vmovsd  xmm2, QWORD PTR [rsi]
        mov     r8d, edx
        vsubsd  xmm2, xmm2, QWORD PTR [rdi]
        cmp     dl, 1
        jbe     .L1
        lea     eax, [r8-2]
        cmp     al, 2
        jbe     .L5
        vbroadcastsd    ymm1, QWORD PTR .LC1[rip]
        lea     r9d, [r8-1]
        xor     eax, eax
        mov     ecx, r9d
        shr     cl, 2
        movzx   edx, cl
        sal     rdx, 5
.L4:
        vmovupd ymm0, YMMWORD PTR [rsi+8+rax]
        vsubpd  ymm0, ymm0, YMMWORD PTR [rdi+8+rax]
        add     rax, 32
        vmulpd  ymm1, ymm1, ymm0
        cmp     rax, rdx
        jne     .L4
        vmovapd xmm0, xmm1
        vextractf128    xmm1, ymm1, 0x1
        sal     ecx, 2
        vmulpd  xmm0, xmm0, xmm1
        vpsrldq xmm1, xmm0, 8
        vmulpd  xmm0, xmm0, xmm1
        vmulsd  xmm2, xmm2, xmm0
        cmp     r9b, cl
        je      .L7
        lea     eax, [rcx+1]
        vzeroupper
.L3:
        movzx   edx, al
        vmovsd  xmm0, QWORD PTR [rsi+rdx*8]
        vsubsd  xmm0, xmm0, QWORD PTR [rdi+rdx*8]
        lea     edx, [rax+1]
        vmulsd  xmm2, xmm2, xmm0
        cmp     dl, r8b
        jnb     .L1
        movzx   edx, dl
        add     eax, 2
        vmovsd  xmm0, QWORD PTR [rsi+rdx*8]
        vsubsd  xmm0, xmm0, QWORD PTR [rdi+rdx*8]
        vmulsd  xmm2, xmm2, xmm0
        cmp     al, r8b
        jnb     .L1
        movzx   eax, al
        vmovsd  xmm0, QWORD PTR [rsi+rax*8]
        vsubsd  xmm0, xmm0, QWORD PTR [rdi+rax*8]
        vmulsd  xmm2, xmm2, xmm0
.L1:
        vmovapd xmm0, xmm2
        ret
.L7:
        vzeroupper
        vmovapd xmm0, xmm2
        ret
.L5:
        mov     eax, 1
        jmp     .L3
.LC1:
        .long   0
        .long   1072693248
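For completeness: .LC1 encodes the double constant 1.0 (low word 0, high word
1072693248 = 0x3FF00000), broadcast to initialize the vector accumulator ymm1;
the .L4 loop multiplies four differences per iteration into it, the lanes are
combined afterwards (vextractf128/vpsrldq/vmulpd/vmulsd), and .L3 handles the
remaining elements. An AVX-intrinsics sketch of that structure (my own
rendering, not GCC output) would look roughly like this:

#include <immintrin.h>

double one_point_hv_avx(const double * restrict x, const double * restrict ref,
                        unsigned char d)
{
  double hv = ref[0] - x[0];
  __m256d acc = _mm256_set1_pd(1.0);           /* vbroadcastsd .LC1 */
  unsigned i = 1;
  for (; i + 4 <= d; i += 4) {                 /* .L4: 4 differences/products per step */
    __m256d diff = _mm256_sub_pd(_mm256_loadu_pd(ref + i), _mm256_loadu_pd(x + i));
    acc = _mm256_mul_pd(acc, diff);
  }
  /* Horizontal product of the 4 lanes. */
  __m128d lo = _mm256_castpd256_pd128(acc);
  __m128d hi = _mm256_extractf128_pd(acc, 1);
  __m128d p  = _mm_mul_pd(lo, hi);
  p = _mm_mul_sd(p, _mm_unpackhi_pd(p, p));
  hv *= _mm_cvtsd_f64(p);
  for (; i < d; i++)                           /* .L3: scalar epilogue */
    hv *= (ref[i] - x[i]);
  return hv;
}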