https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107009
Bug ID: 107009
Summary: massive unnecessary code blowup in vectorizer
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: drepper.fsp+rhbz at gmail dot com
Target Milestone: ---
Given an annotated saxpy function:
#include <cstdlib>
void saxpy(size_t n, float* __restrict res, float a, const float* __restrict x,
const float* __restrict y)
{
if (n == 0 || n % 8 != 0)
__builtin_unreachable();
res = (float*)__builtin_assume_aligned(res, 32);
x = (const float*) __builtin_assume_aligned(x, 32);
y = (const float*) __builtin_assume_aligned(y, 32);
for (size_t i = 0; i < n; ++i)
res[i] = a * x[i] + y[i];
}
Compiling this with the gcc 12.2.1 version from Fedora 36 leads to the
expected, guided result (although the shrq isn't necessary…) with -O3:
_Z5saxpymPffPKfS1_:
.cfi_startproc
shrq $3, %rdi
vbroadcastss %xmm0, %ymm0
xorl %eax, %eax
salq $5, %rdi
.p2align 4
.p2align 3
.L2:
vmovaps (%rdx,%rax), %ymm1
vfmadd213ps (%rcx,%rax), %ymm0, %ymm1
vmovaps %ymm1, (%rsi,%rax)
addq $32, %rax
cmpq %rdi, %rax
jne .L2
vzeroupper
ret
With the current trunk gcc the result is massively bigger, and given the guidance
in the sources none of the extra code is necessary.
_Z5saxpymPffPKfS1_:
.LFB22:
.cfi_startproc
movq %rdi, %r8
movq %rdx, %rdi
movq %rcx, %rdx
leaq -1(%r8), %rax
cmpq $6, %rax
jbe .L7
movq %r8, %rcx
vbroadcastss %xmm0, %ymm2
xorl %eax, %eax
shrq $3, %rcx
salq $5, %rcx
.p2align 4
.p2align 3
.L3:
vmovaps (%rdi,%rax), %ymm1
vfmadd213ps (%rdx,%rax), %ymm2, %ymm1
vmovaps %ymm1, (%rsi,%rax)
addq $32, %rax
cmpq %rcx, %rax
jne .L3
movq %r8, %rax
andq $-8, %rax
testb $7, %r8b
je .L18
vzeroupper
.L2:
movq %r8, %rcx
subq %rax, %rcx
leaq -1(%rcx), %r9
cmpq $2, %r9
jbe .L5
vmovaps (%rdx,%rax,4), %xmm3
vshufps $0, %xmm0, %xmm0, %xmm1
movq %rcx, %r9
vfmadd132ps (%rdi,%rax,4), %xmm3, %xmm1
andq $-4, %r9
vmovaps %xmm1, (%rsi,%rax,4)
addq %r9, %rax
andl $3, %ecx
je .L16
.L5:
vmovss (%rdi,%rax,4), %xmm1
leaq 0(,%rax,4), %rcx
leaq 1(%rax), %r9
vfmadd213ss (%rdx,%rax,4), %xmm0, %xmm1
vmovss %xmm1, (%rsi,%rcx)
cmpq %r8, %r9
jnb .L16
vmovss 4(%rdi,%rcx), %xmm1
addq $2, %rax
vfmadd213ss 4(%rdx,%rcx), %xmm0, %xmm1
vmovss %xmm1, 4(%rsi,%rcx)
cmpq %r8, %rax
jnb .L16
vmovss 8(%rdx,%rcx), %xmm4
vfmadd132ss 8(%rdi,%rcx), %xmm4, %xmm0
vmovss %xmm0, 8(%rsi,%rcx)
.L16:
ret
.p2align 4
.p2align 3
.L18:
vzeroupper
ret
.p2align 4
.p2align 3
.L7:
xorl %eax, %eax
jmp .L2