https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112891
Bug ID: 112891
Summary: [10/11/12/13/14 Regression] Missing vzeroupper insert.
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: liuhongt at gcc dot gnu.org
Target Milestone: ---
#include <math.h>

void
__attribute__((noinline))
bar (double* a)
{
  a[0] = 1.0;
  a[1] = 2.0;
}

void
__attribute__((noinline))
foo (double* __restrict a, double* b)
{
  a[0] += b[0];
  a[1] += b[1];
  a[2] += b[2];
  a[3] += b[3];
  bar (b);
}

double
foo1 (double* __restrict a, double* b)
{
  foo (a, b);
  return exp (b[1]);
}
Compiled with gcc -O3 -mavx2, the following code is generated:
bar(double*):
        vmovapd xmm0, XMMWORD PTR .LC0[rip]
        vmovupd XMMWORD PTR [rdi], xmm0
        ret
foo(double*, double*):
        mov     rax, rdi
        vmovupd ymm0, YMMWORD PTR [rsi]
        mov     rdi, rsi
        vaddpd  ymm0, ymm0, YMMWORD PTR [rax]
        vmovupd YMMWORD PTR [rax], ymm0
        jmp     bar(double*)
foo1(double*, double*):
        sub     rsp, 8
        call    foo(double*, double*)
        vmovsd  xmm0, QWORD PTR [rsi+8]
        add     rsp, 8
        jmp     exp
.LC0:
        .long   0
        .long   1072693248
        .long   0
        .long   1073741824
In foo, 256-bit ymm registers are used and their upper bits are left dirty, but
no vzeroupper is inserted before the tail call to exp, which causes a big
AVX->SSE transition penalty.
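
For reference, a minimal sketch of where one would expect the missing
instruction, assuming the vzeroupper pass handles the dirty upper state at the
tail call in foo (the exact insertion point is an assumption, not actual
compiler output):

foo(double*, double*):
        mov     rax, rdi
        vmovupd ymm0, YMMWORD PTR [rsi]
        mov     rdi, rsi
        vaddpd  ymm0, ymm0, YMMWORD PTR [rax]
        vmovupd YMMWORD PTR [rax], ymm0
        vzeroupper                      # clear dirty upper bits before leaving foo
        jmp     bar(double*)

Placing the vzeroupper in foo1 before the jmp to exp would also avoid the
penalty; either way the dirty upper state needs to be cleared before the legacy
SSE code in exp runs.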