https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122219
--- Comment #11 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Pengfei Li from comment #5)
> I might have reduced it too far. The original code was written with x86
> intrinsics and compiled against the SIMDe (SIMD Everywhere) library. A case
> closer to the original (like below) doesn't have undefined data.
>
> #define SIMDE_ENABLE_NATIVE_ALIASES
>
> #include "simde/x86/avx2.h"
>
> void foo(__m256& v, unsigned int n) {
>   __m128 f0 = {1.0f, 2.0f, 3.0f, 4.0f};
>   __m128 f1 = {5.0f, 6.0f, 7.0f, 8.0f};
>   for (int i = 0; i < n; i++) {
>     f0 = f0 + f0;
>     f1 = f1 + f1;
>     v = _mm256_castps128_ps256(f0);
>     v = _mm256_insertf128_ps(v, f1, 1);
>   }
> }
>
> Even with -fstack-reuse=none, the stores don’t sink.
>
> I acknowledge the workload code isn’t well written. We could either move the
> assignments to v out of the loop or use _mm256_set_m128 instead of the
> cast + insert. However, we also observed that LLVM can sink the stores even
> for this case, so perhaps there's still room for optimization.
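
For reference, the _mm256_set_m128 variant mentioned above would be roughly
the following (untested sketch; _mm256_set_m128 takes the high half as its
first argument, so f1 ends up in the upper 128 bits):

void foo(__m256& v, unsigned int n) {
  __m128 f0 = {1.0f, 2.0f, 3.0f, 4.0f};
  __m128 f1 = {5.0f, 6.0f, 7.0f, 8.0f};
  for (int i = 0; i < n; i++) {
    f0 = f0 + f0;
    f1 = f1 + f1;
    // build the full 256-bit value with a single intrinsic instead of
    // the cast + insert pair
    v = _mm256_set_m128(f1, f0);
  }
}
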
I'll note that with x86intrin.h the store is sunk.
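That is, roughly the following (a reconstruction, the reproducer above with
the SIMDe include replaced by x86intrin.h; not necessarily the exact source
compiled):

#include <x86intrin.h>

void foo(__m256& v, unsigned int n) {
  __m128 f0 = {1.0f, 2.0f, 3.0f, 4.0f};
  __m128 f1 = {5.0f, 6.0f, 7.0f, 8.0f};
  for (int i = 0; i < n; i++) {
    f0 = f0 + f0;
    f1 = f1 + f1;
    v = _mm256_castps128_ps256(f0);
    v = _mm256_insertf128_ps(v, f1, 1);
  }
}

The generated assembly: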
_Z3fooRDv8_fj:
.LFB7470:
.cfi_startproc
testl %esi, %esi
je .L8
vmovaps .LC0(%rip), %xmm1
vmovaps .LC1(%rip), %xmm0
xorl %eax, %eax
.p2align 4
.p2align 4
.p2align 3
.L3:
addl $1, %eax
vaddps %xmm0, %xmm0, %xmm0
vaddps %xmm1, %xmm1, %xmm1
cmpl %esi, %eax
jne .L3
vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
vmovaps %ymm0, (%rdi)
vzeroupper
.L8:
ret