https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97352
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
So a simpler testcase is the following (though it hints at a solution,
splitting the load group, that is possibly not generic enough):

    double a[6], b[6];
    void foo()
    {
      a[0] = b[0];
      a[1] = b[1];
      a[2] = b[2];
      a[3] = b[3];
      a[4] = b[4];
      a[5] = b[5];
    }

which produces with SSE:

    movapd  b(%rip), %xmm0
    movapd  b+16(%rip), %xmm1
    movapd  b+32(%rip), %xmm2
    movaps  %xmm0, a(%rip)
    movaps  %xmm1, a+16(%rip)
    movaps  %xmm2, a+32(%rip)

and with AVX:

    vmovsd  b+32(%rip), %xmm0
    vmovapd b(%rip), %ymm1
    vmovsd  %xmm0, a+32(%rip)
    vmovsd  b+40(%rip), %xmm0
    vmovapd %ymm1, a(%rip)
    vmovsd  %xmm0, a+40(%rip)

while we'd like to see something like:

    vmovapd b(%rip), %ymm1
    vmovapd %ymm1, a(%rip)
    movapd  b+32(%rip), %xmm2
    movaps  %xmm2, a+32(%rip)