https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49869
--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
On the trunk I only see one copy of the loop:
# Loop body, x86-64 SSE, AT&T operand order (op src, dst).
# Each iteration reads four packed single-precision floats from each of four
# addresses (%rbx, %rbp, %r9, %r11, all indexed by the byte offset %rax),
# forms a packed product, then adds the product's four lanes one at a time,
# lane 0 first, into the scalar running sum kept in the low lane of %xmm2.
# %xmm6 is only read here (never written inside the loop).
# NOTE(review): what the four base pointers and %xmm6 represent in the
# original source is not visible in this listing - confirm against the
# test case attached to the report.
.L11:
movups (%rbx,%rax), %xmm7          # xmm7 = 16 bytes at rbx+rax (unaligned load)
movups 0(%rbp,%rax), %xmm0         # xmm0 = 16 bytes at rbp+rax
movups (%r9,%rax), %xmm1           # xmm1 = 16 bytes at r9+rax
subps %xmm7, %xmm0                 # xmm0 = [rbp+rax] - [rbx+rax], per lane
movups (%r11,%rax), %xmm7          # xmm7 reused: 16 bytes at r11+rax (still the old offset)
addq $16, %rax                     # advance the byte offset by one 4-float vector
addps %xmm7, %xmm1                 # xmm1 = [r9+rax] + [r11+rax], per lane
mulps %xmm6, %xmm0                 # scale the difference by xmm6
mulps %xmm1, %xmm0                 # xmm0 = packed product p0..p3
movaps %xmm0, %xmm1                # xmm1 low lane = p0
addss %xmm2, %xmm1                 # xmm1 = running sum (carried in xmm2) + p0
movaps %xmm0, %xmm2
shufps $85, %xmm0, %xmm2           # imm 0x55: every lane of xmm2 = p1
addss %xmm2, %xmm1                 # ... + p1
movaps %xmm0, %xmm2
unpckhps %xmm0, %xmm2              # interleave high halves: low lane of xmm2 = p2
shufps $255, %xmm0, %xmm0          # imm 0xFF: every lane of xmm0 = p3
addss %xmm2, %xmm1                 # ... + p2
movaps %xmm1, %xmm2                # move the partial sum back into the accumulator register
addss %xmm0, %xmm2                 # ... + p3; xmm2 low lane = updated running sum
cmpq %rax, %r10                    # has the offset reached the bound held in r10?
jne .L11                           # no: run another iteration