On Tue, Jul 31, 2018 at 05:39:37AM -0700, H.J. Lu wrote:
> For
> 
> ---
> #define N 16
> float f[N];
> double d[N];
> int n[N];
> 
> __attribute__((noinline)) void
> f3 (void)
> {
>   int i;
>   for (i = 0; i < N; i++)
>     d[i] = f[i];
> }
> ---
> 
> r263067 improved -O3 -mavx2 -mtune=generic -m64 from
> 
> .cfi_startproc
> vmovaps f(%rip), %xmm2
> vmovaps f+32(%rip), %xmm3
> vinsertf128 $0x1, f+16(%rip), %ymm2, %ymm0
> vcvtps2pd %xmm0, %ymm1
> vextractf128 $0x1, %ymm0, %xmm0
> vmovaps %xmm1, d(%rip)
> vextractf128 $0x1, %ymm1, d+16(%rip)
> vcvtps2pd %xmm0, %ymm0
> vmovaps %xmm0, d+32(%rip)
> vextractf128 $0x1, %ymm0, d+48(%rip)
> vinsertf128 $0x1, f+48(%rip), %ymm3, %ymm0
> vcvtps2pd %xmm0, %ymm1
> vextractf128 $0x1, %ymm0, %xmm0
> vmovaps %xmm1, d+64(%rip)
> vextractf128 $0x1, %ymm1, d+80(%rip)
> vcvtps2pd %xmm0, %ymm0
> vmovaps %xmm0, d+96(%rip)
> vextractf128 $0x1, %ymm0, d+112(%rip)
> vzeroupper
> ret
> .cfi_endproc
> 
> to
> 
> .cfi_startproc
> vcvtps2pd f(%rip), %ymm0
> vmovaps %xmm0, d(%rip)
> vextractf128 $0x1, %ymm0, d+16(%rip)
> vcvtps2pd f+16(%rip), %ymm0
> vmovaps %xmm0, d+32(%rip)
> vextractf128 $0x1, %ymm0, d+48(%rip)
> vcvtps2pd f+32(%rip), %ymm0
> vextractf128 $0x1, %ymm0, d+80(%rip)
> vmovaps %xmm0, d+64(%rip)
> vcvtps2pd f+48(%rip), %ymm0
> vextractf128 $0x1, %ymm0, d+112(%rip)
> vmovaps %xmm0, d+96(%rip)
> vzeroupper
> ret
> .cfi_endproc

I cannot really read AVX, but that looks like better code all right :-)


Segher

Reply via email to