On Tue, Jul 31, 2018 at 05:39:37AM -0700, H.J. Lu wrote:
> For
>
> ---
> #define N 16
> float f[N];
> double d[N];
> int n[N];
>
> __attribute__((noinline)) void
> f3 (void)
> {
>   int i;
>   for (i = 0; i < N; i++)
>     d[i] = f[i];
> }
> ---
>
> r263067 improved -O3 -mavx2 -mtune=generic -m64 from
>
>         .cfi_startproc
>         vmovaps f(%rip), %xmm2
>         vmovaps f+32(%rip), %xmm3
>         vinsertf128 $0x1, f+16(%rip), %ymm2, %ymm0
>         vcvtps2pd %xmm0, %ymm1
>         vextractf128 $0x1, %ymm0, %xmm0
>         vmovaps %xmm1, d(%rip)
>         vextractf128 $0x1, %ymm1, d+16(%rip)
>         vcvtps2pd %xmm0, %ymm0
>         vmovaps %xmm0, d+32(%rip)
>         vextractf128 $0x1, %ymm0, d+48(%rip)
>         vinsertf128 $0x1, f+48(%rip), %ymm3, %ymm0
>         vcvtps2pd %xmm0, %ymm1
>         vextractf128 $0x1, %ymm0, %xmm0
>         vmovaps %xmm1, d+64(%rip)
>         vextractf128 $0x1, %ymm1, d+80(%rip)
>         vcvtps2pd %xmm0, %ymm0
>         vmovaps %xmm0, d+96(%rip)
>         vextractf128 $0x1, %ymm0, d+112(%rip)
>         vzeroupper
>         ret
>         .cfi_endproc
>
> to
>
>         .cfi_startproc
>         vcvtps2pd f(%rip), %ymm0
>         vmovaps %xmm0, d(%rip)
>         vextractf128 $0x1, %ymm0, d+16(%rip)
>         vcvtps2pd f+16(%rip), %ymm0
>         vmovaps %xmm0, d+32(%rip)
>         vextractf128 $0x1, %ymm0, d+48(%rip)
>         vcvtps2pd f+32(%rip), %ymm0
>         vextractf128 $0x1, %ymm0, d+80(%rip)
>         vmovaps %xmm0, d+64(%rip)
>         vcvtps2pd f+48(%rip), %ymm0
>         vextractf128 $0x1, %ymm0, d+112(%rip)
>         vmovaps %xmm0, d+96(%rip)
>         vzeroupper
>         ret
>         .cfi_endproc

I cannot really read AVX, but that looks like better code alright :-)

Segher