https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97428
Bug ID: 97428
Summary: -O3 is great for basic AoSoA packing of complex
arrays, but horrible one step above the basic
Product: gcc
Version: 10.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: already5chosen at yahoo dot com
Target Milestone: ---
This is my next example of bad handling of an AoSoA layout by the gcc
optimizer/vectorizer.
For a discussion of AoSoA see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97343
The issue at hand is the transformation (packing) of an AoS array of complex
numbers into AoSoA format.
Compiler used: gcc 10.2
Target: AVX2 (Skylake)
Part 1.
typedef struct { double re, im; } dcmlx_t;
typedef struct { double re[4], im[4]; } dcmlx4_t;
void foo(dcmlx4_t dst[], const dcmlx_t src[], int n)
{
  for (int i = 0; i < n; ++i) {
    dcmlx_t s00 = src[i*4+0];
    dcmlx_t s01 = src[i*4+1];
    dcmlx_t s02 = src[i*4+2];
    dcmlx_t s03 = src[i*4+3];
    dst[i].re[0] = s00.re;
    dst[i].re[1] = s01.re;
    dst[i].re[2] = s02.re;
    dst[i].re[3] = s03.re;
    dst[i].im[0] = s00.im;
    dst[i].im[1] = s01.im;
    dst[i].im[2] = s02.im;
    dst[i].im[3] = s03.im;
  }
}
-march=skylake -O2 produces the following inner loop:
.L3:
vmovsd (%rdx), %xmm7
vmovsd 8(%rdx), %xmm3
vmovsd 16(%rdx), %xmm6
vmovsd 24(%rdx), %xmm2
vmovsd 32(%rdx), %xmm5
vmovsd 40(%rdx), %xmm1
vmovsd 48(%rdx), %xmm4
vmovsd 56(%rdx), %xmm0
addq $64, %rdx
vmovsd %xmm7, (%rcx)
vmovsd %xmm6, 8(%rcx)
vmovsd %xmm5, 16(%rcx)
vmovsd %xmm4, 24(%rcx)
vmovsd %xmm3, 32(%rcx)
vmovsd %xmm2, 40(%rcx)
vmovsd %xmm1, 48(%rcx)
vmovsd %xmm0, 56(%rcx)
addq $64, %rcx
cmpq %rax, %rdx
jne .L3
Quite reasonable for a non-vectorizing optimization level. It would be possible
to save one instruction by using indexed addressing, but in the majority of
situations that wouldn't be any faster.
-march=skylake -O3 inner loop:
.L3:
vmovupd (%rdx,%rax), %ymm0
vmovupd 32(%rdx,%rax), %ymm2
vunpcklpd %ymm2, %ymm0, %ymm1
vunpckhpd %ymm2, %ymm0, %ymm0
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm0, %ymm0
vmovupd %ymm1, (%rcx,%rax)
vmovupd %ymm0, 32(%rcx,%rax)
addq $64, %rax
cmpq %r8, %rax
jne .L3
That's excellent. It doesn't just look better: according to my measurements, with
the source array in external memory and the destination in the L1/L2 cache it is
actually ~1.5x faster than the -O2 code, which is not a small feat.
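For reference, here is my reading of what the -O3 inner loop above does,
expressed with AVX2 intrinsics. This is only an illustrative sketch of the
generated shuffle pattern (the name foo_avx2 is mine), not compiler output;
compile with -mavx2.

#include <immintrin.h>

typedef struct { double re, im; } dcmlx_t;
typedef struct { double re[4], im[4]; } dcmlx4_t;

void foo_avx2(dcmlx4_t dst[], const dcmlx_t src[], int n)
{
  for (int i = 0; i < n; ++i) {
    __m256d ab = _mm256_loadu_pd(&src[i*4+0].re);  /* re0 im0 re1 im1 */
    __m256d cd = _mm256_loadu_pd(&src[i*4+2].re);  /* re2 im2 re3 im3 */
    __m256d re = _mm256_unpacklo_pd(ab, cd);       /* re0 re2 re1 re3 */
    __m256d im = _mm256_unpackhi_pd(ab, cd);       /* im0 im2 im1 im3 */
    re = _mm256_permute4x64_pd(re, 0xD8);          /* re0 re1 re2 re3 */
    im = _mm256_permute4x64_pd(im, 0xD8);          /* im0 im1 im2 im3 */
    _mm256_storeu_pd(dst[i].re, re);
    _mm256_storeu_pd(dst[i].im, im);
  }
}

Two loads, two unpacks, two cross-lane permutes and two stores per dcmlx4_t -
that is the pattern I would hope to see in Part 2 as well.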
Part 2.
A slightly more involved case: now we want to interleave 2 rows of the source
matrix.
Sometimes an interleaved layout is desirable because it improves locality of
access for the rest of the processing, and it can also reduce pressure on the
GPRs that are used as pointers or indices.
typedef struct { double re, im; } dcmlx_t;
typedef struct { double re[4], im[4]; } dcmlx4_t;
void foo_i2(dcmlx4_t dst[], const dcmlx_t src[], int n)
{
  for (int i = 0; i < n; ++i) {
    dcmlx_t s00 = src[i*4+0];
    dcmlx_t s01 = src[i*4+1];
    dcmlx_t s02 = src[i*4+2];
    dcmlx_t s03 = src[i*4+3];
    dcmlx_t s10 = src[i*4+0+n];
    dcmlx_t s11 = src[i*4+1+n];
    dcmlx_t s12 = src[i*4+2+n];
    dcmlx_t s13 = src[i*4+3+n];
    dst[i*2+0].re[0] = s00.re;
    dst[i*2+0].re[1] = s01.re;
    dst[i*2+0].re[2] = s02.re;
    dst[i*2+0].re[3] = s03.re;
    dst[i*2+0].im[0] = s00.im;
    dst[i*2+0].im[1] = s01.im;
    dst[i*2+0].im[2] = s02.im;
    dst[i*2+0].im[3] = s03.im;
    dst[i*2+1].re[0] = s10.re;
    dst[i*2+1].re[1] = s11.re;
    dst[i*2+1].re[2] = s12.re;
    dst[i*2+1].re[3] = s13.re;
    dst[i*2+1].im[0] = s10.im;
    dst[i*2+1].im[1] = s11.im;
    dst[i*2+1].im[2] = s12.im;
    dst[i*2+1].im[3] = s13.im;
  }
}
-march=skylake -O2 produces the following inner loop:
.L3:
vmovsd (%rdx), %xmm15
vmovsd 8(%rdx), %xmm11
vmovsd 16(%rdx), %xmm14
vmovsd 24(%rdx), %xmm10
vmovsd 32(%rdx), %xmm13
vmovsd 40(%rdx), %xmm9
vmovsd 48(%rdx), %xmm12
vmovsd 56(%rdx), %xmm8
vmovsd (%rax), %xmm7
vmovsd 8(%rax), %xmm3
vmovsd 16(%rax), %xmm6
vmovsd 24(%rax), %xmm2
vmovsd 32(%rax), %xmm5
vmovsd 40(%rax), %xmm1
vmovsd 48(%rax), %xmm4
vmovsd 56(%rax), %xmm0
subq $-128, %rcx
vmovsd %xmm15, -128(%rcx)
vmovsd %xmm14, -120(%rcx)
vmovsd %xmm13, -112(%rcx)
vmovsd %xmm12, -104(%rcx)
vmovsd %xmm11, -96(%rcx)
vmovsd %xmm10, -88(%rcx)
vmovsd %xmm9, -80(%rcx)
vmovsd %xmm8, -72(%rcx)
vmovsd %xmm7, -64(%rcx)
vmovsd %xmm6, -56(%rcx)
vmovsd %xmm5, -48(%rcx)
vmovsd %xmm4, -40(%rcx)
vmovsd %xmm3, -32(%rcx)
vmovsd %xmm2, -24(%rcx)
vmovsd %xmm1, -16(%rcx)
vmovsd %xmm0, -8(%rcx)
addq $64, %rdx
addq $64, %rax
cmpq %rcx, %r8
jne .L3
Once again, in the absence of the vectorizer this is very reasonable.
But maybe the vectorizer can do better, as it did in Part 1?
-march=skylake -O3 inner loop:
.L4:
vmovupd (%rcx), %ymm5
vmovupd 64(%rcx), %ymm4
vunpcklpd 32(%rcx), %ymm5, %ymm3
vunpckhpd 32(%rcx), %ymm5, %ymm1
vmovupd 128(%rcx), %ymm5
vmovupd 192(%rcx), %ymm7
vunpcklpd 160(%rcx), %ymm5, %ymm0
vunpckhpd 160(%rcx), %ymm5, %ymm2
vmovupd 192(%rcx), %ymm5
vunpcklpd 96(%rcx), %ymm4, %ymm6
vunpcklpd 224(%rcx), %ymm5, %ymm5
vunpckhpd 96(%rcx), %ymm4, %ymm4
vunpckhpd 224(%rcx), %ymm7, %ymm7
vpermpd $216, %ymm5, %ymm5
vpermpd $216, %ymm0, %ymm0
vpermpd $216, %ymm3, %ymm3
vpermpd $216, %ymm4, %ymm4
vpermpd $216, %ymm7, %ymm7
vunpcklpd %ymm5, %ymm0, %ymm8
vpermpd $216, %ymm1, %ymm1
vunpckhpd %ymm5, %ymm0, %ymm0
vpermpd $216, %ymm6, %ymm6
vpermpd $216, %ymm2, %ymm2
vunpcklpd %ymm6, %ymm3, %ymm15
vunpcklpd %ymm4, %ymm1, %ymm5
vunpckhpd %ymm6, %ymm3, %ymm6
vunpckhpd %ymm4, %ymm1, %ymm1
vpermpd $216, %ymm0, %ymm3
vunpcklpd %ymm7, %ymm2, %ymm0
vunpckhpd %ymm7, %ymm2, %ymm2
vpermpd $216, %ymm0, %ymm0
vpermpd $216, %ymm2, %ymm2
vpermpd $216, %ymm6, %ymm6
vpermpd $216, %ymm5, %ymm5
vpermpd $216, %ymm1, %ymm1
vunpcklpd %ymm0, %ymm5, %ymm13
vunpcklpd %ymm2, %ymm1, %ymm12
vunpckhpd %ymm0, %ymm5, %ymm5
vunpckhpd %ymm2, %ymm1, %ymm1
vunpcklpd %ymm3, %ymm6, %ymm0
vmovupd (%rdx), %ymm2
vunpckhpd %ymm3, %ymm6, %ymm6
vmovupd 64(%rdx), %ymm3
vunpcklpd 32(%rdx), %ymm2, %ymm2
vpermpd $216, %ymm1, %ymm4
vunpcklpd 96(%rdx), %ymm3, %ymm1
vmovupd 128(%rdx), %ymm3
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm2, %ymm2
vunpcklpd %ymm1, %ymm2, %ymm2
vunpcklpd 160(%rdx), %ymm3, %ymm1
vmovupd 192(%rdx), %ymm3
vpermpd $216, %ymm1, %ymm1
vunpcklpd 224(%rdx), %ymm3, %ymm3
vmovupd 64(%rdx), %ymm7
vpermpd $216, %ymm3, %ymm3
vunpcklpd %ymm3, %ymm1, %ymm1
vmovupd (%rdx), %ymm3
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm2, %ymm2
vmovupd %ymm4, (%rsp)
vunpcklpd %ymm1, %ymm2, %ymm2
vunpckhpd 32(%rdx), %ymm3, %ymm4
vunpckhpd 96(%rdx), %ymm7, %ymm1
vmovupd 128(%rdx), %ymm3
vmovupd 192(%rdx), %ymm7
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm4, %ymm4
vunpcklpd %ymm1, %ymm4, %ymm4
vunpckhpd 160(%rdx), %ymm3, %ymm1
vunpckhpd 224(%rdx), %ymm7, %ymm3
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm3, %ymm3
vunpcklpd %ymm3, %ymm1, %ymm1
vmovupd (%r11), %ymm3
vpermpd $216, %ymm1, %ymm1
vunpcklpd 32(%r11), %ymm3, %ymm9
vmovupd 64(%r11), %ymm7
vpermpd $216, %ymm4, %ymm4
vunpcklpd %ymm1, %ymm4, %ymm4
vunpcklpd 96(%r11), %ymm7, %ymm1
vmovupd 128(%r11), %ymm3
vmovupd 192(%r11), %ymm7
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm9, %ymm9
vunpcklpd %ymm1, %ymm9, %ymm9
vunpcklpd 160(%r11), %ymm3, %ymm1
vunpcklpd 224(%r11), %ymm7, %ymm3
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm3, %ymm3
vmovupd 64(%r11), %ymm7
vunpcklpd %ymm3, %ymm1, %ymm1
vmovupd (%r11), %ymm3
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm9, %ymm9
vunpckhpd 32(%r11), %ymm3, %ymm3
vunpcklpd %ymm1, %ymm9, %ymm9
vunpckhpd 96(%r11), %ymm7, %ymm1
vmovupd 128(%r11), %ymm7
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm3, %ymm3
vunpcklpd %ymm1, %ymm3, %ymm3
vunpckhpd 160(%r11), %ymm7, %ymm1
vmovupd 192(%r11), %ymm7
vpermpd $216, %ymm1, %ymm1
vunpckhpd 224(%r11), %ymm7, %ymm7
vpermpd $216, %ymm8, %ymm8
vpermpd $216, %ymm7, %ymm7
vunpcklpd %ymm7, %ymm1, %ymm1
vmovupd (%r10), %ymm7
vpermpd $216, %ymm15, %ymm15
vunpcklpd %ymm8, %ymm15, %ymm10
vunpckhpd %ymm8, %ymm15, %ymm15
vunpcklpd 32(%r10), %ymm7, %ymm8
vmovupd 64(%r10), %ymm7
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm3, %ymm3
vunpcklpd %ymm1, %ymm3, %ymm3
vunpcklpd 96(%r10), %ymm7, %ymm1
vmovupd 128(%r10), %ymm7
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm8, %ymm8
vunpcklpd %ymm1, %ymm8, %ymm8
vunpcklpd 160(%r10), %ymm7, %ymm1
vmovupd 192(%r10), %ymm7
vpermpd $216, %ymm1, %ymm1
vunpcklpd 224(%r10), %ymm7, %ymm7
vpermpd $216, %ymm8, %ymm8
vpermpd $216, %ymm7, %ymm7
vunpcklpd %ymm7, %ymm1, %ymm1
vmovupd (%r10), %ymm7
vpermpd $216, %ymm1, %ymm1
vunpcklpd %ymm1, %ymm8, %ymm8
vunpckhpd 32(%r10), %ymm7, %ymm1
vmovupd 64(%r10), %ymm7
vpermpd $216, %ymm1, %ymm1
vunpckhpd 96(%r10), %ymm7, %ymm7
vmovupd 192(%r10), %ymm11
vpermpd $216, %ymm7, %ymm7
vunpcklpd %ymm7, %ymm1, %ymm1
vmovupd 128(%r10), %ymm7
vunpckhpd 224(%r10), %ymm11, %ymm11
vunpckhpd 160(%r10), %ymm7, %ymm7
vpermpd $216, %ymm11, %ymm11
vpermpd $216, %ymm7, %ymm7
vunpcklpd %ymm11, %ymm7, %ymm7
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm7, %ymm7
vunpcklpd %ymm7, %ymm1, %ymm7
vmovupd (%r9), %ymm1
vpermpd $216, %ymm7, %ymm7
vmovupd %ymm7, 32(%rsp)
vunpcklpd 32(%r9), %ymm1, %ymm7
vmovupd 64(%r9), %ymm1
vpermpd $216, %ymm7, %ymm7
vunpcklpd 96(%r9), %ymm1, %ymm1
vmovupd 192(%r9), %ymm14
vpermpd $216, %ymm1, %ymm1
vunpcklpd %ymm1, %ymm7, %ymm7
vmovupd 128(%r9), %ymm1
vunpcklpd 224(%r9), %ymm14, %ymm11
vunpcklpd 160(%r9), %ymm1, %ymm1
vpermpd $216, %ymm11, %ymm11
vpermpd $216, %ymm1, %ymm1
vunpcklpd %ymm11, %ymm1, %ymm1
vpermpd $216, %ymm1, %ymm1
vpermpd $216, %ymm7, %ymm7
vmovupd 64(%r9), %ymm11
vunpcklpd %ymm1, %ymm7, %ymm7
vmovupd (%r9), %ymm1
vunpckhpd 96(%r9), %ymm11, %ymm11
vunpckhpd 32(%r9), %ymm1, %ymm1
vmovupd 128(%r9), %ymm14
vpermpd $216, %ymm11, %ymm11
vpermpd $216, %ymm1, %ymm1
vunpcklpd %ymm11, %ymm1, %ymm1
vunpckhpd 160(%r9), %ymm14, %ymm11
vmovupd 192(%r9), %ymm14
vpermpd $216, %ymm11, %ymm11
vunpckhpd 224(%r9), %ymm14, %ymm14
vpermpd $216, %ymm10, %ymm10
vpermpd $216, %ymm14, %ymm14
vunpcklpd %ymm14, %ymm11, %ymm11
vpermpd $216, %ymm2, %ymm2
vpermpd $216, %ymm11, %ymm11
vpermpd $216, %ymm1, %ymm1
vpermpd $68, %ymm10, %ymm14
vpermpd $216, %ymm0, %ymm0
vpermpd $216, %ymm9, %ymm9
vunpcklpd %ymm11, %ymm1, %ymm1
vpermpd $238, %ymm10, %ymm10
vpermpd $68, %ymm2, %ymm11
vpermpd $238, %ymm2, %ymm2
vshufpd $12, %ymm11, %ymm14, %ymm11
vshufpd $12, %ymm2, %ymm10, %ymm2
vpermpd $216, %ymm15, %ymm15
vpermpd $68, %ymm0, %ymm10
vpermpd $216, %ymm8, %ymm8
vpermpd $68, %ymm9, %ymm14
vpermpd $238, %ymm0, %ymm0
vpermpd $238, %ymm9, %ymm9
vshufpd $12, %ymm9, %ymm0, %ymm0
vpermpd $216, %ymm6, %ymm6
vmovupd %ymm0, 64(%rsp)
vpermpd $68, %ymm15, %ymm9
vpermpd $216, %ymm7, %ymm7
vpermpd $68, %ymm8, %ymm0
vshufpd $12, %ymm14, %ymm10, %ymm14
vshufpd $12, %ymm0, %ymm9, %ymm0
vpermpd $216, %ymm13, %ymm13
vpermpd $68, %ymm7, %ymm9
vpermpd $216, %ymm4, %ymm4
vpermpd $238, %ymm8, %ymm8
vpermpd $68, %ymm6, %ymm10
vpermpd $238, %ymm7, %ymm7
vpermpd $238, %ymm6, %ymm6
vpermpd $238, %ymm15, %ymm15
vshufpd $12, %ymm8, %ymm15, %ymm15
vshufpd $12, %ymm9, %ymm10, %ymm10
vshufpd $12, %ymm7, %ymm6, %ymm8
vmovupd %ymm10, 128(%rsp)
vpermpd $68, %ymm4, %ymm6
vpermpd $68, %ymm13, %ymm10
vshufpd $12, %ymm6, %ymm10, %ymm10
vpermpd $216, %ymm12, %ymm12
vmovupd 32(%rsp), %ymm6
vpermpd $216, %ymm3, %ymm3
vpermpd $238, %ymm13, %ymm13
vpermpd $238, %ymm4, %ymm4
vmovupd %ymm15, 96(%rsp)
vmovupd %ymm8, 160(%rsp)
vshufpd $12, %ymm4, %ymm13, %ymm15
vpermpd $216, %ymm5, %ymm5
vpermpd $68, %ymm3, %ymm4
vpermpd $68, %ymm12, %ymm8
vpermpd $238, %ymm3, %ymm3
vpermpd $238, %ymm12, %ymm12
vshufpd $12, %ymm4, %ymm8, %ymm8
vshufpd $12, %ymm3, %ymm12, %ymm12
vpermpd $68, %ymm5, %ymm4
vpermpd $68, %ymm6, %ymm3
vpermpd $238, %ymm5, %ymm5
vpermpd $238, %ymm6, %ymm6
vshufpd $12, %ymm6, %ymm5, %ymm6
vmovupd (%rsp), %ymm5
vpermpd $216, %ymm1, %ymm1
vshufpd $12, %ymm3, %ymm4, %ymm3
vpermpd $68, %ymm5, %ymm7
vpermpd $68, %ymm1, %ymm4
vpermpd $238, %ymm5, %ymm5
vpermpd $238, %ymm1, %ymm1
vshufpd $12, %ymm1, %ymm5, %ymm5
vshufpd $12, %ymm4, %ymm7, %ymm7
vpermpd $68, %ymm10, %ymm1
vpermpd $68, %ymm11, %ymm4
vpermpd $68, %ymm2, %ymm9
vshufpd $12, %ymm1, %ymm4, %ymm4
vpermpd $238, %ymm10, %ymm10
vpermpd $68, %ymm15, %ymm1
vpermpd $238, %ymm2, %ymm2
vpermpd $238, %ymm15, %ymm15
vpermpd $238, %ymm11, %ymm11
vshufpd $12, %ymm10, %ymm11, %ymm11
vshufpd $12, %ymm1, %ymm9, %ymm1
vshufpd $12, %ymm15, %ymm2, %ymm10
vpermpd $68, %ymm14, %ymm9
vpermpd $68, %ymm8, %ymm2
vpermpd $238, %ymm14, %ymm14
vpermpd $238, %ymm8, %ymm8
vshufpd $12, %ymm8, %ymm14, %ymm8
vmovupd 64(%rsp), %ymm14
vshufpd $12, %ymm2, %ymm9, %ymm2
vpermpd $68, %ymm14, %ymm9
vmovupd %ymm2, 32(%rsp)
vpermpd $68, %ymm12, %ymm2
vshufpd $12, %ymm2, %ymm9, %ymm13
vpermpd $238, %ymm12, %ymm12
vpermpd $238, %ymm14, %ymm9
vmovupd 96(%rsp), %ymm15
vshufpd $12, %ymm12, %ymm9, %ymm14
vpermpd $68, %ymm3, %ymm2
vpermpd $68, %ymm0, %ymm9
vpermpd $238, %ymm3, %ymm3
vpermpd $238, %ymm0, %ymm0
vshufpd $12, %ymm3, %ymm0, %ymm12
vmovupd %ymm13, 64(%rsp)
vpermpd $68, %ymm6, %ymm0
vpermpd $238, %ymm6, %ymm13
vmovupd 128(%rsp), %ymm6
vpermpd $68, %ymm15, %ymm3
vpermpd $238, %ymm15, %ymm15
vshufpd $12, %ymm2, %ymm9, %ymm2
vshufpd $12, %ymm13, %ymm15, %ymm13
vpermpd $238, %ymm6, %ymm9
vpermpd $68, %ymm6, %ymm15
vmovupd 160(%rsp), %ymm6
vshufpd $12, %ymm0, %ymm3, %ymm3
vpermpd $68, %ymm7, %ymm0
vpermpd $238, %ymm7, %ymm7
vshufpd $12, %ymm7, %ymm9, %ymm9
vshufpd $12, %ymm0, %ymm15, %ymm15
vpermpd $68, %ymm6, %ymm7
vpermpd $68, %ymm5, %ymm0
vshufpd $12, %ymm0, %ymm7, %ymm7
vmovupd %ymm14, 192(%rsp)
vpermpd $68, %ymm2, %ymm0
vpermpd $238, %ymm5, %ymm14
vpermpd $238, %ymm2, %ymm2
vpermpd $68, %ymm4, %ymm5
vpermpd $238, %ymm4, %ymm4
vshufpd $12, %ymm0, %ymm5, %ymm5
vshufpd $12, %ymm2, %ymm4, %ymm4
vpermpd $68, %ymm12, %ymm0
vpermpd $68, %ymm3, %ymm2
vmovupd %ymm5, (%rsp)
vmovupd %ymm4, 96(%rsp)
vpermpd $68, %ymm11, %ymm5
vpermpd $68, %ymm1, %ymm4
vpermpd $238, %ymm3, %ymm3
vpermpd $238, %ymm1, %ymm1
vshufpd $12, %ymm0, %ymm5, %ymm5
vshufpd $12, %ymm2, %ymm4, %ymm4
vshufpd $12, %ymm3, %ymm1, %ymm1
vpermpd $68, %ymm13, %ymm2
vpermpd $238, %ymm12, %ymm0
vpermpd $68, %ymm10, %ymm3
vmovupd 32(%rsp), %ymm12
vshufpd $12, %ymm2, %ymm3, %ymm3
vpermpd $238, %ymm6, %ymm6
vpermpd $238, %ymm10, %ymm2
vpermpd $238, %ymm13, %ymm13
vshufpd $12, %ymm14, %ymm6, %ymm14
vshufpd $12, %ymm13, %ymm2, %ymm2
vpermpd $68, %ymm15, %ymm6
vpermpd $68, %ymm12, %ymm13
vpermpd $238, %ymm15, %ymm15
vpermpd $238, %ymm12, %ymm12
vshufpd $12, %ymm15, %ymm12, %ymm12
vpermpd $238, %ymm11, %ymm11
vmovupd 64(%rsp), %ymm15
vshufpd $12, %ymm0, %ymm11, %ymm0
vpermpd $238, %ymm9, %ymm10
vpermpd $68, %ymm8, %ymm11
vpermpd $238, %ymm8, %ymm8
vshufpd $12, %ymm6, %ymm13, %ymm13
vshufpd $12, %ymm10, %ymm8, %ymm10
vpermpd $68, %ymm9, %ymm6
vpermpd $238, %ymm15, %ymm8
vpermpd $68, %ymm15, %ymm9
vmovupd 192(%rsp), %ymm15
vshufpd $12, %ymm6, %ymm11, %ymm11
vpermpd $68, %ymm7, %ymm6
vpermpd $238, %ymm7, %ymm7
vshufpd $12, %ymm6, %ymm9, %ymm9
vshufpd $12, %ymm7, %ymm8, %ymm8
vpermpd $68, %ymm14, %ymm6
vpermpd $68, %ymm15, %ymm7
vshufpd $12, %ymm6, %ymm7, %ymm7
vpermpd $238, %ymm14, %ymm14
vpermpd $238, %ymm15, %ymm6
vshufpd $12, %ymm14, %ymm6, %ymm6
vpermpd $68, (%rsp), %ymm14
vpermpd $68, %ymm13, %ymm15
vshufpd $12, %ymm15, %ymm14, %ymm14
vmovupd 96(%rsp), %ymm15
vmovupd %ymm14, (%rax)
vpermpd $238, (%rsp), %ymm14
vpermpd $238, %ymm13, %ymm13
vshufpd $12, %ymm13, %ymm14, %ymm13
vpermpd $68, %ymm12, %ymm14
vmovupd %ymm13, 32(%rax)
vpermpd $68, %ymm15, %ymm13
vshufpd $12, %ymm14, %ymm13, %ymm13
vpermpd $238, %ymm12, %ymm12
vmovupd %ymm13, 64(%rax)
vpermpd $238, %ymm15, %ymm13
vshufpd $12, %ymm12, %ymm13, %ymm12
vpermpd $68, %ymm11, %ymm13
vmovupd %ymm12, 96(%rax)
vpermpd $238, %ymm11, %ymm11
vpermpd $68, %ymm5, %ymm12
vpermpd $238, %ymm5, %ymm5
vshufpd $12, %ymm11, %ymm5, %ymm5
vpermpd $68, %ymm10, %ymm11
vmovupd %ymm5, 160(%rax)
vpermpd $238, %ymm10, %ymm10
vpermpd $68, %ymm0, %ymm5
vpermpd $238, %ymm0, %ymm0
vshufpd $12, %ymm11, %ymm5, %ymm5
vshufpd $12, %ymm10, %ymm0, %ymm0
vmovupd %ymm5, 192(%rax)
vmovupd %ymm0, 224(%rax)
vpermpd $68, %ymm9, %ymm5
vpermpd $68, %ymm4, %ymm0
vpermpd $238, %ymm9, %ymm9
vpermpd $238, %ymm4, %ymm4
vshufpd $12, %ymm5, %ymm0, %ymm0
vshufpd $12, %ymm9, %ymm4, %ymm4
vmovupd %ymm0, 256(%rax)
vmovupd %ymm4, 288(%rax)
vpermpd $68, %ymm1, %ymm0
vpermpd $68, %ymm8, %ymm4
vpermpd $238, %ymm1, %ymm1
vpermpd $238, %ymm8, %ymm8
vshufpd $12, %ymm4, %ymm0, %ymm0
vshufpd $12, %ymm8, %ymm1, %ymm1
vmovupd %ymm0, 320(%rax)
vmovupd %ymm1, 352(%rax)
vpermpd $68, %ymm3, %ymm0
vpermpd $68, %ymm7, %ymm1
vshufpd $12, %ymm1, %ymm0, %ymm0
vpermpd $238, %ymm3, %ymm3
vmovupd %ymm0, 384(%rax)
vpermpd $68, %ymm6, %ymm1
vpermpd $68, %ymm2, %ymm0
vpermpd $238, %ymm7, %ymm7
vpermpd $238, %ymm2, %ymm2
vpermpd $238, %ymm6, %ymm6
addq $256, %rcx
vshufpd $12, %ymm13, %ymm12, %ymm12
vshufpd $12, %ymm7, %ymm3, %ymm3
vmovupd %ymm12, 128(%rax)
vmovupd %ymm3, 416(%rax)
vshufpd $12, %ymm1, %ymm0, %ymm0
vshufpd $12, %ymm6, %ymm2, %ymm2
vmovupd %ymm0, 448(%rax)
vmovupd %ymm2, 480(%rax)
addq $256, %rdx
addq $256, %r11
addq $256, %r10
addq $256, %r9
addq $512, %rax
cmpq %rcx, %rbp
jne .L4
I am not kidding.
gcc 10.2 at -O3 really generates code that is approximately 3 times slower than
the scalar -O2 output and probably 4-4.5 times slower than good SIMD code
similar to what was generated in Part 1. The instruction counts alone tell the
story: the -O3 loop above issues well over 400 instructions to process 4
iterations' worth of data, i.e. more than 100 per original iteration, versus
fewer than 40 for the -O2 loop.
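For comparison, here is a hand-written sketch of the kind of code I would
expect for foo_i2: the Part 1 shuffle pattern applied twice per iteration, once
per source row. The names foo_i2_avx2 and pack4 are mine, for illustration
only; compile with -mavx2.

#include <immintrin.h>

typedef struct { double re, im; } dcmlx_t;
typedef struct { double re[4], im[4]; } dcmlx4_t;

/* Pack 4 consecutive AoS complex numbers into one dcmlx4_t. */
static inline void pack4(double re_dst[4], double im_dst[4], const dcmlx_t* s)
{
  __m256d ab = _mm256_loadu_pd(&s[0].re);     /* re0 im0 re1 im1 */
  __m256d cd = _mm256_loadu_pd(&s[2].re);     /* re2 im2 re3 im3 */
  __m256d re = _mm256_unpacklo_pd(ab, cd);    /* re0 re2 re1 re3 */
  __m256d im = _mm256_unpackhi_pd(ab, cd);    /* im0 im2 im1 im3 */
  _mm256_storeu_pd(re_dst, _mm256_permute4x64_pd(re, 0xD8));
  _mm256_storeu_pd(im_dst, _mm256_permute4x64_pd(im, 0xD8));
}

void foo_i2_avx2(dcmlx4_t dst[], const dcmlx_t src[], int n)
{
  for (int i = 0; i < n; ++i) {
    pack4(dst[i*2+0].re, dst[i*2+0].im, &src[i*4+0]);    /* row 0 */
    pack4(dst[i*2+1].re, dst[i*2+1].im, &src[i*4+0+n]);  /* row 1 */
  }
}

Per iteration that is 4 loads, 4 unpacks, 4 permutes and 4 stores, plus loop
overhead.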
My guess is that, once again, as in nearly all my complaints of recent months,
this is a case of an earlier optimization phase producing a mess that totally
confuses a later stage. I just can't guess which stage is at fault this time.
You have so many.