https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79102
Bug ID: 79102
Summary: gcc fails to auto-vectorise the product of an array of complex floats
Product: gcc
Version: 7.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
Assignee: unassigned at gcc dot gnu.org
Reporter: drraph at gmail dot com
Target Milestone: ---

Consider this simple piece of code.

#include <complex.h>
complex float f(complex float x[]) {
  complex float p = 1.0;
  for (int i = 0; i < 128; i++)
    p *= x[i];
  return p;
}

If I compile it with -O3 -march=bdver2 -ffast-math I get

f:
        vmovss xmm2, DWORD PTR .LC1[rip]
        vxorps xmm1, xmm1, xmm1
        lea rax, [rdi+256]
.L2:
        vmovss xmm0, DWORD PTR [rdi+4]
        add rdi, 8
        vmulss xmm3, xmm0, xmm2
        vmulss xmm0, xmm0, xmm1
        vfmadd132ss xmm1, xmm3, DWORD PTR [rdi-8]
        vfmsub132ss xmm2, xmm0, DWORD PTR [rdi-8]
        cmp rax, rdi
        jne .L2
        vmovss DWORD PTR [rsp-8], xmm2
        vmovss DWORD PTR [rsp-4], xmm1
        vmovq xmm0, QWORD PTR [rsp-8]
        ret
.LC1:
        .long 1065353216

This is unvectorised code. However, if I do the same using float instead, that is with:

float f(float x[], int n ) {
  float p = 1.0;
  for (int i = 0; i < 32; i++)
    p *= x[i];
  return p;
}

I get

        vmovups xmm2, XMMWORD PTR [rdi]
        vmulps xmm0, xmm2, XMMWORD PTR [rdi+16]
        vmulps xmm0, xmm0, XMMWORD PTR [rdi+32]
        vmulps xmm0, xmm0, XMMWORD PTR [rdi+48]
        vmulps xmm0, xmm0, XMMWORD PTR [rdi+64]
        vmulps xmm0, xmm0, XMMWORD PTR [rdi+80]
        vmulps xmm0, xmm0, XMMWORD PTR [rdi+96]
        vmulps xmm0, xmm0, XMMWORD PTR [rdi+112]
        vpsrldq xmm1, xmm0, 8
        vmulps xmm0, xmm0, xmm1
        vpsrldq xmm1, xmm0, 4
        vmulps xmm0, xmm0, xmm1
        ret

This is vectorised. As a test I also tried the Intel C compiler version 17.
In this case, however, the assembly you get using complex float is vectorised, giving:

f:
        mov rdx, rdi  #4.3
        and rdx, 15  #4.3
        movsd xmm0, QWORD PTR p.152.0.0.1[rip]  #3.19
        test dl, dl  #4.3
        je ..B1.4  # Prob 50%  #4.3
        test dl, 7  #4.3
        jne ..B1.12  # Prob 10%  #4.3
        movsd xmm0, QWORD PTR [rdi]  #5.10
        mov dl, 1  #4.3
..B1.4:  # Preds ..B1.3 ..B1.1
        movzx eax, dl  #4.3
        neg dl  #4.3
        and dl, 3  #4.3
        movzx edx, dl  #4.3
        movss xmm1, DWORD PTR .L_2il0floatpacket.0[rip]  #3.19
        neg rdx  #4.3
        movlhps xmm0, xmm1  #3.19
        add rdx, 128  #4.3
..B1.5:  # Preds ..B1.5 ..B1.4
        movaps xmm2, xmm0  #5.5
        movups xmm1, XMMWORD PTR [rdi+rax*8]  #5.10
        shufps xmm2, xmm0, 160  #5.5
        mulps xmm2, xmm1  #5.5
        xorps xmm1, XMMWORD PTR .L_2il0floatpacket.1[rip]  #5.5
        shufps xmm1, xmm1, 177  #5.5
        shufps xmm0, xmm0, 245  #5.5
        mulps xmm1, xmm0  #5.5
        movups xmm3, XMMWORD PTR [16+rdi+rax*8]  #5.10
        add rax, 4  #4.3
        addps xmm2, xmm1  #5.5
        movaps xmm0, xmm2  #5.5
        shufps xmm0, xmm2, 160  #5.5
        mulps xmm0, xmm3  #5.5
        xorps xmm3, XMMWORD PTR .L_2il0floatpacket.1[rip]  #5.5
        shufps xmm3, xmm3, 177  #5.5
        shufps xmm2, xmm2, 245  #5.5
        mulps xmm3, xmm2  #5.5
        addps xmm0, xmm3  #5.5
        cmp rax, rdx  #4.3
        jb ..B1.5  # Prob 99%  #4.3
        movaps xmm1, xmm0  #3.19
        movhlps xmm1, xmm0  #3.19
        movaps xmm2, xmm1  #3.19
        shufps xmm2, xmm1, 160  #3.19
        mulps xmm2, xmm0  #3.19
        xorps xmm0, XMMWORD PTR .L_2il0floatpacket.1[rip]  #3.19
        shufps xmm0, xmm0, 177  #3.19
        shufps xmm1, xmm1, 245  #3.19
        mulps xmm0, xmm1  #3.19
        addps xmm0, xmm2  #3.19
..B1.7:  # Preds ..B1.6 ..B1.12
        cmp rdx, 128  #4.3
        jae ..B1.11  # Prob 0%  #4.3
..B1.9:  # Preds ..B1.7 ..B1.9
        movsd xmm1, QWORD PTR [rdi+rdx*8]  #5.10
        inc rdx  #4.3
        movaps xmm2, xmm1  #5.5
        shufps xmm2, xmm1, 160  #5.5
        mulps xmm2, xmm0  #5.5
        xorps xmm0, XMMWORD PTR .L_2il0floatpacket.1[rip]  #5.5
        shufps xmm0, xmm0, 177  #5.5
        shufps xmm1, xmm1, 245  #5.5
        mulps xmm0, xmm1  #5.5
        addps xmm0, xmm2  #5.5
        cmp rdx, 128  #4.3
        jb ..B1.9  # Prob 99%  #4.3
..B1.11:  # Preds ..B1.9 ..B1.7
        ret  #6.10
..B1.12:  # Preds ..B1.2
        xor edx, edx  #4.3
        jmp ..B1.7  # Prob 100%  #4.3
p.152.0.0.1:
        .long 0x3f800000,0x00000000
.L_2il0floatpacket.1:
        .long 0x00000000,0x80000000,0x00000000,0x80000000
.L_2il0floatpacket.0:
        .long 0x3f800000