https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88497
kelvin at gcc dot gnu.org changed: What |Removed |Added ---------------------------------------------------------------------------- CC| |kelvin at gcc dot gnu.org, | |rguenther at suse dot de, | |segher at gcc dot gnu.org, | |wschmidt at gcc dot gnu.org --- Comment #1 from kelvin at gcc dot gnu.org --- Consider the following loop: double sacc = 0.00; extern double x[], y[]; for (unsigned long long int i = 0; i < N; i++) sacc += x[i] * y[i]; Auto-vectorization turns the body of the loop into something close to the following function foo: double foo (double accumulator, vector double arg2[], vector double arg3[]) { vector double temp; temp = arg2[0] * arg3[0]; accumulator += temp[0] + temp[1]; temp = arg2[1] * arg3[1]; accumulator += temp[0] + temp[1]; temp = arg2[2] * arg3[2]; accumulator += temp[0] + temp[1]; temp = arg2[3] * arg3[3]; accumulator += temp[0] + temp[1]; return accumulator; } Compiled with -O3 -mcpu=power9 -ffast-math, this translates into 25 instructions: foo: .LFB11: .cfi_startproc lxv 6,0(5) lxv 10,0(4) lxv 7,16(5) lxv 11,16(4) lxv 8,32(5) lxv 12,32(4) lxv 9,48(5) lxv 0,48(4) xvmuldp 10,10,6 xvmuldp 11,11,7 xvmuldp 12,12,8 xvmuldp 0,0,9 xxpermdi 7,10,10,3 xxpermdi 8,11,11,3 fadd 10,7,10 xxpermdi 9,12,12,3 fadd 11,8,11 xxpermdi 6,0,0,3 fadd 12,9,12 fadd 0,6,0 fadd 10,10,1 fadd 11,11,10 fadd 1,12,11 fadd 1,0,1 blr If auto-vectorization were to transform this loop into the following equivalent code, the resulting translation would be only 18 instructions: double foo (double accumulator, vector double arg2[], vector double arg3[]) { vector double temp; temp[0] = accumulator; temp[1] = 0.0; temp += arg2[0] * arg3[0]; temp += arg2[1] * arg3[1]; temp += arg2[2] * arg3[2]; temp += arg2[3] * arg3[3]; return temp[0] + temp[1]; } foo: .LFB11: .cfi_startproc li 9,0 lxv 10,0(4) lxv 6,0(5) lxv 11,16(4) lxv 7,16(5) mtvsrd 0,9 lxv 12,32(4) lxv 8,32(5) lxv 9,48(5) xxpermdi 1,0,1,0 lxv 0,48(4) xvmaddadp 1,10,6 xvmaddadp 1,11,7 xvmaddadp 1,12,8 xvmaddadp 1,0,9 xxpermdi 
0,1,1,3 fadd 1,0,1 blr I have also experimented with trunk's treatment of x86 targets, and the same optimization is relevant there: x86 -O3 -ffast-math optimized translation of the "original" source is: _foo: LFB1: ;; 17 insns in original code ;; movapd: 4 ;; mulpd: 4 ;; addsd: 4 ;; haddpd: 4 ;; ret: 1 ;; total: 17 movapd 32(%rdi), %xmm2 ; load arg2[2] mulpd 32(%rsi), %xmm2 ; multiply arg2[2] * arg3[2] movapd (%rdi), %xmm1 ; load arg2[0] movapd 16(%rdi), %xmm3 ; load arg2[1] mulpd (%rsi), %xmm1 ; multiply arg2[0] * arg3[0] mulpd 16(%rsi), %xmm3 ; multiply arg2[1] * arg3[1] haddpd %xmm2, %xmm2 ; sum args[2] products haddpd %xmm1, %xmm1 ; sum args[0] products haddpd %xmm3, %xmm3 ; sum args[1] products addsd %xmm0, %xmm2 ; add args[2] sum into accumulator movapd 48(%rdi), %xmm0 ; load arg2[3] mulpd 48(%rsi), %xmm0 ; multiply arg2[3] * arg3[3] addsd %xmm3, %xmm1 ; add args[0] and args[1] products addsd %xmm2, %xmm1 ; accumulate args[0..2] products haddpd %xmm0, %xmm0 ; sum args[3] products addsd %xmm1, %xmm0 ; accumulate args[0..3] ret The optimized translation of the "improved" source on x86 is: _foo: LFB1: ;; 15 insns in translation of improved source code ;; movq: 1 ;; movapd: 4 (load vector) ;; mulpd: 4 ;; addpd: 4 ;; haddpd: 1 ;; ret: 1 ;; total: 15 movapd (%rdi), %xmm1 ; load arg2[0] movq %xmm0, %xmm0 ; move quad word: I don't understand this (it appears to zero the upper 64 bits of %xmm0, i.e. temp[1] = 0.0). mulpd (%rsi), %xmm1 ; multiply arg2[0] * arg3[0] movapd 16(%rdi), %xmm2 ; load arg2[1] mulpd 16(%rsi), %xmm2 ; multiply arg2[1] * arg3[1] movapd 48(%rdi), %xmm3 ; load arg2[3] mulpd 48(%rsi), %xmm3 ; multiply arg2[3] * arg3[3] addpd %xmm2, %xmm1 ; add args[0] and args[1] products movapd 32(%rdi), %xmm2 ; load arg2[2] mulpd 32(%rsi), %xmm2 ; multiply arg2[2] * arg3[2] addpd %xmm3, %xmm2 ; add args[2] and args[3] products addpd %xmm2, %xmm1 ; add all args products addpd %xmm1, %xmm0 ; Add sums to accumulator haddpd %xmm0, %xmm0 ; add double elements of vector ret