[Bug middle-end/88497] Improve Accumulation in Auto-Vectorized Code

kelvin at gcc dot gnu.org Fri, 14 Dec 2018 06:12:33 -0800

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88497


kelvin at gcc dot gnu.org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |kelvin at gcc dot gnu.org,
                   |                            |rguenther at suse dot de,
                   |                            |segher at gcc dot gnu.org,
                   |                            |wschmidt at gcc dot gnu.org

--- Comment #1 from kelvin at gcc dot gnu.org ---
Consider the following loop:

double sacc = 0.00;
extern double x[], y[];
for (unsigned long long int i = 0; i < N; i++)
  sacc += x[i] * y[i];

Auto-vectorization turns the body of the loop into something close to the
following function foo:

double foo (double accumulator, vector double arg2[], vector double arg3[])
{
  vector double temp;

  temp = arg2[0] * arg3[0];
  accumulator += temp[0] + temp[1];
  temp = arg2[1] * arg3[1];
  accumulator += temp[0] + temp[1];
  temp = arg2[2] * arg3[2];
  accumulator += temp[0] + temp[1];
  temp = arg2[3] * arg3[3];
  accumulator += temp[0] + temp[1];
  return accumulator;
}

Compiled with -O3 -mcpu=power9 -ffast-math, this translates into 25
instructions:

foo:
.LFB11:
        .cfi_startproc
        lxv 6,0(5)
        lxv 10,0(4)
        lxv 7,16(5)
        lxv 11,16(4)
        lxv 8,32(5)
        lxv 12,32(4)
        lxv 9,48(5)
        lxv 0,48(4)
        xvmuldp 10,10,6
        xvmuldp 11,11,7
        xvmuldp 12,12,8
        xvmuldp 0,0,9
        xxpermdi 7,10,10,3
        xxpermdi 8,11,11,3
        fadd 10,7,10
        xxpermdi 9,12,12,3
        fadd 11,8,11
        xxpermdi 6,0,0,3
        fadd 12,9,12
        fadd 0,6,0
        fadd 10,10,1
        fadd 11,11,10
        fadd 1,12,11
        fadd 1,0,1
        blr


If auto-vectorization were instead to transform this loop into the following
code (equivalent under -ffast-math, which permits reassociating the sum), the
resulting translation would be only 18 instructions:

double foo (double accumulator, vector double arg2[], vector double arg3[])
{
  vector double temp;

  temp[0] = accumulator;
  temp[1] = 0.0;
  temp += arg2[0] * arg3[0];
  temp += arg2[1] * arg3[1];
  temp += arg2[2] * arg3[2];
  temp += arg2[3] * arg3[3];
  return temp[0] + temp[1];
}

foo:                                                                            
.LFB11:                                                                         
        .cfi_startproc                                                          
        li 9,0                                                                  
        lxv 10,0(4)                                                             
        lxv 6,0(5)                                                              
        lxv 11,16(4)                                                            
        lxv 7,16(5)                                                             
        mtvsrd 0,9                                                              
        lxv 12,32(4)                                                            
        lxv 8,32(5)                                                             
        lxv 9,48(5)                                                             
        xxpermdi 1,0,1,0                                                        
        lxv 0,48(4)                                                             
        xvmaddadp 1,10,6                                                        
        xvmaddadp 1,11,7                                                        
        xvmaddadp 1,12,8                                                        
        xvmaddadp 1,0,9                                                         
        xxpermdi 0,1,1,3                                                        
        fadd 1,0,1                                                              
        blr                  

I have also experimented with trunk's treatment of x86 targets, and the same
optimization is relevant there:


The x86 -O3 -ffast-math optimized translation of the "original" source is:

_foo:
LFB1:
        ;; 17 insns in original code                                            
        ;;  movapd:     4
        ;;  mulpd:      4                                                       
        ;;  addsd:      4                                                       
        ;;  haddpd:     4                                                       
        ;;  ret:        1                                                       
        ;;  total:     17                                                       

        movapd  32(%rdi), %xmm2 ; load arg2[2]                                  
        mulpd   32(%rsi), %xmm2 ; multiply arg2[2] * arg3[2]                    
        movapd  (%rdi), %xmm1   ; load arg2[0]                                  
        movapd  16(%rdi), %xmm3 ; load arg2[1]                                  
        mulpd   (%rsi), %xmm1   ; multiply arg2[0] * arg3[0]                    
        mulpd   16(%rsi), %xmm3 ; multiply arg2[1] * arg3[1]                    
        haddpd  %xmm2, %xmm2    ; sum args[2] products                          
        haddpd  %xmm1, %xmm1    ; sum args[0] products                          
        haddpd  %xmm3, %xmm3    ; sum args[1] products                          
        addsd   %xmm0, %xmm2    ; add args[2] sum into accumulator              
        movapd  48(%rdi), %xmm0 ; load arg2[3]                                  
        mulpd   48(%rsi), %xmm0 ; multiply arg2[3] * arg3[3]                    
        addsd   %xmm3, %xmm1    ; add args[0] and args[1] products              
        addsd   %xmm2, %xmm1    ; accumulate args[0..2] products                
        haddpd  %xmm0, %xmm0    ; sum args[3] products                          
        addsd   %xmm1, %xmm0    ; accumulate args[0..3]                         
        ret

The optimized translation of the "improved" source on x86 is:

_foo:
LFB1:
        ;; 15 insns in translation of improved source code                      
        ;;  movq:       1                                                       
        ;;  movapd:     4       (load vector)
        ;;  mulpd:      4                                                       
        ;;  addpd:      4                                                       
        ;;  haddpd:     1                                                       
        ;;  ret:        1                                                       
        ;;  total:     15                                                       

        movapd  (%rdi), %xmm1   ; load arg2[0]                                  
        movq    %xmm0, %xmm0    ; move quad word: zeroes upper half (temp[1] = 0.0)
        mulpd   (%rsi), %xmm1   ; multiply arg2[0] * arg3[0]                    
        movapd  16(%rdi), %xmm2 ; load arg2[1]                                  
        mulpd   16(%rsi), %xmm2 ; multiply arg2[1] * arg3[1]                    
        movapd  48(%rdi), %xmm3 ; load arg2[3]                                  
        mulpd   48(%rsi), %xmm3 ; multiply arg2[3] * arg3[3]                    
        addpd   %xmm2, %xmm1    ; add args[0] and args[1] products              
        movapd  32(%rdi), %xmm2 ; load arg2 [2]                                 
        mulpd   32(%rsi), %xmm2 ; multiply arg2[2] * arg3[2]                    
        addpd   %xmm3, %xmm2    ; add args[2] and args[3] products              
        addpd   %xmm2, %xmm1    ; adds all args products                        
        addpd   %xmm1, %xmm0    ; Add sums to accumulator                       
        haddpd  %xmm0, %xmm0    ; add double elements of vector                 
        ret

[Bug middle-end/88497] Improve Accumulation in Auto-Vectorized Code

Reply via email to