--- Comment #22 from ktkachov at gcc dot ---
Some more experiments...
Unrolling 4x in a similar way to my previous example and not splitting the
accumulator (separate issue):

/* Test-case globals.  The file-scope `colnums' is not referenced by foo
   (which reads cols->colnums instead); it is kept so the declarations
   match the original snippet.  */
unsigned int *colnums;
/* Coefficient array, indexed by the offsets stored in cols->rowstart.  */
double *val;

/* Row-compressed sparsity description, as used by foo:
   rows      - number of rows to process;
   colnums   - for each stored coefficient, the index into `src';
   rowstart  - rows + 1 offsets; row R owns the coefficients in
               [rowstart[R], rowstart[R + 1]).  */
struct foostruct
{
  unsigned int rows;
  unsigned int *colnums;
  unsigned int *rowstart;
};

struct foostruct *cols;

/* For every row R in [0, cols->rows) store in DST[R] the sum of
   val[K] * SRC[cols->colnums[K]] over that row's coefficients.

   The inner loop is unrolled 4x by hand: a remainder of one and/or two
   elements is peeled off first, so the main loop always advances by
   exactly four elements.  All products are added into the single
   accumulator `s' (the accumulator is deliberately not split).

   val_ptr and colnum_ptr are never reset between rows, so the rows are
   assumed to be stored back to back with non-decreasing rowstart[]
   entries; a decreasing entry would make the `!=' loop below run past
   the end of the row.  */
void
foo (double * __restrict__ dst, const double *__restrict__ src)
{
  const unsigned int n_rows = cols->rows;
  const double *val_ptr = &val[cols->rowstart[0]];
  const unsigned int *colnum_ptr = &cols->colnums[cols->rowstart[0]];

  double *dst_ptr = dst;
  for (unsigned int row = 0; row < n_rows; ++row)
    {
      double s = 0.;
      const double *const val_end_of_row = &val[cols->rowstart[row + 1]];
      /* Number of coefficients left in this row.  */
      __PTRDIFF_TYPE__ diff = val_end_of_row - val_ptr;

      /* Peel a single element when the count is odd...  */
      if (diff & 1)
        s += *val_ptr++ * src[*colnum_ptr++];

      /* ...and a pair when bit 1 is set, leaving a multiple of four.  */
      if (diff & 2)
        {
          s += val_ptr[0] * src[colnum_ptr[0]];
          s += val_ptr[1] * src[colnum_ptr[1]];
          val_ptr += 2;
          colnum_ptr += 2;
        }

      /* Main 4x unrolled body; the remaining count is a multiple of four,
         so val_ptr lands exactly on val_end_of_row.  */
      while (val_ptr != val_end_of_row)
        {
          s += val_ptr[0] * src[colnum_ptr[0]];
          s += val_ptr[1] * src[colnum_ptr[1]];
          s += val_ptr[2] * src[colnum_ptr[2]];
          s += val_ptr[3] * src[colnum_ptr[3]];
          val_ptr += 4;
          colnum_ptr += 4;
        }

      *dst_ptr++ = s;
    }
}

helps even more. On Cortex-A72 it gives a bit more than 6% (vs 3%) improvement
on parest, and about 5.3% on a more aggressive CPU.
I tried unrolling 8x in a similar manner and that was not faster than 4x on
either target.

Note that perf profiling shows that the loads are what's hot in these loops,
not the FMAs themselves:
  4.41 │1b8:   ldp    w3, w4, [x0]                                             
  5.85 │       ldp    d3, d4, [x2]                                             
       │       add    x2, x2, #0x20                                            
  3.79 │       ldur   d5, [x2, #-16]                                           
  2.82 │       ldr    d0, [x1, x4, lsl #3]                                     
  2.53 │       ldr    d2, [x1, x3, lsl #3]                                     
  2.10 │       ldp    w4, w3, [x0, #8]                                         
       │       add    x0, x0, #0x10                                            
  0.00 │       cmp    x5, x0                                                   
       │       fmul   d0, d0, d4                                               
  4.73 │       ldr    d4, [x1, x4, lsl #3]                                     
       │       fmadd  d0, d3, d2, d0                                           
  2.01 │       ldur   d3, [x2, #-8]                                            
  2.54 │       ldr    d2, [x1, x3, lsl #3]                                     
       │       fmadd  d0, d5, d4, d0                                           
       │       fmadd  d0, d3, d2, d0                                           
       │       fadd   d1, d1, d0

Reply via email to