https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88760
--- Comment #22 from ktkachov at gcc dot gnu.org ---
Some more experiments... Unrolling 4x in a similar way to my previous example
and not splitting the accumulator (separate issue):

unsigned int *colnums;
double *val;

struct foostruct
{
  unsigned int rows;
  unsigned int *colnums;
  unsigned int *rowstart;
};

struct foostruct *cols;

void
foo (double * __restrict__ dst, const double *__restrict__ src)
{
  const unsigned int n_rows = cols->rows;
  const double *val_ptr = &val[cols->rowstart[0]];
  const unsigned int *colnum_ptr = &cols->colnums[cols->rowstart[0]];
  double *dst_ptr = dst;

  for (unsigned int row=0; row<n_rows; ++row)
    {
      double s = 0.;
      const double *const val_end_of_row = &val[cols->rowstart[row+1]];
      __PTRDIFF_TYPE__ diff = val_end_of_row - val_ptr;

      if (diff & 1)
        {
          s += *val_ptr++ * src[*colnum_ptr++];
          diff--;
        }
      if (diff & 2)
        {
          s += val_ptr[0] * src[colnum_ptr[0]];
          s += val_ptr[1] * src[colnum_ptr[1]];
          val_ptr += 2;
          colnum_ptr += 2;
        }
      while (val_ptr != val_end_of_row)
        {
          s += val_ptr[0] * src[colnum_ptr[0]];
          s += val_ptr[1] * src[colnum_ptr[1]];
          s += val_ptr[2] * src[colnum_ptr[2]];
          s += val_ptr[3] * src[colnum_ptr[3]];
          val_ptr += 4;
          colnum_ptr += 4;
        }
      *dst_ptr++ = s;
    }
}

helps even more. On Cortex-A72 it gives a bit more than 6% (vs 3%) improvement
on parest, and about 5.3% on a more aggressive CPU. I tried unrolling 8x in a
similar manner and that was not faster than 4x on either target.

Note that perf profiling shows that the loads are what's hot in these loops,
not the FMAs themselves:

  4.41 │1b8: ldp   w3, w4, [x0]
  5.85 │     ldp   d3, d4, [x2]
       │     add   x2, x2, #0x20
  3.79 │     ldur  d5, [x2, #-16]
  2.82 │     ldr   d0, [x1, x4, lsl #3]
  2.53 │     ldr   d2, [x1, x3, lsl #3]
  2.10 │     ldp   w4, w3, [x0, #8]
       │     add   x0, x0, #0x10
  0.00 │     cmp   x5, x0
       │     fmul  d0, d0, d4
  4.73 │     ldr   d4, [x1, x4, lsl #3]
       │     fmadd d0, d3, d2, d0
  2.01 │     ldur  d3, [x2, #-8]
  2.54 │     ldr   d2, [x1, x3, lsl #3]
       │     fmadd d0, d5, d4, d0
       │     fmadd d0, d3, d2, d0
       │     fadd  d1, d1, d0