https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113592

            Bug ID: 113592
           Summary: missed partial sum optimization in vectorizer
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: liuhongt at gcc dot gnu.org
  Target Milestone: ---

/* Return the sum of the squares of the N elements of P, accumulated in
   double precision.  The loop exits only when i == n, so N is presumably
   non-negative -- confirm against callers.  */
double
foo (short* p, int n)
{
    double sum = 0;
    for (int i = 0; i != n; i++)
      /* The uncast p[i] is converted to double as well, so this is a
         double-precision multiply of the element by itself.  */
      sum += p[i] * (double)p[i];
    return sum;
}

With fast-math enabled, the vectorizer generates:

  <bb 5> [local count: 860067200]:
  # vect_sum_16.8_44 = PHI <vect_sum_12.15_61(5), { 0.0, 0.0, 0.0, 0.0 }(4)>
  # ivtmp.35_152 = PHI <ivtmp.35_151(5), ivtmp.35_150(4)>
  # DEBUG BEGIN_STMT
  # DEBUG D#13 => D#14 * 2
  # DEBUG D#12 => p_11(D) + D#13
  _149 = (void *) ivtmp.35_152;
  vect__4.11_47 = MEM <vector(16) short int> [(short int *)_149];
  # DEBUG D#11 => *D#12
  vect__5.13_48 = [vec_unpack_lo_expr] vect__4.11_47;
  vect__5.13_49 = [vec_unpack_hi_expr] vect__4.11_47;
  vect__5.12_50 = [vec_unpack_float_lo_expr] vect__5.13_48;
  vect__5.12_51 = [vec_unpack_float_hi_expr] vect__5.13_48;
  vect__5.12_52 = [vec_unpack_float_lo_expr] vect__5.13_49;
  vect__5.12_53 = [vec_unpack_float_hi_expr] vect__5.13_49;
  # DEBUG D#10 => (double) D#11
  vect_powmult_6.14_55 = vect__5.12_51 * vect__5.12_51;
  _62 = .FMA (vect__5.12_50, vect__5.12_50, vect_powmult_6.14_55);
  vect_powmult_6.14_57 = vect__5.12_53 * vect__5.12_53;
  _45 = .FMA (vect__5.12_52, vect__5.12_52, vect_powmult_6.14_57);
  _46 = _45 + _62;
  # DEBUG D#9 => D#10 * D#10
  vect_sum_12.15_61 = vect_sum_16.8_44 + _46;
  # DEBUG sum => D#8
  # DEBUG BEGIN_STMT
  # DEBUG i => NULL
  # DEBUG sum => D#8
  # DEBUG BEGIN_STMT
  ivtmp.35_151 = ivtmp.35_152 + 32;
  if (_18 != ivtmp.35_151)
    goto <bb 5>; [89.00%]
  else
    goto <bb 8>; [11.00%]

But it could be better with partial sums, deferring the additions until after the loop:
....
  vect_powmult_6.14_55 = .FMA (vect__5.12_51, vect__5.12_51, 0);
  _62 = .FMA (vect__5.12_50, vect__5.12_50, 0);
  vect_powmult_6.14_57 = .FMA (vect__5.12_53, vect__5.12_53, 0);
  _45 = .FMA (vect__5.12_52, vect__5.12_52, 0);
  ivtmp.35_151 = ivtmp.35_152 + 32;
  if (_18 != ivtmp.35_151)
    goto <bb 5>; [89.00%]
  else
    goto <bb 8>; [11.00%]

<bb 8>
   _tmp1 = vect_powmult_6.14_55 + _62;
   _tmp2 = vect_powmult_6.14_57 + _45;
   _tmp3 = _tmp1 + _tmp2;
   _tmp4_scalar = .REDUCE_SUM (_tmp3);

Reply via email to