https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110979

            Bug ID: 110979
           Summary: Missed optimization for O2 fully masked loop on floating
                    point reduction.
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: crazylht at gmail dot com
  Target Milestone: ---

https://godbolt.org/z/YsaesW8zT

/* Reproducer: return the sum of a[0] through a[99], accumulated in index
   order into a single float starting from 0.0f.  The trip count is the
   constant 100; the parameter n is not used by the body.  */
float
foo3 (float* __restrict a, int n)
{
    float sum = 0.0f;
    /* Each element is added to the running sum in turn.  */
    for (int i = 0; i != 100; i++)
      sum += a[i];
    return sum;
}

With -O2 -march=znver4 --param vect-partial-vector-usage=2, we get

  <bb 3> [local count: 66437776]:
  # sum_13 = PHI <sum_10(3), 0.0(2)>
  # loop_mask_16 = PHI <_54(3), { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1 }(2)>
  # ivtmp.13_12 = PHI <ivtmp.13_15(3), ivtmp.13_1(2)>
  # ivtmp.16_2 = PHI <ivtmp.16_3(3), 84(2)>
  # DEBUG i => NULL
  # DEBUG sum => NULL
  # DEBUG BEGIN_STMT
  _4 = (void *) ivtmp.13_12;
  _11 = &MEM <vector(16) float> [(float *)_4];
  vect__4.6_17 = .MASK_LOAD (_11, 32B, loop_mask_16);
  cond_18 = .VCOND_MASK (loop_mask_16, vect__4.6_17, { 0.0, 0.0, 0.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 });
  stmp_sum_10.7_19 = BIT_FIELD_REF <cond_18, 32, 0>;
  stmp_sum_10.7_20 = sum_13 + stmp_sum_10.7_19;
  stmp_sum_10.7_21 = BIT_FIELD_REF <cond_18, 32, 32>;
  stmp_sum_10.7_22 = stmp_sum_10.7_20 + stmp_sum_10.7_21;
  stmp_sum_10.7_23 = BIT_FIELD_REF <cond_18, 32, 64>;
  stmp_sum_10.7_24 = stmp_sum_10.7_22 + stmp_sum_10.7_23;
  stmp_sum_10.7_25 = BIT_FIELD_REF <cond_18, 32, 96>;
  stmp_sum_10.7_26 = stmp_sum_10.7_24 + stmp_sum_10.7_25;
  stmp_sum_10.7_27 = BIT_FIELD_REF <cond_18, 32, 128>;
  stmp_sum_10.7_28 = stmp_sum_10.7_26 + stmp_sum_10.7_27;
  stmp_sum_10.7_29 = BIT_FIELD_REF <cond_18, 32, 160>;
  stmp_sum_10.7_30 = stmp_sum_10.7_28 + stmp_sum_10.7_29;
  stmp_sum_10.7_31 = BIT_FIELD_REF <cond_18, 32, 192>;
  stmp_sum_10.7_32 = stmp_sum_10.7_30 + stmp_sum_10.7_31;
  stmp_sum_10.7_33 = BIT_FIELD_REF <cond_18, 32, 224>;
  stmp_sum_10.7_34 = stmp_sum_10.7_32 + stmp_sum_10.7_33;
  stmp_sum_10.7_35 = BIT_FIELD_REF <cond_18, 32, 256>;
  stmp_sum_10.7_36 = stmp_sum_10.7_34 + stmp_sum_10.7_35;
  stmp_sum_10.7_37 = BIT_FIELD_REF <cond_18, 32, 288>;
  stmp_sum_10.7_38 = stmp_sum_10.7_36 + stmp_sum_10.7_37;
  stmp_sum_10.7_39 = BIT_FIELD_REF <cond_18, 32, 320>;
  stmp_sum_10.7_40 = stmp_sum_10.7_38 + stmp_sum_10.7_39;
  stmp_sum_10.7_41 = BIT_FIELD_REF <cond_18, 32, 352>;
  stmp_sum_10.7_42 = stmp_sum_10.7_40 + stmp_sum_10.7_41;
  stmp_sum_10.7_43 = BIT_FIELD_REF <cond_18, 32, 384>;
  stmp_sum_10.7_44 = stmp_sum_10.7_42 + stmp_sum_10.7_43;
  stmp_sum_10.7_45 = BIT_FIELD_REF <cond_18, 32, 416>;
  stmp_sum_10.7_46 = stmp_sum_10.7_44 + stmp_sum_10.7_45;
  stmp_sum_10.7_47 = BIT_FIELD_REF <cond_18, 32, 448>;
  stmp_sum_10.7_48 = stmp_sum_10.7_46 + stmp_sum_10.7_47;
  stmp_sum_10.7_49 = BIT_FIELD_REF <cond_18, 32, 480>;
  sum_10 = stmp_sum_10.7_48 + stmp_sum_10.7_49;
  # DEBUG sum => sum_10
  # DEBUG BEGIN_STMT
  # DEBUG i => NULL
  # DEBUG sum => sum_10
  # DEBUG BEGIN_STMT
  _53 = {ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2,
ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2,
ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2, ivtmp.16_2};
  _54 = _53 > { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  ivtmp.13_15 = ivtmp.13_12 + 64;
  ivtmp.16_3 = ivtmp.16_2 + 240;
  if (ivtmp.16_3 != 228)


Looks like a cost model issue?

For aarch64, it looks fine since it has FADDA (floating-point add
strictly-ordered reduction, accumulating in scalar).

Reply via email to