https://gcc.gnu.org/bugzilla/show_bug.cgi?id=14741

--- Comment #41 from Avinash Jayakar <avinashd at gcc dot gnu.org> ---
For this particular loop, whether written in Fortran or in C, with the -O2
pipeline, I see that the loop invariant motion and PRE passes are preventing
Graphite from performing the tiling. If I disable these two passes before the
Graphite pass, it does say "tiled by 51" and I see about a 2x speed
improvement (on x86).

In the Graphite logs, I can see that it prefers tiling when the loop is
perfectly nested, which is why I disabled those passes: to keep the loop nest
perfect until the Graphite pass.

One other change I made to the Fortran code in the description of this bug is
to make N a compile-time constant, for SCoP detection.

Following is the tiled version of the mult function:


void mult (real(kind=8)[4194304] * restrict a, real(kind=8)[4194304] * restrict
b, real(kind=8)[4194304] * restrict c)
{
  int128_t graphite_IV.161;
  int128_t graphite_IV.160;
  int128_t graphite_IV.159;
  int128_t graphite_IV.158;
  int128_t graphite_IV.157;
  int128_t graphite_IV.156;
  integer(kind=4) k;
  integer(kind=4) j;
  integer(kind=4) i;
  int128_t _46;
  logical(kind=1) _47;
  int128_t _48;
  int128_t _49;
  int128_t _50;
  int128_t _53;
  logical(kind=1) _54;
  int128_t _55;
  int128_t _56;
  int128_t _57;
  int128_t _60;
  logical(kind=1) _61;
  int128_t _62;
  int128_t _63;
  int128_t _64;
  real(kind=8) _67;
  int128_t _68;
  int128_t _69;
  unsigned long _70;
  int128_t _71;
  int128_t _72;
  unsigned long _73;
  unsigned long _74;
  unsigned long _75;
  integer(kind=8) _76;
  real(kind=8) _77;
  int128_t _78;
  int128_t _79;
  unsigned long _80;
  int128_t _81;
  int128_t _82;
  unsigned long _83;
  unsigned long _84;
  unsigned long _85;
  integer(kind=8) _86;
  real(kind=8) _87;
  int128_t _88;
  int128_t _89;
  unsigned long _90;
  unsigned long _91;
  int128_t _92;
  int128_t _93;
  unsigned long _94;
  unsigned long _95;
  integer(kind=8) _96;
  real(kind=8) _97;
  real(kind=8) _98;
  int128_t _100;
  int128_t _101;
  unsigned long _102;
  int128_t _103;
  int128_t _104;
  unsigned long _105;
  unsigned long _106;
  unsigned long _107;
  integer(kind=8) _108;

  <bb 2> [local count: 10631108]:

  <bb 20> [local count: 10631108]:
  # graphite_IV.156_22 = PHI <0(2), graphite_IV.156_19(21)>
  _100 = graphite_IV.156_22 * 51;
  _78 = graphite_IV.156_22 * 51;
  _68 = graphite_IV.156_22 * 51;

  <bb 24> [local count: 10631108]:
  # graphite_IV.157_32 = PHI <0(20), graphite_IV.157_35(25)>
  _103 = graphite_IV.157_32 * 51;
  _88 = graphite_IV.157_32 * 51;
  _71 = graphite_IV.157_32 * 51;

  <bb 28> [local count: 10631108]:
  # graphite_IV.158_33 = PHI <0(24), graphite_IV.158_31(29)>
  _92 = graphite_IV.158_33 * 51;
  _81 = graphite_IV.158_33 * 51;
  _46 = graphite_IV.156_22 * -51;
  _47 = _46 >= -2047;
  if (_47 != 0)
    goto <bb 38>; [100.00%]
  else
    goto <bb 33>; [0.00%]

  <bb 38> [local count: 10631108]:
  _48 = graphite_IV.156_22 * -51;
  _49 = _48 + 2047;
  _50 = MIN_EXPR <_49, 50>;

  <bb 36> [local count: 10631108]:
  # graphite_IV.159_51 = PHI <0(38), graphite_IV.159_52(37)>
  _101 = graphite_IV.159_51 + _100;
  _102 = (unsigned long) _101;
  _79 = graphite_IV.159_51 + _78;
  _80 = (unsigned long) _79;
  _69 = graphite_IV.159_51 + _68;
  _70 = (unsigned long) _69;
  _53 = graphite_IV.157_32 * -51;
  _54 = _53 >= -2047;
  if (_54 != 0)
    goto <bb 47>; [100.00%]
  else
    goto <bb 42>; [0.00%]

  <bb 47> [local count: 10631108]:
  _55 = graphite_IV.157_32 * -51;
  _56 = _55 + 2047;
  _57 = MIN_EXPR <_56, 50>;

  <bb 45> [local count: 10631108]:
  # graphite_IV.160_58 = PHI <0(47), graphite_IV.160_59(46)>
  _104 = graphite_IV.160_58 + _103;
  _105 = (unsigned long) _104;
  _106 = _105 * 2048;
  _107 = _102 + _106;
  _108 = (integer(kind=8)) _107;
  _89 = graphite_IV.160_58 + _88;
  _90 = (unsigned long) _89;
  _91 = _90 * 2048;
  _72 = graphite_IV.160_58 + _71;
  _73 = (unsigned long) _72;
  _74 = _73 * 2048;
  _75 = _70 + _74;
  _76 = (integer(kind=8)) _75;
  _60 = graphite_IV.158_33 * -51;
  _61 = _60 >= -2047;
  if (_61 != 0)
    goto <bb 56>; [100.00%]
  else
    goto <bb 51>; [0.00%]

  <bb 56> [local count: 10631108]:
  _62 = graphite_IV.158_33 * -51;
  _63 = _62 + 2047;
  _64 = MIN_EXPR <_63, 50>;

  <bb 54> [local count: 1073741824]:
  # graphite_IV.161_65 = PHI <0(56), graphite_IV.161_66(55)>
  _93 = graphite_IV.161_65 + _92;
  _94 = (unsigned long) _93;
  _95 = _91 + _94;
  _96 = (integer(kind=8)) _95;
  _82 = graphite_IV.161_65 + _81;
  _83 = (unsigned long) _82;
  _84 = _83 * 2048;
  _85 = _80 + _84;
  _86 = (integer(kind=8)) _85;
  _67 = (*c_24(D))[_76];
  _77 = (*a_25(D))[_86];
  _87 = (*b_26(D))[_96];
  _97 = _77 * _87;
  _98 = _67 + _97;
  (*c_24(D))[_108] = _98;
  graphite_IV.161_66 = graphite_IV.161_65 + 1;
  if (graphite_IV.161_65 < _64)
    goto <bb 55>; [100.00%]
  else
    goto <bb 51>; [0.00%]

  <bb 55> [local count: 1073741824]:
  goto <bb 54>; [100.00%]

  <bb 51> [count: 0]:
  graphite_IV.160_59 = graphite_IV.160_58 + 1;
  if (graphite_IV.160_58 < _57)
    goto <bb 46>; [100.00%]
  else
    goto <bb 42>; [0.00%]

  <bb 46> [count: 0]:
  goto <bb 45>; [100.00%]

  <bb 42> [count: 0]:
  graphite_IV.159_52 = graphite_IV.159_51 + 1;
  if (graphite_IV.159_51 < _50)
    goto <bb 37>; [100.00%]
  else
    goto <bb 33>; [0.00%]

  <bb 37> [count: 0]:
  goto <bb 36>; [100.00%]

  <bb 33> [count: 0]:
  graphite_IV.158_31 = graphite_IV.158_33 + 1;
  if (graphite_IV.158_33 < 40)
    goto <bb 29>; [100.00%]
  else
    goto <bb 27>; [0.00%]

  <bb 29> [count: 0]:
  goto <bb 28>; [100.00%]

  <bb 27> [count: 0]:
  graphite_IV.157_35 = graphite_IV.157_32 + 1;
  if (graphite_IV.157_32 < 40)
    goto <bb 25>; [100.00%]
  else
    goto <bb 23>; [0.00%]

  <bb 25> [count: 0]:
  goto <bb 24>; [100.00%]

  <bb 23> [count: 0]:
  graphite_IV.156_19 = graphite_IV.156_22 + 1;
  if (graphite_IV.156_22 < 40)
    goto <bb 21>; [100.00%]
  else
    goto <bb 16>; [0.00%]

  <bb 21> [count: 0]:
  goto <bb 20>; [100.00%]

  <bb 16> [count: 0]:
  return;

}


However, I am not sure why further unrolling and vectorization do not happen
for the inner loop; perhaps it is due to the access patterns. I need to
investigate further.

Reply via email to