https://gcc.gnu.org/bugzilla/show_bug.cgi?id=14741
--- Comment #41 from Avinash Jayakar <avinashd at gcc dot gnu.org> ---
For this particular loop, in either Fortran or C, with the -O2 pipeline, I see
that the loop invariant motion and PRE passes prevent Graphite from performing
the tiling. If I disable these two passes before the Graphite pass, it does
report "tiled by 51" and I see about a 2x speed improvement (on x86).
In the Graphite logs, I can see that it prefers tiling when the loop nest is
perfectly nested, which is why I disabled those passes: to keep the loop nest
perfectly nested until the Graphite pass.
One other change I made to the Fortran code in the description of this bug is
to make N a compile-time constant, for SCoP detection.
Following is the tiled version of the mult function:
void mult (real(kind=8)[4194304] * restrict a, real(kind=8)[4194304] * restrict
b, real(kind=8)[4194304] * restrict c)
{
int128_t graphite_IV.161;
int128_t graphite_IV.160;
int128_t graphite_IV.159;
int128_t graphite_IV.158;
int128_t graphite_IV.157;
int128_t graphite_IV.156;
integer(kind=4) k;
integer(kind=4) j;
integer(kind=4) i;
int128_t _46;
logical(kind=1) _47;
int128_t _48;
int128_t _49;
int128_t _50;
int128_t _53;
logical(kind=1) _54;
int128_t _55;
int128_t _56;
int128_t _57;
int128_t _60;
logical(kind=1) _61;
int128_t _62;
int128_t _63;
int128_t _64;
real(kind=8) _67;
int128_t _68;
int128_t _69;
unsigned long _70;
int128_t _71;
int128_t _72;
unsigned long _73;
unsigned long _74;
unsigned long _75;
integer(kind=8) _76;
real(kind=8) _77;
int128_t _78;
int128_t _79;
unsigned long _80;
int128_t _81;
int128_t _82;
unsigned long _83;
unsigned long _84;
unsigned long _85;
integer(kind=8) _86;
real(kind=8) _87;
int128_t _88;
int128_t _89;
unsigned long _90;
unsigned long _91;
int128_t _92;
int128_t _93;
unsigned long _94;
unsigned long _95;
integer(kind=8) _96;
real(kind=8) _97;
real(kind=8) _98;
int128_t _100;
int128_t _101;
unsigned long _102;
int128_t _103;
int128_t _104;
unsigned long _105;
unsigned long _106;
unsigned long _107;
integer(kind=8) _108;
<bb 2> [local count: 10631108]:
<bb 20> [local count: 10631108]:
# graphite_IV.156_22 = PHI <0(2), graphite_IV.156_19(21)>
_100 = graphite_IV.156_22 * 51;
_78 = graphite_IV.156_22 * 51;
_68 = graphite_IV.156_22 * 51;
<bb 24> [local count: 10631108]:
# graphite_IV.157_32 = PHI <0(20), graphite_IV.157_35(25)>
_103 = graphite_IV.157_32 * 51;
_88 = graphite_IV.157_32 * 51;
_71 = graphite_IV.157_32 * 51;
<bb 28> [local count: 10631108]:
# graphite_IV.158_33 = PHI <0(24), graphite_IV.158_31(29)>
_92 = graphite_IV.158_33 * 51;
_81 = graphite_IV.158_33 * 51;
_46 = graphite_IV.156_22 * -51;
_47 = _46 >= -2047;
if (_47 != 0)
goto <bb 38>; [100.00%]
else
goto <bb 33>; [0.00%]
<bb 38> [local count: 10631108]:
_48 = graphite_IV.156_22 * -51;
_49 = _48 + 2047;
_50 = MIN_EXPR <_49, 50>;
<bb 36> [local count: 10631108]:
# graphite_IV.159_51 = PHI <0(38), graphite_IV.159_52(37)>
_101 = graphite_IV.159_51 + _100;
_102 = (unsigned long) _101;
_79 = graphite_IV.159_51 + _78;
_80 = (unsigned long) _79;
_69 = graphite_IV.159_51 + _68;
_70 = (unsigned long) _69;
_53 = graphite_IV.157_32 * -51;
_54 = _53 >= -2047;
if (_54 != 0)
goto <bb 47>; [100.00%]
else
goto <bb 42>; [0.00%]
<bb 47> [local count: 10631108]:
_55 = graphite_IV.157_32 * -51;
_56 = _55 + 2047;
_57 = MIN_EXPR <_56, 50>;
<bb 45> [local count: 10631108]:
# graphite_IV.160_58 = PHI <0(47), graphite_IV.160_59(46)>
_104 = graphite_IV.160_58 + _103;
_105 = (unsigned long) _104;
_106 = _105 * 2048;
_107 = _102 + _106;
_108 = (integer(kind=8)) _107;
_89 = graphite_IV.160_58 + _88;
_90 = (unsigned long) _89;
_91 = _90 * 2048;
_72 = graphite_IV.160_58 + _71;
_73 = (unsigned long) _72;
_74 = _73 * 2048;
_75 = _70 + _74;
_76 = (integer(kind=8)) _75;
_60 = graphite_IV.158_33 * -51;
_61 = _60 >= -2047;
if (_61 != 0)
goto <bb 56>; [100.00%]
else
goto <bb 51>; [0.00%]
<bb 56> [local count: 10631108]:
_62 = graphite_IV.158_33 * -51;
_63 = _62 + 2047;
_64 = MIN_EXPR <_63, 50>;
<bb 54> [local count: 1073741824]:
# graphite_IV.161_65 = PHI <0(56), graphite_IV.161_66(55)>
_93 = graphite_IV.161_65 + _92;
_94 = (unsigned long) _93;
_95 = _91 + _94;
_96 = (integer(kind=8)) _95;
_82 = graphite_IV.161_65 + _81;
_83 = (unsigned long) _82;
_84 = _83 * 2048;
_85 = _80 + _84;
_86 = (integer(kind=8)) _85;
_67 = (*c_24(D))[_76];
_77 = (*a_25(D))[_86];
_87 = (*b_26(D))[_96];
_97 = _77 * _87;
_98 = _67 + _97;
(*c_24(D))[_108] = _98;
graphite_IV.161_66 = graphite_IV.161_65 + 1;
if (graphite_IV.161_65 < _64)
goto <bb 55>; [100.00%]
else
goto <bb 51>; [0.00%]
<bb 55> [local count: 1073741824]:
goto <bb 54>; [100.00%]
<bb 51> [count: 0]:
graphite_IV.160_59 = graphite_IV.160_58 + 1;
if (graphite_IV.160_58 < _57)
goto <bb 46>; [100.00%]
else
goto <bb 42>; [0.00%]
<bb 46> [count: 0]:
goto <bb 45>; [100.00%]
<bb 42> [count: 0]:
graphite_IV.159_52 = graphite_IV.159_51 + 1;
if (graphite_IV.159_51 < _50)
goto <bb 37>; [100.00%]
else
goto <bb 33>; [0.00%]
<bb 37> [count: 0]:
goto <bb 36>; [100.00%]
<bb 33> [count: 0]:
graphite_IV.158_31 = graphite_IV.158_33 + 1;
if (graphite_IV.158_33 < 40)
goto <bb 29>; [100.00%]
else
goto <bb 27>; [0.00%]
<bb 29> [count: 0]:
goto <bb 28>; [100.00%]
<bb 27> [count: 0]:
graphite_IV.157_35 = graphite_IV.157_32 + 1;
if (graphite_IV.157_32 < 40)
goto <bb 25>; [100.00%]
else
goto <bb 23>; [0.00%]
<bb 25> [count: 0]:
goto <bb 24>; [100.00%]
<bb 23> [count: 0]:
graphite_IV.156_19 = graphite_IV.156_22 + 1;
if (graphite_IV.156_22 < 40)
goto <bb 21>; [100.00%]
else
goto <bb 16>; [0.00%]
<bb 21> [count: 0]:
goto <bb 20>; [100.00%]
<bb 16> [count: 0]:
return;
}
But I am not sure why further unrolling and vectorization do not happen for the
inner loop; perhaps it is due to the access patterns. I need to investigate
further.