http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53791
Bug #: 53791
Summary: Branches not re-ordered using profile-information
Classification: Unclassified
Product: gcc
Version: 4.8.0
Status: UNCONFIRMED
Severity: enhancement
Priority: P3
Component: tree-optimization
AssignedTo: [email protected]
ReportedBy: [email protected]
Consider the following test case:
extern void abort(void) __attribute__((__noreturn__));
static int __attribute__((__noinline__,__noclone__))
candidate_for_reordering (int x)
{
asm volatile ("" : : : "memory");
if (x == 1)
return 2;
else if (x == 2)
return 4;
else if (x == 3)
return 8;
abort ();
}
int accum = 0;
int main (int argc __attribute__((__unused__)),
char **argv __attribute__((__unused__)))
{
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (1);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (2);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
accum += candidate_for_reordering (3);
return accum; /* 30*8 + 4 + 2 = 246 */
}
Compiling with -fprofile-generate, doing 3 test runs, and compiling with
-fprofile-use, results in this .040t.feedback_fnsplit dump for the
candidate_for_reordering function:
candidate_for_reordering (intD.0 xD.1241)
{
intD.0 D.1319;
# BLOCK 2 freq:10000 count:96
# PRED: ENTRY [100.0%] count:96 (fallthru,exec)
# .MEMD.1325_7 = VDEF <.MEMD.1325_6(D)>
__asm__ __volatile__("" : : : "memory");
# SUCC: 3 [100.0%] count:96 (fallthru)
# BLOCK 3 freq:10000 count:96
# PRED: 2 [100.0%] count:96 (fallthru)
if (xD.1241_2(D) == 1)
goto <bb 7>;
else
goto <bb 4>;
# SUCC: 7 [3.1%] count:3 (true,exec) 4 [96.9%] count:93 (false,exec)
# BLOCK 4 freq:9688 count:93
# PRED: 3 [96.9%] count:93 (false,exec)
if (xD.1241_2(D) == 2)
goto <bb 7>;
else
goto <bb 5>;
# SUCC: 7 [3.2%] count:3 (true,exec) 5 [96.8%] count:90 (false,exec)
# BLOCK 5 freq:9375 count:90
# PRED: 4 [96.8%] count:90 (false,exec)
if (xD.1241_2(D) == 3)
goto <bb 7>;
else
goto <bb 6>;
# SUCC: 7 [100.0%] count:90 (true,exec) 6 (false,exec)
# BLOCK 6
# PRED: 5 (false,exec)
# VUSE <.MEMD.1325_7>
# USE = nonlocal
# CLB = nonlocal
abortD.830 ();
# SUCC:
# BLOCK 7 freq:10000 count:96
# PRED: 3 [3.1%] count:3 (true,exec) 4 [3.2%] count:3 (true,exec) 5
[100.0%] count:90 (true,exec)
# D.1319_1 = PHI <2(3), 4(4), 8(5)>
return D.1319_1;
# SUCC: EXIT [100.0%] count:96
}
... and the following assembly (powerpc64):
candidate_for_reordering:
.quad .L.candidate_for_reordering,.TOC.@tocbase
.previous
.type candidate_for_reordering, @function
.L.candidate_for_reordering:
mflr 0
std 0,16(1)
stdu 1,-112(1)
cmpwi 7,3,1
beq- 7,.L4
cmpwi 0,3,2
beq- 0,.L5
cmpwi 1,3,3
bne- 1,.L3
li 3,8
.L1:
addi 1,1,112
ld 4,16(1)
mtlr 4
blr
.L4:
li 3,2
b .L1
.L5:
li 3,4
b .L1
.L3:
bl abort
nop
Note the very obvious (to the human eye, anyway) optimization opportunity to
test for (x == 3) first. GCC is currently (AFAICT) not able to optimize the
order of branches for mutually exclusive conditions using profile info.
This prevents my work to re-implement emit_case_bit_tests as a GIMPLE lowering
pass to have any meaningful impact on the compiler speed with
profiledbootstrap: There are a couple of tree and GIMPLE predicates that would
benefit from this transformation, but it's not happening...