Author: dcoakley Date: 2011-05-20 17:21:26 -0400 (Fri, 20 May 2011) New Revision: 3614
Modified: trunk/osprey/be/cg/cg_flags.cxx trunk/osprey/be/cg/cg_flags.h trunk/osprey/be/cg/cgdriver.cxx trunk/osprey/be/cg/cgemit.cxx Log: Improve code alignment of basic blocks for x86 targets. Use the "maximum number of bytes that should be skipped" parameter to the .p2align assembler directive to provide better performance for generated code. If the branch-in frequency to the BB is higher, then more padding bytes are allowed to align the BB, and vice-versa. The exact values to use are determined empirically. Approved by: Sun Chan Modified: trunk/osprey/be/cg/cg_flags.cxx =================================================================== --- trunk/osprey/be/cg/cg_flags.cxx 2011-05-20 02:19:52 UTC (rev 3613) +++ trunk/osprey/be/cg/cg_flags.cxx 2011-05-20 21:21:26 UTC (rev 3614) @@ -522,7 +522,11 @@ INT32 CG_sse_load_execute = 0; INT32 CG_load_execute = 1; BOOL CG_loadbw_execute = FALSE; -BOOL CG_p2align = FALSE; +#if defined(TARG_X8664) +INT32 CG_p2align = 2; +#else +INT32 CG_p2align = 0; +#endif BOOL CG_loop32 = FALSE; BOOL CG_compute_to = FALSE; BOOL CG_valgrind_friendly = TRUE; Modified: trunk/osprey/be/cg/cg_flags.h =================================================================== --- trunk/osprey/be/cg/cg_flags.h 2011-05-20 02:19:52 UTC (rev 3613) +++ trunk/osprey/be/cg/cg_flags.h 2011-05-20 21:21:26 UTC (rev 3614) @@ -868,7 +868,7 @@ extern BOOL CG_use_short_form; extern BOOL CG_loadbw_execute; extern BOOL CG_Movext_ICMP; -extern BOOL CG_p2align; +extern INT32 CG_p2align; extern BOOL CG_loop32; extern BOOL CG_compute_to; extern UINT64 CG_p2align_freq; Modified: trunk/osprey/be/cg/cgdriver.cxx =================================================================== --- trunk/osprey/be/cg/cgdriver.cxx 2011-05-20 02:19:52 UTC (rev 3613) +++ trunk/osprey/be/cg/cgdriver.cxx 2011-05-20 21:21:26 UTC (rev 3614) @@ -1213,8 +1213,8 @@ 0, 0, 0, &CG_use_setcc, NULL }, { OVK_BOOL, OV_INTERNAL, TRUE,"short_form", "", 0, 0, 0, &CG_use_short_form, NULL }, - { OVK_BOOL, OV_INTERNAL, TRUE, "p2align", "p2align", - 0, 0, 0, &CG_p2align, NULL }, + { OVK_INT32, OV_VISIBLE, TRUE, "p2align", "p2align", + 2, 0, 2, &CG_p2align, NULL }, { OVK_BOOL, OV_INTERNAL, TRUE, "loop32", "loop32", 0, 0, 0, &CG_loop32, NULL }, { OVK_BOOL, OV_INTERNAL, TRUE, "compute_to", "compute_to", Modified: trunk/osprey/be/cg/cgemit.cxx =================================================================== --- trunk/osprey/be/cg/cgemit.cxx 2011-05-20 02:19:52 UTC (rev 3613) +++ trunk/osprey/be/cg/cgemit.cxx 2011-05-20 21:21:26 UTC (rev 3614) @@ -4531,15 +4531,6 @@ } if (bb == head) { -#ifdef TARG_X8664 - if (CG_p2align) - fputs ("\t.p2align 6,,7\n", file); - else if (CG_loop32) { - if (BB_innermost(bb) && (Is_Target_Barcelona() || Is_Target_Orochi())) { - fputs ("\t.p2align 5,,\n", file); - } - } -#endif SRCPOS srcpos = BB_Loop_Srcpos(bb); INT32 lineno = SRCPOS_linenum(srcpos); @@ -4705,6 +4696,8 @@ if (!BB_entry(bb)) { float fall_thru_freq = 0.0; float branch_in_freq = 0.0; + int fall_thru_preds = 0; + int branch_in_preds = 0; BBLIST *edge; BB *fall_thru_pred = BB_Fall_Thru_Predecessor(bb); FOR_ALL_BB_PREDS(bb, edge) { @@ -4713,28 +4706,72 @@ FmtAssert(succ_edge != NULL, ("EMT_Assemble_BB: succ bb not found")); if (pred == fall_thru_pred) { fall_thru_freq = BB_freq(pred) * BBLIST_prob(succ_edge); + fall_thru_preds = 1; } else { branch_in_freq += BB_freq(pred) * BBLIST_prob(succ_edge); + branch_in_preds ++; } } - int max_skip_bytes = CG_p2align_max_skip_bytes; float branch_in_ratio = branch_in_freq / fall_thru_freq; - bool add_p2align = FALSE; + if (fall_thru_freq == 0.0 && branch_in_freq > 0.0) + branch_in_ratio = 100; + else if (fall_thru_freq == 0.0 && branch_in_freq == 0.0){ + if (fall_thru_preds == 0 && branch_in_preds > 0) + branch_in_ratio = 0.4; + else + branch_in_ratio = 0.0; + } + if (!(Is_Target_Barcelona() || Is_Target_Orochi() || Is_Target_Wolfdale()) || !CG_p2align) + { // bug 2191 if (branch_in_freq > 100000000.0 && - branch_in_ratio > 50.0) { - max_skip_bytes = 15; - add_p2align = TRUE; + branch_in_ratio > 50.0) + fprintf(Asm_File, "\t.p2align 4,,\n"); } + else if (branch_in_ratio > 0.0) + { + int max_skip_bytes; + + if (CG_p2align == 2){ + if (branch_in_ratio > 50.0) + max_skip_bytes = 31; + else if (branch_in_ratio > 3.5) + max_skip_bytes = 20; + else if (branch_in_ratio > 0.5) + max_skip_bytes = 10; + else if (branch_in_ratio > 0.3) + max_skip_bytes = 3; + else + max_skip_bytes = 0; + } else if (CG_p2align == 1) + { + if(branch_in_ratio > 50.0) + max_skip_bytes = 31; + else if (branch_in_ratio > 0.5) + max_skip_bytes = 3; + else max_skip_bytes = 0; + } +/* loop head are not aligned specificially anymore, as + * 1. If it is the first BB of the loop, the loop head is already honored with + * high branch in rate; + * 2. the loop head can be placed by CFLOW-OPT in the middle of the loop code, + * when there is none biased jump on condition instructions inside. If so, + * it is not desirable to align the loop head. + */ + if(max_skip_bytes > 0) + { + if(!Is_Target_Barcelona() && !Is_Target_Orochi() || CG_p2align != 2){ + if (max_skip_bytes > 15) + max_skip_bytes = 15; + fprintf(Asm_File, "\t.p2align 4,,%d\n", max_skip_bytes); + } + else + fprintf(Asm_File, "\t.p2align 5,,%d\n", max_skip_bytes); + } + } - if (add_p2align || - (CG_p2align_freq > 0 && - branch_in_freq > CG_p2align_freq && - branch_in_ratio > 0.5)) { - fprintf(Asm_File, "\t.p2align 4,,%d\n", max_skip_bytes); - } } #endif @@ -4827,6 +4864,8 @@ // alt-entry if ( Assembly ) { fprintf ( Asm_File, "\t%s\t%s\n", AS_AENT, ST_name(entry_sym)); // KEY + if (CG_p2align != 0) + fputs ("\t.p2align 5,,\n", Asm_File); Print_Label (Asm_File, entry_sym, 0 ); } EMT_Put_Elf_Symbol (entry_sym); @@ -8743,7 +8782,7 @@ if ( Assembly ) { #if defined(TARG_X8664) || defined(TARG_LOONGSON) if (CG_p2align) - fputs ("\t.p2align 4,,15\n", Asm_File); + fputs ("\t.p2align 5,,\n", Asm_File); else if (PU_src_lang (Get_Current_PU()) & PU_CXX_LANG) { // g++ requires a minimum alignment because it uses the least significant // bit of function pointers to store the virtual bit. ------------------------------------------------------------------------------ What Every C/C++ and Fortran developer Should Know! Read this article and learn how Intel has extended the reach of its next-generation tools to help Windows* and Linux* C/C++ and Fortran developers boost performance applications - including clusters. http://p.sf.net/sfu/intel-dev2devmay _______________________________________________ Open64-devel mailing list Open64-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/open64-devel