Author: dcoakley
Date: 2011-05-20 17:21:26 -0400 (Fri, 20 May 2011)
New Revision: 3614

Modified:
   trunk/osprey/be/cg/cg_flags.cxx
   trunk/osprey/be/cg/cg_flags.h
   trunk/osprey/be/cg/cgdriver.cxx
   trunk/osprey/be/cg/cgemit.cxx
Log:
Improve code alignment of basic blocks for x86 targets.

Use the "maximum number of bytes that should be skipped" parameter
of the .p2align assembler directive to provide better performance
for generated code.  The higher the branch-in frequency of a BB
relative to its fall-through frequency, the more padding bytes are
allowed to align the BB, and vice versa.
The exact thresholds and byte counts were determined empirically.

Approved by: Sun Chan


Modified: trunk/osprey/be/cg/cg_flags.cxx
===================================================================
--- trunk/osprey/be/cg/cg_flags.cxx     2011-05-20 02:19:52 UTC (rev 3613)
+++ trunk/osprey/be/cg/cg_flags.cxx     2011-05-20 21:21:26 UTC (rev 3614)
@@ -522,7 +522,11 @@
 INT32 CG_sse_load_execute = 0;
 INT32 CG_load_execute = 1;
 BOOL CG_loadbw_execute = FALSE;
-BOOL CG_p2align = FALSE;
+#if defined(TARG_X8664)
+INT32 CG_p2align = 2;
+#else
+INT32 CG_p2align = 0;
+#endif
 BOOL CG_loop32 = FALSE;
 BOOL CG_compute_to = FALSE;
 BOOL CG_valgrind_friendly = TRUE;

Modified: trunk/osprey/be/cg/cg_flags.h
===================================================================
--- trunk/osprey/be/cg/cg_flags.h       2011-05-20 02:19:52 UTC (rev 3613)
+++ trunk/osprey/be/cg/cg_flags.h       2011-05-20 21:21:26 UTC (rev 3614)
@@ -868,7 +868,7 @@
 extern BOOL CG_use_short_form;
 extern BOOL CG_loadbw_execute;
 extern BOOL CG_Movext_ICMP;
-extern BOOL CG_p2align;
+extern INT32 CG_p2align;
 extern BOOL CG_loop32;
 extern BOOL CG_compute_to;
 extern UINT64 CG_p2align_freq;

Modified: trunk/osprey/be/cg/cgdriver.cxx
===================================================================
--- trunk/osprey/be/cg/cgdriver.cxx     2011-05-20 02:19:52 UTC (rev 3613)
+++ trunk/osprey/be/cg/cgdriver.cxx     2011-05-20 21:21:26 UTC (rev 3614)
@@ -1213,8 +1213,8 @@
     0, 0, 0, &CG_use_setcc, NULL },
   { OVK_BOOL,  OV_INTERNAL, TRUE,"short_form", "",
     0, 0, 0, &CG_use_short_form, NULL },
-  { OVK_BOOL,  OV_INTERNAL, TRUE, "p2align", "p2align",
-    0, 0, 0,   &CG_p2align, NULL },
+  { OVK_INT32, OV_VISIBLE, TRUE, "p2align", "p2align",
+    2, 0, 2,   &CG_p2align, NULL },
   { OVK_BOOL,  OV_INTERNAL, TRUE, "loop32", "loop32",
     0, 0, 0,   &CG_loop32, NULL },
   { OVK_BOOL,  OV_INTERNAL, TRUE, "compute_to", "compute_to",

Modified: trunk/osprey/be/cg/cgemit.cxx
===================================================================
--- trunk/osprey/be/cg/cgemit.cxx       2011-05-20 02:19:52 UTC (rev 3613)
+++ trunk/osprey/be/cg/cgemit.cxx       2011-05-20 21:21:26 UTC (rev 3614)
@@ -4531,15 +4531,6 @@
   }
 
   if (bb == head) {
-#ifdef TARG_X8664
-    if (CG_p2align) 
-      fputs ("\t.p2align 6,,7\n", file);
-    else if (CG_loop32) {
-      if (BB_innermost(bb) && (Is_Target_Barcelona() || Is_Target_Orochi())) {
-        fputs ("\t.p2align 5,,\n", file);
-      }
-    }
-#endif
     SRCPOS srcpos = BB_Loop_Srcpos(bb);
     INT32 lineno = SRCPOS_linenum(srcpos);
 
@@ -4705,6 +4696,8 @@
   if (!BB_entry(bb)) {
     float fall_thru_freq = 0.0;
     float branch_in_freq = 0.0;
+    int fall_thru_preds = 0;
+    int branch_in_preds = 0;
     BBLIST *edge;
     BB *fall_thru_pred = BB_Fall_Thru_Predecessor(bb);
     FOR_ALL_BB_PREDS(bb, edge) {
@@ -4713,28 +4706,72 @@
       FmtAssert(succ_edge != NULL, ("EMT_Assemble_BB: succ bb not found"));
       if (pred == fall_thru_pred) {
        fall_thru_freq = BB_freq(pred) * BBLIST_prob(succ_edge);
+       fall_thru_preds = 1;
       } else {
        branch_in_freq += BB_freq(pred) * BBLIST_prob(succ_edge);
+       branch_in_preds ++;
       }
     }
 
-    int max_skip_bytes = CG_p2align_max_skip_bytes;
     float branch_in_ratio = branch_in_freq / fall_thru_freq;
-    bool add_p2align = FALSE;
 
+    if (fall_thru_freq == 0.0 && branch_in_freq > 0.0)
+      branch_in_ratio = 100;
+    else if (fall_thru_freq == 0.0 && branch_in_freq == 0.0){
+      if (fall_thru_preds == 0 && branch_in_preds > 0)
+       branch_in_ratio = 0.4;
+      else 
+       branch_in_ratio = 0.0;
+    }
+    if (!(Is_Target_Barcelona() || Is_Target_Orochi() || Is_Target_Wolfdale()) 
|| !CG_p2align)
+    {
     // bug 2191
     if (branch_in_freq > 100000000.0 &&
-       branch_in_ratio > 50.0) {
-      max_skip_bytes = 15;
-      add_p2align = TRUE;
+       branch_in_ratio > 50.0) 
+          fprintf(Asm_File, "\t.p2align 4,,\n");
     }
+    else if (branch_in_ratio > 0.0)
+    {
+      int max_skip_bytes;
+   
+      if (CG_p2align == 2){ 
+      if (branch_in_ratio > 50.0)
+        max_skip_bytes = 31;
+      else if (branch_in_ratio > 3.5)
+        max_skip_bytes = 20;
+      else if (branch_in_ratio > 0.5)
+       max_skip_bytes = 10;
+      else if (branch_in_ratio > 0.3)
+       max_skip_bytes = 3;
+      else 
+        max_skip_bytes = 0;
+      } else if (CG_p2align == 1)
+      {
+        if(branch_in_ratio > 50.0)
+          max_skip_bytes = 31;
+        else if (branch_in_ratio > 0.5)
+          max_skip_bytes = 3;
+        else max_skip_bytes = 0;
+      }
+/*  Loop heads are no longer aligned specifically, because:
+ *  1. If it is the first BB of the loop, the loop head is already honored
+ *     by its high branch-in rate;
+ *  2. the loop head can be placed by CFLOW-OPT in the middle of the loop
+ *     code when there are no biased conditional-jump instructions inside.
+ *     If so, it is not desirable to align the loop head.
+ */      
+      if(max_skip_bytes > 0)
+      {
+        if(!Is_Target_Barcelona() && !Is_Target_Orochi() || CG_p2align != 2){
+          if (max_skip_bytes > 15)
+           max_skip_bytes = 15;        
+          fprintf(Asm_File, "\t.p2align 4,,%d\n", max_skip_bytes);
+        }
+        else 
+          fprintf(Asm_File, "\t.p2align 5,,%d\n", max_skip_bytes);
+      }
+    }
 
-    if (add_p2align ||
-       (CG_p2align_freq > 0 &&
-         branch_in_freq > CG_p2align_freq &&
-        branch_in_ratio > 0.5)) {
-      fprintf(Asm_File, "\t.p2align 4,,%d\n", max_skip_bytes);
-    }
   }
 #endif
 
@@ -4827,6 +4864,8 @@
                // alt-entry
                if ( Assembly ) {
                        fprintf ( Asm_File, "\t%s\t%s\n", AS_AENT, 
ST_name(entry_sym)); // KEY
+                       if (CG_p2align != 0)
+                               fputs ("\t.p2align 5,,\n", Asm_File);
                        Print_Label (Asm_File, entry_sym, 0 );
                }
                EMT_Put_Elf_Symbol (entry_sym);
@@ -8743,7 +8782,7 @@
   if ( Assembly ) {
 #if defined(TARG_X8664) || defined(TARG_LOONGSON)
     if (CG_p2align) 
-      fputs ("\t.p2align 4,,15\n", Asm_File);
+      fputs ("\t.p2align 5,,\n", Asm_File);
     else if (PU_src_lang (Get_Current_PU()) & PU_CXX_LANG) {
       // g++ requires a minimum alignment because it uses the least significant
       // bit of function pointers to store the virtual bit.


------------------------------------------------------------------------------
What Every C/C++ and Fortran developer Should Know!
Read this article and learn how Intel has extended the reach of its 
next-generation tools to help Windows* and Linux* C/C++ and Fortran 
developers boost performance applications - including clusters. 
http://p.sf.net/sfu/intel-dev2devmay
_______________________________________________
Open64-devel mailing list
Open64-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/open64-devel

Reply via email to