Author: mlai
Date: 2012-03-13 04:20:27 -0400 (Tue, 13 Mar 2012)
New Revision: 3881
Modified:
trunk/osprey/be/cg/cg_flags.cxx
trunk/osprey/be/cg/cg_flags.h
trunk/osprey/be/cg/cg_loop.cxx
trunk/osprey/be/cg/cgdriver.cxx
trunk/osprey/be/cg/x8664/cgtarget.cxx
Log:
Added non-temporal store generation support for single-basic-block while loops.
Extended the -CG:movnti option to accept -1, which instructs the compiler to
emit non-temporal stores unconditionally. (The behavior of existing
functionality is not affected.) Also includes some TOP code cleanup for
non-temporal stores.
Approved by Jian-Xin Lai.
Modified: trunk/osprey/be/cg/cg_flags.cxx
===================================================================
--- trunk/osprey/be/cg/cg_flags.cxx 2012-03-12 21:10:05 UTC (rev 3880)
+++ trunk/osprey/be/cg/cg_flags.cxx 2012-03-13 08:20:27 UTC (rev 3881)
@@ -291,7 +291,7 @@
BOOL CG_use_short_form = FALSE;
UINT64 CG_p2align_freq = 10000;
UINT32 CG_p2align_max_skip_bytes = 3;
-UINT32 CG_movnti = 1000;
+INT32 CG_movnti = 1000;
BOOL CG_use_incdec = TRUE;
BOOL CG_use_xortozero = TRUE; // bug 8592
BOOL CG_use_xortozero_Set = FALSE;
Modified: trunk/osprey/be/cg/cg_flags.h
===================================================================
--- trunk/osprey/be/cg/cg_flags.h 2012-03-12 21:10:05 UTC (rev 3880)
+++ trunk/osprey/be/cg/cg_flags.h 2012-03-13 08:20:27 UTC (rev 3881)
@@ -881,7 +881,7 @@
extern BOOL CG_compute_to;
extern UINT64 CG_p2align_freq;
extern UINT32 CG_p2align_max_skip_bytes;
-extern UINT32 CG_movnti;
+extern INT32 CG_movnti;
extern BOOL CG_use_xortozero;
extern BOOL CG_use_xortozero_Set;
extern BOOL CG_use_incdec;
Modified: trunk/osprey/be/cg/cg_loop.cxx
===================================================================
--- trunk/osprey/be/cg/cg_loop.cxx 2012-03-12 21:10:05 UTC (rev 3880)
+++ trunk/osprey/be/cg/cg_loop.cxx 2012-03-13 08:20:27 UTC (rev 3881)
@@ -7442,6 +7442,9 @@
CG_LOOP_Trace_Loop(loop, "*** Before SINGLE_BB_WHILELOOP_UNROLL ***");
cg_loop.Build_CG_LOOP_Info(TRUE);
+#ifdef TARG_X8664
+ CGTARG_LOOP_Optimize(loop);
+#endif
cg_loop.Determine_Unroll_Factor();
Unroll_Dowhile_Loop(loop, cg_loop.Unroll_factor());
cg_loop.Recompute_Liveness();
Modified: trunk/osprey/be/cg/cgdriver.cxx
===================================================================
--- trunk/osprey/be/cg/cgdriver.cxx 2012-03-12 21:10:05 UTC (rev 3880)
+++ trunk/osprey/be/cg/cgdriver.cxx 2012-03-13 08:20:27 UTC (rev 3881)
@@ -1237,8 +1237,8 @@
{ OVK_BOOL, OV_INTERNAL, TRUE, "idivbyconst_opt", "",
0, 0, 0, &CG_idivbyconst_opt, NULL },
{ OVK_UINT32, OV_INTERNAL, TRUE, "movnti", "",
- 120, 0, UINT32_MAX>>1, &CG_movnti, NULL,
- "Use x86-64's movnti instead of mov when writing memory blocks of this
size or larger (in KB)" },
+ 120, -1, UINT32_MAX>>1, &CG_movnti, NULL,
+ "Use x86-64's movnti instead of mov when writing memory blocks of this
size or larger (in KB). Value of -1 generates movnti unconditionally." },
{ OVK_BOOL, OV_INTERNAL, TRUE, "cloop", "",
0, 0, 0, &CG_LOOP_cloop, NULL },
{ OVK_BOOL, OV_INTERNAL, TRUE,"use_lddqu", "",
Modified: trunk/osprey/be/cg/x8664/cgtarget.cxx
===================================================================
--- trunk/osprey/be/cg/x8664/cgtarget.cxx 2012-03-12 21:10:05 UTC (rev
3880)
+++ trunk/osprey/be/cg/x8664/cgtarget.cxx 2012-03-13 08:20:27 UTC (rev
3881)
@@ -111,6 +111,8 @@
#include "stblock.h" //for Base_Symbol_And_Offset_For_Addressing
#include "be_symtab.h" //Preg_Lda
+#define UNCONDITIONAL_MOVNTI -1
+
UINT32 CGTARG_branch_taken_penalty;
BOOL CGTARG_branch_taken_penalty_overridden = FALSE;
@@ -377,9 +379,6 @@
case TOP_vstsd:
case TOP_vstsd_n32:
case TOP_vstsdx:
- case TOP_vstntsd:
- case TOP_vstntsdx:
- case TOP_vstntsdxx:
case TOP_vstorelpd:
case TOP_vdivxxxsd:
case TOP_vfaddxxxsd:
@@ -3232,12 +3231,20 @@
case TOP_stsd: return TOP_stntsd; break;
case TOP_stsdx: return TOP_stntsdx; break;
case TOP_stsdxx: return TOP_stntsdxx; break;
- case TOP_vstss: return TOP_vstntss; break;
- case TOP_vstssx: return TOP_vstntssx; break;
- case TOP_vstssxx: return TOP_vstntssxx; break;
- case TOP_vstsd: return TOP_vstntsd; break;
- case TOP_vstsdx: return TOP_vstntsdx; break;
- case TOP_vstsdxx: return TOP_vstntsdxx; break;
+
+ /* following cases asserted by CG_movnti == UNCONDITIONAL_MOVNTI */
+ case TOP_stups: return TOP_stntps; break;
+ case TOP_stupsx: return TOP_stntpsx; break;
+ case TOP_stupsxx: return TOP_stntpsxx; break;
+ case TOP_vstups: return TOP_vstntps; break;
+ case TOP_vstupsx: return TOP_vstntpsx; break;
+ case TOP_vstupsxx: return TOP_vstntpsxx; break;
+ case TOP_stupd: return TOP_stntpd; break;
+ case TOP_stupdx: return TOP_stntpdx; break;
+ case TOP_stupdxx: return TOP_stntpdxx; break;
+ case TOP_vstupd: return TOP_vstntpd; break;
+ case TOP_vstupdx: return TOP_vstntpdx; break;
+ case TOP_vstupdxx: return TOP_vstntpdxx; break;
}
FmtAssert(FALSE,("Non-Temporal Store: not supported!"));
return TOP_UNDEFINED;
@@ -3294,43 +3301,52 @@
{
if(CG_movnti==0) return;
- UINT32 trip_count = 0;
- TN* trip_count_tn = CG_LOOP_Trip_Count(loop);
BB* body = LOOP_DESCR_loophead(loop);
+ OP* op = NULL;
- if( trip_count_tn != NULL &&
- TN_is_constant(trip_count_tn) ){
- trip_count = TN_value( trip_count_tn );
+ if (CG_movnti != UNCONDITIONAL_MOVNTI)
+ {
+ UINT32 trip_count = 0;
+ TN* trip_count_tn = CG_LOOP_Trip_Count(loop);
- } else {
- const ANNOTATION* annot = ANNOT_Get(BB_annotations(body), ANNOT_LOOPINFO);
- const LOOPINFO* info = ANNOT_loopinfo(annot);
+ if( trip_count_tn != NULL &&
+ TN_is_constant(trip_count_tn) ){
+ trip_count = TN_value( trip_count_tn );
- trip_count = WN_loop_trip_est(LOOPINFO_wn(info));
- }
+ } else {
+ const ANNOTATION* annot = ANNOT_Get(BB_annotations(body),
ANNOT_LOOPINFO);
+ const LOOPINFO* info = NULL;
+ if (annot != NULL)
+ info = ANNOT_loopinfo(annot);
+ if (info != NULL)
+ trip_count = WN_loop_trip_est(LOOPINFO_wn(info));
+ }
- OP* op = NULL;
- INT64 size = 0;
+ INT64 size = 0;
- Working_Set.Clear();
+ Working_Set.Clear();
- /* First, estimate the totol size (in bytes) that this loop will
- bring to the cache.
- */
- FOR_ALL_BB_OPs_FWD( body, op ){
- if(((OP_store( op ) && !TOP_is_nt_store(OP_code(op))) || //stores
- OP_load(op) ) && //loads
- !Op_In_Working_Set(op)){ //that were not in the working set
- size += CGTARG_Mem_Ref_Bytes(op);
+ /* First, estimate the totol size (in bytes) that this loop will
+ bring to the cache.
+ */
+ FOR_ALL_BB_OPs_FWD( body, op ){
+ if(((OP_store( op ) && !TOP_is_nt_store(OP_code(op))) || //stores
+ OP_load(op) ) && //loads
+ !Op_In_Working_Set(op)){ //that were not in the working set
+ size += CGTARG_Mem_Ref_Bytes(op);
+ }
}
- }
- size *= trip_count;
+ size *= trip_count;
- const INT64 cache_size = CG_movnti * 1024;
+ const INT64 cache_size = CG_movnti * 1024;
- if( size < cache_size )
- return;
+ if( size < cache_size )
+ return;
+ }
+ // if CG_movnti == UNCONDITIONAL_MOVNTI: generate non-temporal stores
+ // unconditionally
+
FOR_ALL_BB_OPs_FWD( body, op ){
if( OP_prefetch( op ) ){
/* Get rid of any prefetchw operation, because it "loads the prefetched
@@ -3395,6 +3411,17 @@
new_top = Movnti_Top(OP_code(op));
break;
}
+ case TOP_stups:
+ case TOP_stupsx:
+ case TOP_stupsxx:
+ case TOP_vstups:
+ case TOP_vstupsx:
+ case TOP_vstupsxx:
+ {
+ if (CG_movnti == UNCONDITIONAL_MOVNTI)
+ new_top = Movnti_Top(OP_code(op));
+ break;
+ }
//SSE2 support
case TOP_stapd:
case TOP_stapdx:
@@ -3419,24 +3446,30 @@
new_top = Movnti_Top(OP_code(op));
break;
}
- //SSE4a support
+ //SSE4a/SSE41 support
case TOP_stss:
case TOP_stssx:
case TOP_stssxx:
case TOP_stsd:
case TOP_stsdx:
case TOP_stsdxx:
- case TOP_vstss:
- case TOP_vstssx:
- case TOP_vstssxx:
- case TOP_vstsd:
- case TOP_vstsdx:
- case TOP_vstsdxx:
{
- if(Is_Target_SSE4a())
+ if(Is_Target_SSE4a() || Is_Target_SSE41())
new_top = Movnti_Top(OP_code(op));
break;
}
+ case TOP_stupd:
+ case TOP_stupdx:
+ case TOP_stupdxx:
+ case TOP_vstupd:
+ case TOP_vstupdx:
+ case TOP_vstupdxx:
+ {
+ if (CG_movnti == UNCONDITIONAL_MOVNTI &&
+ (Is_Target_SSE4a() || Is_Target_SSE41()))
+ new_top = Movnti_Top(OP_code(op));
+ break;
+ }
}//end switch
if( new_top != TOP_UNDEFINED )
------------------------------------------------------------------------------
Keep Your Developer Skills Current with LearnDevNow!
The most comprehensive online learning library for Microsoft developers
is just $99.99! Visual Studio, SharePoint, SQL - plus HTML5, CSS3, MVC3,
Metro Style Apps, more. Free future releases when you subscribe now!
http://p.sf.net/sfu/learndevnow-d2d
_______________________________________________
Open64-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/open64-devel